git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import HTMLParser
   6 import httplib
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import time
  12 import urllib
  13 import urllib2
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 from urlparse import parse_qs
  19
  20 try:
  21         import cStringIO as StringIO
  22 except ImportError:
  23         import StringIO
  24
  25 from utils import *
  26
  27
  28 class InfoExtractor(object):
  29         """Information Extractor class.
  30
  31         Information extractors are the classes that, given a URL, extract
  32         information from the video (or videos) the URL refers to. This
  33         information includes the real video URL, the video title and simplified
  34         title, author and others. The information is stored in a dictionary
  35         which is then passed to the FileDownloader. The FileDownloader
  36         processes this information possibly downloading the video to the file
  37         system, among other possible outcomes. The dictionaries must include
  38         the following fields:
  39
  40         id:             Video identifier.
  41         url:            Final video URL.
  42         uploader:       Nickname of the video uploader.
  43         title:          Literal title.
  44         ext:            Video filename extension.
  45         format:         Video format.
  46         player_url:     SWF Player URL (may be None).
  47
  48         The following fields are optional. Their primary purpose is to allow
  49         youtube-dl to serve as the backend for a video search function, such
  50         as the one in youtube2mp3.  They are only used when their respective
  51         forced printing functions are called:
  52
  53         thumbnail:      Full URL to a video thumbnail image.
  54         description:    One-line video description.
  55
  56         Subclasses of this one should re-define the _real_initialize() and
  57         _real_extract() methods and define a _VALID_URL regexp.
  58         Probably, they should also be added to the list of extractors.
  59         """
  60
  61         _ready = False
  62         _downloader = None
  63
  64         def __init__(self, downloader=None):
  65                 """Constructor. Receives an optional downloader."""
  66                 self._ready = False
  67                 self.set_downloader(downloader)
  68
  69         def suitable(self, url):
  70                 """Receives a URL and returns True if suitable for this IE."""
  71                 return re.match(self._VALID_URL, url) is not None
  72
  73         def initialize(self):
  74                 """Initializes an instance (authentication, etc)."""
  75                 if not self._ready:
  76                         self._real_initialize()
  77                         self._ready = True
  78
  79         def extract(self, url):
  80                 """Extracts URL information and returns it in list of dicts."""
  81                 self.initialize()
  82                 return self._real_extract(url)
  83
  84         def set_downloader(self, downloader):
  85                 """Sets the downloader for this IE."""
  86                 self._downloader = downloader
  87
  88         def _real_initialize(self):
  89                 """Real initialization process. Redefine in subclasses."""
  90                 pass
  91
  92         def _real_extract(self, url):
  93                 """Real extraction process. Redefine in subclasses."""
  94                 pass
  95
  96
  97 class YoutubeIE(InfoExtractor):
  98         """Information extractor for youtube.com."""
  99
 100         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|tube\.majestyc\.net/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
 101         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 102         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 103         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 104         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 105         _NETRC_MACHINE = 'youtube'
 106         # Listed in order of quality
 107         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 108         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 109         _video_extensions = {
 110                 '13': '3gp',
 111                 '17': 'mp4',
 112                 '18': 'mp4',
 113                 '22': 'mp4',
 114                 '37': 'mp4',
 115                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 116                 '43': 'webm',
 117                 '44': 'webm',
 118                 '45': 'webm',
 119                 '46': 'webm',
 120         }
 121         _video_dimensions = {
 122                 '5': '240x400',
 123                 '6': '???',
 124                 '13': '???',
 125                 '17': '144x176',
 126                 '18': '360x640',
 127                 '22': '720x1280',
 128                 '34': '360x640',
 129                 '35': '480x854',
 130                 '37': '1080x1920',
 131                 '38': '3072x4096',
 132                 '43': '360x640',
 133                 '44': '480x854',
 134                 '45': '720x1280',
 135                 '46': '1080x1920',
 136         }
 137         IE_NAME = u'youtube'
 138
 139         def report_lang(self):
 140                 """Report attempt to set language."""
 141                 self._downloader.to_screen(u'[youtube] Setting language')
 142
 143         def report_login(self):
 144                 """Report attempt to log in."""
 145                 self._downloader.to_screen(u'[youtube] Logging in')
 146
 147         def report_age_confirmation(self):
 148                 """Report attempt to confirm age."""
 149                 self._downloader.to_screen(u'[youtube] Confirming age')
 150
 151         def report_video_webpage_download(self, video_id):
 152                 """Report attempt to download video webpage."""
 153                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 154
 155         def report_video_info_webpage_download(self, video_id):
 156                 """Report attempt to download video info webpage."""
 157                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 158
 159         def report_video_subtitles_download(self, video_id):
 160                 """Report attempt to download video info webpage."""
 161                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 162
 163         def report_information_extraction(self, video_id):
 164                 """Report attempt to extract video information."""
 165                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 166
 167         def report_unavailable_format(self, video_id, format):
 168                 """Report extracted video URL."""
 169                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 170
 171         def report_rtmp_download(self):
 172                 """Indicate the download will use the RTMP protocol."""
 173                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 174
 175         def _closed_captions_xml_to_srt(self, xml_string):
 176                 srt = ''
 177                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 178                 # TODO parse xml instead of regex
 179                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
 180                         if not dur: dur = '4'
 181                         start = float(start)
 182                         end = start + float(dur)
 183                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 184                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 185                         caption = unescapeHTML(caption)
 186                         caption = unescapeHTML(caption) # double cycle, intentional
 187                         srt += str(n+1) + '\n'
 188                         srt += start + ' --> ' + end + '\n'
 189                         srt += caption + '\n\n'
 190                 return srt
 191
 192         def _print_formats(self, formats):
 193                 print 'Available formats:'
 194                 for x in formats:
 195                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
 196
 197         def _real_initialize(self):
 198                 if self._downloader is None:
 199                         return
 200
 201                 username = None
 202                 password = None
 203                 downloader_params = self._downloader.params
 204
 205                 # Attempt to use provided username and password or .netrc data
 206                 if downloader_params.get('username', None) is not None:
 207                         username = downloader_params['username']
 208                         password = downloader_params['password']
 209                 elif downloader_params.get('usenetrc', False):
 210                         try:
 211                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 212                                 if info is not None:
 213                                         username = info[0]
 214                                         password = info[2]
 215                                 else:
 216                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 217                         except (IOError, netrc.NetrcParseError), err:
 218                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 219                                 return
 220
 221                 # Set language
 222                 request = urllib2.Request(self._LANG_URL)
 223                 try:
 224                         self.report_lang()
 225                         urllib2.urlopen(request).read()
 226                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 227                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 228                         return
 229
 230                 # No authentication to be performed
 231                 if username is None:
 232                         return
 233
 234                 # Log in
 235                 login_form = {
 236                                 'current_form': 'loginForm',
 237                                 'next':         '/',
 238                                 'action_login': 'Log In',
 239                                 'username':     username,
 240                                 'password':     password,
 241                                 }
 242                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 243                 try:
 244                         self.report_login()
 245                         login_results = urllib2.urlopen(request).read()
 246                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 247                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 248                                 return
 249                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 250                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 251                         return
 252
 253                 # Confirm age
 254                 age_form = {
 255                                 'next_url':             '/',
 256                                 'action_confirm':       'Confirm',
 257                                 }
 258                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
 259                 try:
 260                         self.report_age_confirmation()
 261                         age_results = urllib2.urlopen(request).read()
 262                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 263                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 264                         return
 265
 266         def _real_extract(self, url):
 267                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 268                 mobj = re.search(self._NEXT_URL_RE, url)
 269                 if mobj:
 270                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
 271
 272                 # Extract video id from URL
 273                 mobj = re.match(self._VALID_URL, url)
 274                 if mobj is None:
 275                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 276                         return
 277                 video_id = mobj.group(2)
 278
 279                 # Get video webpage
 280                 self.report_video_webpage_download(video_id)
 281                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
 282                 try:
 283                         video_webpage = urllib2.urlopen(request).read()
 284                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 285                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 286                         return
 287
 288                 # Attempt to extract SWF player URL
 289                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 290                 if mobj is not None:
 291                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 292                 else:
 293                         player_url = None
 294
 295                 # Get video info
 296                 self.report_video_info_webpage_download(video_id)
 297                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 298                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 299                                         % (video_id, el_type))
 300                         request = urllib2.Request(video_info_url)
 301                         try:
 302                                 video_info_webpage = urllib2.urlopen(request).read()
 303                                 video_info = parse_qs(video_info_webpage)
 304                                 if 'token' in video_info:
 305                                         break
 306                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 307                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 308                                 return
 309                 if 'token' not in video_info:
 310                         if 'reason' in video_info:
 311                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 312                         else:
 313                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 314                         return
 315
 316                 # Check for "rental" videos
 317                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 318                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
 319                         return
 320
 321                 # Start extracting information
 322                 self.report_information_extraction(video_id)
 323
 324                 # uploader
 325                 if 'author' not in video_info:
 326                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 327                         return
 328                 video_uploader = urllib.unquote_plus(video_info['author'][0])
 329
 330                 # title
 331                 if 'title' not in video_info:
 332                         self._downloader.trouble(u'ERROR: unable to extract video title')
 333                         return
 334                 video_title = urllib.unquote_plus(video_info['title'][0])
 335                 video_title = video_title.decode('utf-8')
 336
 337                 # thumbnail image
 338                 if 'thumbnail_url' not in video_info:
 339                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 340                         video_thumbnail = ''
 341                 else:   # don't panic if we can't find it
 342                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 343
 344                 # upload date
 345                 upload_date = u'NA'
 346                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 347                 if mobj is not None:
 348                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 349                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 350                         for expression in format_expressions:
 351                                 try:
 352                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 353                                 except:
 354                                         pass
 355
 356                 # description
 357                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
 358                 if video_description: video_description = clean_html(video_description)
 359                 else: video_description = ''
 360
 361                 # closed captions
 362                 video_subtitles = None
 363                 if self._downloader.params.get('writesubtitles', False):
 364                         try:
 365                                 self.report_video_subtitles_download(video_id)
 366                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 367                                 try:
 368                                         srt_list = urllib2.urlopen(request).read()
 369                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 370                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 371                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 372                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 373                                 if not srt_lang_list:
 374                                         raise Trouble(u'WARNING: video has no closed captions')
 375                                 if self._downloader.params.get('subtitleslang', False):
 376                                         srt_lang = self._downloader.params.get('subtitleslang')
 377                                 elif 'en' in srt_lang_list:
 378                                         srt_lang = 'en'
 379                                 else:
 380                                         srt_lang = srt_lang_list.keys()[0]
 381                                 if not srt_lang in srt_lang_list:
 382                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
 383                                 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 384                                 try:
 385                                         srt_xml = urllib2.urlopen(request).read()
 386                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 387                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 388                                 if not srt_xml:
 389                                         raise Trouble(u'WARNING: unable to download video subtitles')
 390                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
 391                         except Trouble as trouble:
 392                                 self._downloader.trouble(trouble[0])
 393
 394                 # token
 395                 video_token = urllib.unquote_plus(video_info['token'][0])
 396
 397                 # Decide which formats to download
 398                 req_format = self._downloader.params.get('format', None)
 399
 400                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 401                         self.report_rtmp_download()
 402                         video_url_list = [(None, video_info['conn'][0])]
 403                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 404                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 405                         url_data = [parse_qs(uds) for uds in url_data_strs]
 406                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
 407                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 408
 409                         format_limit = self._downloader.params.get('format_limit', None)
 410                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 411                         if format_limit is not None and format_limit in available_formats:
 412                                 format_list = available_formats[available_formats.index(format_limit):]
 413                         else:
 414                                 format_list = available_formats
 415                         existing_formats = [x for x in format_list if x in url_map]
 416                         if len(existing_formats) == 0:
 417                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 418                                 return
 419                         if self._downloader.params.get('listformats', None):
 420                                 self._print_formats(existing_formats)
 421                                 return
 422                         if req_format is None or req_format == 'best':
 423                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 424                         elif req_format == 'worst':
 425                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 426                         elif req_format in ('-1', 'all'):
 427                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 428                         else:
 429                                 # Specific formats. We pick the first in a slash-delimeted sequence.
 430                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 431                                 req_formats = req_format.split('/')
 432                                 video_url_list = None
 433                                 for rf in req_formats:
 434                                         if rf in url_map:
 435                                                 video_url_list = [(rf, url_map[rf])]
 436                                                 break
 437                                 if video_url_list is None:
 438                                         self._downloader.trouble(u'ERROR: requested format not available')
 439                                         return
 440                 else:
 441                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 442                         return
 443
 444                 results = []
 445                 for format_param, video_real_url in video_url_list:
 446                         # Extension
 447                         video_extension = self._video_extensions.get(format_param, 'flv')
 448
 449                         results.append({
 450                                 'id':           video_id.decode('utf-8'),
 451                                 'url':          video_real_url.decode('utf-8'),
 452                                 'uploader':     video_uploader.decode('utf-8'),
 453                                 'upload_date':  upload_date,
 454                                 'title':        video_title,
 455                                 'ext':          video_extension.decode('utf-8'),
 456                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 457                                 'thumbnail':    video_thumbnail.decode('utf-8'),
 458                                 'description':  video_description,
 459                                 'player_url':   player_url,
 460                                 'subtitles':    video_subtitles
 461                         })
 462                 return results
 463
 464
 465 class MetacafeIE(InfoExtractor):
 466         """Information Extractor for metacafe.com."""
 467
 468         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 469         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 470         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 471         IE_NAME = u'metacafe'
 472
 473         def __init__(self, downloader=None):
 474                 InfoExtractor.__init__(self, downloader)
 475
 476         def report_disclaimer(self):
 477                 """Report disclaimer retrieval."""
 478                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 479
 480         def report_age_confirmation(self):
 481                 """Report attempt to confirm age."""
 482                 self._downloader.to_screen(u'[metacafe] Confirming age')
 483
 484         def report_download_webpage(self, video_id):
 485                 """Report webpage download."""
 486                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 487
 488         def report_extraction(self, video_id):
 489                 """Report information extraction."""
 490                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 491
 492         def _real_initialize(self):
 493                 # Retrieve disclaimer
 494                 request = urllib2.Request(self._DISCLAIMER)
 495                 try:
 496                         self.report_disclaimer()
 497                         disclaimer = urllib2.urlopen(request).read()
 498                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 499                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 500                         return
 501
 502                 # Confirm age
 503                 disclaimer_form = {
 504                         'filters': '0',
 505                         'submit': "Continue - I'm over 18",
 506                         }
 507                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
 508                 try:
 509                         self.report_age_confirmation()
 510                         disclaimer = urllib2.urlopen(request).read()
 511                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 512                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 513                         return
 514
 515         def _real_extract(self, url):
 516                 # Extract id and simplified title from URL
 517                 mobj = re.match(self._VALID_URL, url)
 518                 if mobj is None:
 519                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 520                         return
 521
 522                 video_id = mobj.group(1)
 523
 524                 # Check if video comes from YouTube
 525                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 526                 if mobj2 is not None:
 527                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 528                         return
 529
 530                 # Retrieve video webpage to extract further information
 531                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 532                 try:
 533                         self.report_download_webpage(video_id)
 534                         webpage = urllib2.urlopen(request).read()
 535                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 536                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 537                         return
 538
 539                 # Extract URL, uploader and title from webpage
 540                 self.report_extraction(video_id)
 541                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 542                 if mobj is not None:
 543                         mediaURL = urllib.unquote(mobj.group(1))
 544                         video_extension = mediaURL[-3:]
 545
 546                         # Extract gdaKey if available
 547                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 548                         if mobj is None:
 549                                 video_url = mediaURL
 550                         else:
 551                                 gdaKey = mobj.group(1)
 552                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 553                 else:
 554                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 555                         if mobj is None:
 556                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 557                                 return
 558                         vardict = parse_qs(mobj.group(1))
 559                         if 'mediaData' not in vardict:
 560                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 561                                 return
 562                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 563                         if mobj is None:
 564                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 565                                 return
 566                         mediaURL = mobj.group(1).replace('\\/', '/')
 567                         video_extension = mediaURL[-3:]
 568                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 569
 570                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 571                 if mobj is None:
 572                         self._downloader.trouble(u'ERROR: unable to extract title')
 573                         return
 574                 video_title = mobj.group(1).decode('utf-8')
 575
 576                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
 577                 if mobj is None:
 578                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 579                         return
 580                 video_uploader = mobj.group(1)
 581
 582                 return [{
 583                         'id':           video_id.decode('utf-8'),
 584                         'url':          video_url.decode('utf-8'),
 585                         'uploader':     video_uploader.decode('utf-8'),
 586                         'upload_date':  u'NA',
 587                         'title':        video_title,
 588                         'ext':          video_extension.decode('utf-8'),
 589                         'format':       u'NA',
 590                         'player_url':   None,
 591                 }]
 592
 593
 594 class DailymotionIE(InfoExtractor):
 595         """Information Extractor for Dailymotion"""
 596
 597         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
 598         IE_NAME = u'dailymotion'
 599
 600         def __init__(self, downloader=None):
 601                 InfoExtractor.__init__(self, downloader)
 602
 603         def report_download_webpage(self, video_id):
 604                 """Report webpage download."""
 605                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 606
 607         def report_extraction(self, video_id):
 608                 """Report information extraction."""
 609                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 610
 611         def _real_extract(self, url):
 612                 # Extract id and simplified title from URL
 613                 mobj = re.match(self._VALID_URL, url)
 614                 if mobj is None:
 615                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 616                         return
 617
 618                 video_id = mobj.group(1)
 619
 620                 video_extension = 'mp4'
 621
 622                 # Retrieve video webpage to extract further information
 623                 request = urllib2.Request(url)
 624                 request.add_header('Cookie', 'family_filter=off')
 625                 try:
 626                         self.report_download_webpage(video_id)
 627                         webpage = urllib2.urlopen(request).read()
 628                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 629                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 630                         return
 631
 632                 # Extract URL, uploader and title from webpage
 633                 self.report_extraction(video_id)
 634                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 635                 if mobj is None:
 636                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 637                         return
 638                 flashvars = urllib.unquote(mobj.group(1))
 639                 mobj = re.search(r'"hqURL":"(.+?)"', flashvars)
 640                 if mobj is None:
 641                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 642                         return
 643                 hqURL = mobj.group(1).replace('\\/', '/')
 644
 645                 # TODO: support ldurl and sdurl qualities
 646
 647                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 648                 if mobj is None:
 649                         self._downloader.trouble(u'ERROR: unable to extract title')
 650                         return
 651                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
 652
 653                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
 654                 if mobj is None:
 655                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 656                         return
 657                 video_uploader = mobj.group(1)
 658
 659                 return [{
 660                         'id':           video_id.decode('utf-8'),
 661                         'url':          hqURL.decode('utf-8'),
 662                         'uploader':     video_uploader.decode('utf-8'),
 663                         'upload_date':  u'NA',
 664                         'title':        video_title,
 665                         'ext':          video_extension.decode('utf-8'),
 666                         'format':       u'NA',
 667                         'player_url':   None,
 668                 }]
 669
 670
 671 class GoogleIE(InfoExtractor):
 672         """Information extractor for video.google.com."""
 673
 674         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 675         IE_NAME = u'video.google'
 676
 677         def __init__(self, downloader=None):
 678                 InfoExtractor.__init__(self, downloader)
 679
 680         def report_download_webpage(self, video_id):
 681                 """Report webpage download."""
 682                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 683
 684         def report_extraction(self, video_id):
 685                 """Report information extraction."""
 686                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 687
 688         def _real_extract(self, url):
 689                 # Extract id from URL
 690                 mobj = re.match(self._VALID_URL, url)
 691                 if mobj is None:
 692                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 693                         return
 694
 695                 video_id = mobj.group(1)
 696
 697                 video_extension = 'mp4'
 698
 699                 # Retrieve video webpage to extract further information
 700                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 701                 try:
 702                         self.report_download_webpage(video_id)
 703                         webpage = urllib2.urlopen(request).read()
 704                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 705                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 706                         return
 707
 708                 # Extract URL, uploader, and title from webpage
 709                 self.report_extraction(video_id)
 710                 mobj = re.search(r"download_url:'([^']+)'", webpage)
 711                 if mobj is None:
 712                         video_extension = 'flv'
 713                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 714                 if mobj is None:
 715                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 716                         return
 717                 mediaURL = urllib.unquote(mobj.group(1))
 718                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
 719                 mediaURL = mediaURL.replace('\\x26', '\x26')
 720
 721                 video_url = mediaURL
 722
 723                 mobj = re.search(r'<title>(.*)</title>', webpage)
 724                 if mobj is None:
 725                         self._downloader.trouble(u'ERROR: unable to extract title')
 726                         return
 727                 video_title = mobj.group(1).decode('utf-8')
 728
 729                 # Extract video description
 730                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 731                 if mobj is None:
 732                         self._downloader.trouble(u'ERROR: unable to extract video description')
 733                         return
 734                 video_description = mobj.group(1).decode('utf-8')
 735                 if not video_description:
 736                         video_description = 'No description available.'
 737
 738                 # Extract video thumbnail
 739                 if self._downloader.params.get('forcethumbnail', False):
 740                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 741                         try:
 742                                 webpage = urllib2.urlopen(request).read()
 743                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 744                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 745                                 return
 746                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 747                         if mobj is None:
 748                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 749                                 return
 750                         video_thumbnail = mobj.group(1)
 751                 else:   # we need something to pass to process_info
 752                         video_thumbnail = ''
 753
 754                 return [{
 755                         'id':           video_id.decode('utf-8'),
 756                         'url':          video_url.decode('utf-8'),
 757                         'uploader':     u'NA',
 758                         'upload_date':  u'NA',
 759                         'title':        video_title,
 760                         'ext':          video_extension.decode('utf-8'),
 761                         'format':       u'NA',
 762                         'player_url':   None,
 763                 }]
 764
 765
 766 class PhotobucketIE(InfoExtractor):
 767         """Information extractor for photobucket.com."""
 768
 769         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 770         IE_NAME = u'photobucket'
 771
 772         def __init__(self, downloader=None):
 773                 InfoExtractor.__init__(self, downloader)
 774
 775         def report_download_webpage(self, video_id):
 776                 """Report webpage download."""
 777                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 778
 779         def report_extraction(self, video_id):
 780                 """Report information extraction."""
 781                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 782
 783         def _real_extract(self, url):
 784                 # Extract id from URL
 785                 mobj = re.match(self._VALID_URL, url)
 786                 if mobj is None:
 787                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 788                         return
 789
 790                 video_id = mobj.group(1)
 791
 792                 video_extension = 'flv'
 793
 794                 # Retrieve video webpage to extract further information
 795                 request = urllib2.Request(url)
 796                 try:
 797                         self.report_download_webpage(video_id)
 798                         webpage = urllib2.urlopen(request).read()
 799                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 800                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 801                         return
 802
 803                 # Extract URL, uploader, and title from webpage
 804                 self.report_extraction(video_id)
 805                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 806                 if mobj is None:
 807                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 808                         return
 809                 mediaURL = urllib.unquote(mobj.group(1))
 810
 811                 video_url = mediaURL
 812
 813                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 814                 if mobj is None:
 815                         self._downloader.trouble(u'ERROR: unable to extract title')
 816                         return
 817                 video_title = mobj.group(1).decode('utf-8')
 818
 819                 video_uploader = mobj.group(2).decode('utf-8')
 820
 821                 return [{
 822                         'id':           video_id.decode('utf-8'),
 823                         'url':          video_url.decode('utf-8'),
 824                         'uploader':     video_uploader,
 825                         'upload_date':  u'NA',
 826                         'title':        video_title,
 827                         'ext':          video_extension.decode('utf-8'),
 828                         'format':       u'NA',
 829                         'player_url':   None,
 830                 }]
 831
 832
 833 class YahooIE(InfoExtractor):
 834         """Information extractor for video.yahoo.com."""
 835
 836         # _VALID_URL matches all Yahoo! Video URLs
 837         # _VPAGE_URL matches only the extractable '/watch/' URLs
 838         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 839         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 840         IE_NAME = u'video.yahoo'
 841
 842         def __init__(self, downloader=None):
 843                 InfoExtractor.__init__(self, downloader)
 844
 845         def report_download_webpage(self, video_id):
 846                 """Report webpage download."""
 847                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 848
 849         def report_extraction(self, video_id):
 850                 """Report information extraction."""
 851                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 852
 853         def _real_extract(self, url, new_video=True):
 854                 # Extract ID from URL
 855                 mobj = re.match(self._VALID_URL, url)
 856                 if mobj is None:
 857                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 858                         return
 859
 860                 video_id = mobj.group(2)
 861                 video_extension = 'flv'
 862
 863                 # Rewrite valid but non-extractable URLs as
 864                 # extractable English language /watch/ URLs
 865                 if re.match(self._VPAGE_URL, url) is None:
 866                         request = urllib2.Request(url)
 867                         try:
 868                                 webpage = urllib2.urlopen(request).read()
 869                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 870                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 871                                 return
 872
 873                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 874                         if mobj is None:
 875                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 876                                 return
 877                         yahoo_id = mobj.group(1)
 878
 879                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 880                         if mobj is None:
 881                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 882                                 return
 883                         yahoo_vid = mobj.group(1)
 884
 885                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 886                         return self._real_extract(url, new_video=False)
 887
 888                 # Retrieve video webpage to extract further information
 889                 request = urllib2.Request(url)
 890                 try:
 891                         self.report_download_webpage(video_id)
 892                         webpage = urllib2.urlopen(request).read()
 893                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 894                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 895                         return
 896
 897                 # Extract uploader and title from webpage
 898                 self.report_extraction(video_id)
 899                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 900                 if mobj is None:
 901                         self._downloader.trouble(u'ERROR: unable to extract video title')
 902                         return
 903                 video_title = mobj.group(1).decode('utf-8')
 904
 905                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 906                 if mobj is None:
 907                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
 908                         return
 909                 video_uploader = mobj.group(1).decode('utf-8')
 910
 911                 # Extract video thumbnail
 912                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 913                 if mobj is None:
 914                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 915                         return
 916                 video_thumbnail = mobj.group(1).decode('utf-8')
 917
 918                 # Extract video description
 919                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 920                 if mobj is None:
 921                         self._downloader.trouble(u'ERROR: unable to extract video description')
 922                         return
 923                 video_description = mobj.group(1).decode('utf-8')
 924                 if not video_description:
 925                         video_description = 'No description available.'
 926
 927                 # Extract video height and width
 928                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 929                 if mobj is None:
 930                         self._downloader.trouble(u'ERROR: unable to extract video height')
 931                         return
 932                 yv_video_height = mobj.group(1)
 933
 934                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 935                 if mobj is None:
 936                         self._downloader.trouble(u'ERROR: unable to extract video width')
 937                         return
 938                 yv_video_width = mobj.group(1)
 939
 940                 # Retrieve video playlist to extract media URL
 941                 # I'm not completely sure what all these options are, but we
 942                 # seem to need most of them, otherwise the server sends a 401.
 943                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 944                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 945                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 946                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 947                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 948                 try:
 949                         self.report_download_webpage(video_id)
 950                         webpage = urllib2.urlopen(request).read()
 951                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 952                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 953                         return
 954
 955                 # Extract media URL from playlist XML
 956                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 957                 if mobj is None:
 958                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
 959                         return
 960                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 961                 video_url = unescapeHTML(video_url)
 962
 963                 return [{
 964                         'id':           video_id.decode('utf-8'),
 965                         'url':          video_url,
 966                         'uploader':     video_uploader,
 967                         'upload_date':  u'NA',
 968                         'title':        video_title,
 969                         'ext':          video_extension.decode('utf-8'),
 970                         'thumbnail':    video_thumbnail.decode('utf-8'),
 971                         'description':  video_description,
 972                         'thumbnail':    video_thumbnail,
 973                         'player_url':   None,
 974                 }]
 975
 976
 977 class VimeoIE(InfoExtractor):
 978         """Information extractor for vimeo.com."""
 979
 980         # _VALID_URL matches Vimeo URLs
 981         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
 982         IE_NAME = u'vimeo'
 983
 984         def __init__(self, downloader=None):
 985                 InfoExtractor.__init__(self, downloader)
 986
 987         def report_download_webpage(self, video_id):
 988                 """Report webpage download."""
 989                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
 990
 991         def report_extraction(self, video_id):
 992                 """Report information extraction."""
 993                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
 994
 995         def _real_extract(self, url, new_video=True):
 996                 # Extract ID from URL
 997                 mobj = re.match(self._VALID_URL, url)
 998                 if mobj is None:
 999                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1000                         return
1001
1002                 video_id = mobj.group(1)
1003
1004                 # Retrieve video webpage to extract further information
1005                 request = urllib2.Request(url, None, std_headers)
1006                 try:
1007                         self.report_download_webpage(video_id)
1008                         webpage = urllib2.urlopen(request).read()
1009                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1010                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1011                         return
1012
1013                 # Now we begin extracting as much information as we can from what we
1014                 # retrieved. First we extract the information common to all extractors,
1015                 # and latter we extract those that are Vimeo specific.
1016                 self.report_extraction(video_id)
1017
1018                 # Extract the config JSON
1019                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1020                 try:
1021                         config = json.loads(config)
1022                 except:
1023                         self._downloader.trouble(u'ERROR: unable to extract info section')
1024                         return
1025
1026                 # Extract title
1027                 video_title = config["video"]["title"]
1028
1029                 # Extract uploader
1030                 video_uploader = config["video"]["owner"]["name"]
1031
1032                 # Extract video thumbnail
1033                 video_thumbnail = config["video"]["thumbnail"]
1034
1035                 # Extract video description
1036                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1037                 if video_description: video_description = clean_html(video_description)
1038                 else: video_description = ''
1039
1040                 # Extract upload date
1041                 video_upload_date = u'NA'
1042                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1043                 if mobj is not None:
1044                         video_upload_date = mobj.group(1)
1045
1046                 # Vimeo specific: extract request signature and timestamp
1047                 sig = config['request']['signature']
1048                 timestamp = config['request']['timestamp']
1049
1050                 # Vimeo specific: extract video codec and quality information
1051                 # TODO bind to format param
1052                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1053                 for codec in codecs:
1054                         if codec[0] in config["video"]["files"]:
1055                                 video_codec = codec[0]
1056                                 video_extension = codec[1]
1057                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1058                                 else: quality = 'sd'
1059                                 break
1060                 else:
1061                         self._downloader.trouble(u'ERROR: no known codec found')
1062                         return
1063
1064                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1065                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1066
1067                 return [{
1068                         'id':           video_id,
1069                         'url':          video_url,
1070                         'uploader':     video_uploader,
1071                         'upload_date':  video_upload_date,
1072                         'title':        video_title,
1073                         'ext':          video_extension,
1074                         'thumbnail':    video_thumbnail,
1075                         'description':  video_description,
1076                         'player_url':   None,
1077                 }]
1078
1079
1080 class GenericIE(InfoExtractor):
1081         """Generic last-resort information extractor."""
1082
1083         _VALID_URL = r'.*'
1084         IE_NAME = u'generic'
1085
1086         def __init__(self, downloader=None):
1087                 InfoExtractor.__init__(self, downloader)
1088
1089         def report_download_webpage(self, video_id):
1090                 """Report webpage download."""
1091                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1092                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1093
1094         def report_extraction(self, video_id):
1095                 """Report information extraction."""
1096                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1097
1098         def report_following_redirect(self, new_url):
1099                 """Report information extraction."""
1100                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1101
1102         def _test_redirect(self, url):
1103                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1104                 class HeadRequest(urllib2.Request):
1105                         def get_method(self):
1106                                 return "HEAD"
1107
1108                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1109                         """
1110                         Subclass the HTTPRedirectHandler to make it use our
1111                         HeadRequest also on the redirected URL
1112                         """
1113                         def redirect_request(self, req, fp, code, msg, headers, newurl):
1114                                 if code in (301, 302, 303, 307):
1115                                         newurl = newurl.replace(' ', '%20')
1116                                         newheaders = dict((k,v) for k,v in req.headers.items()
1117                                                                           if k.lower() not in ("content-length", "content-type"))
1118                                         return HeadRequest(newurl,
1119                                                                            headers=newheaders,
1120                                                                            origin_req_host=req.get_origin_req_host(),
1121                                                                            unverifiable=True)
1122                                 else:
1123                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1124
1125                 class HTTPMethodFallback(urllib2.BaseHandler):
1126                         """
1127                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1128                         """
1129                         def http_error_405(self, req, fp, code, msg, headers):
1130                                 fp.read()
1131                                 fp.close()
1132
1133                                 newheaders = dict((k,v) for k,v in req.headers.items()
1134                                                                   if k.lower() not in ("content-length", "content-type"))
1135                                 return self.parent.open(urllib2.Request(req.get_full_url(),
1136                                                                                                  headers=newheaders,
1137                                                                                                  origin_req_host=req.get_origin_req_host(),
1138                                                                                                  unverifiable=True))
1139
1140                 # Build our opener
1141                 opener = urllib2.OpenerDirector()
1142                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1143                                                 HTTPMethodFallback, HEADRedirectHandler,
1144                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1145                         opener.add_handler(handler())
1146
1147                 response = opener.open(HeadRequest(url))
1148                 new_url = response.geturl()
1149
1150                 if url == new_url: return False
1151
1152                 self.report_following_redirect(new_url)
1153                 self._downloader.download([new_url])
1154                 return True
1155
1156         def _real_extract(self, url):
1157                 if self._test_redirect(url): return
1158
1159                 video_id = url.split('/')[-1]
1160                 request = urllib2.Request(url)
1161                 try:
1162                         self.report_download_webpage(video_id)
1163                         webpage = urllib2.urlopen(request).read()
1164                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1165                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1166                         return
1167                 except ValueError, err:
1168                         # since this is the last-resort InfoExtractor, if
1169                         # this error is thrown, it'll be thrown here
1170                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1171                         return
1172
1173                 self.report_extraction(video_id)
1174                 # Start with something easy: JW Player in SWFObject
1175                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1176                 if mobj is None:
1177                         # Broaden the search a little bit
1178                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1179                 if mobj is None:
1180                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1181                         return
1182
1183                 # It's possible that one of the regexes
1184                 # matched, but returned an empty group:
1185                 if mobj.group(1) is None:
1186                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1187                         return
1188
1189                 video_url = urllib.unquote(mobj.group(1))
1190                 video_id = os.path.basename(video_url)
1191
1192                 # here's a fun little line of code for you:
1193                 video_extension = os.path.splitext(video_id)[1][1:]
1194                 video_id = os.path.splitext(video_id)[0]
1195
1196                 # it's tempting to parse this further, but you would
1197                 # have to take into account all the variations like
1198                 #   Video Title - Site Name
1199                 #   Site Name | Video Title
1200                 #   Video Title - Tagline | Site Name
1201                 # and so on and so forth; it's just not practical
1202                 mobj = re.search(r'<title>(.*)</title>', webpage)
1203                 if mobj is None:
1204                         self._downloader.trouble(u'ERROR: unable to extract title')
1205                         return
1206                 video_title = mobj.group(1).decode('utf-8')
1207
1208                 # video uploader is domain name
1209                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1210                 if mobj is None:
1211                         self._downloader.trouble(u'ERROR: unable to extract title')
1212                         return
1213                 video_uploader = mobj.group(1).decode('utf-8')
1214
1215                 return [{
1216                         'id':           video_id.decode('utf-8'),
1217                         'url':          video_url.decode('utf-8'),
1218                         'uploader':     video_uploader,
1219                         'upload_date':  u'NA',
1220                         'title':        video_title,
1221                         'ext':          video_extension.decode('utf-8'),
1222                         'format':       u'NA',
1223                         'player_url':   None,
1224                 }]
1225
1226
1227 class YoutubeSearchIE(InfoExtractor):
1228         """Information Extractor for YouTube search queries."""
1229         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1230         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1231         _max_youtube_results = 1000
1232         IE_NAME = u'youtube:search'
1233
1234         def __init__(self, downloader=None):
1235                 InfoExtractor.__init__(self, downloader)
1236
1237         def report_download_page(self, query, pagenum):
1238                 """Report attempt to download search page with given number."""
1239                 query = query.decode(preferredencoding())
1240                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1241
1242         def _real_extract(self, query):
1243                 mobj = re.match(self._VALID_URL, query)
1244                 if mobj is None:
1245                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1246                         return
1247
1248                 prefix, query = query.split(':')
1249                 prefix = prefix[8:]
1250                 query = query.encode('utf-8')
1251                 if prefix == '':
1252                         self._download_n_results(query, 1)
1253                         return
1254                 elif prefix == 'all':
1255                         self._download_n_results(query, self._max_youtube_results)
1256                         return
1257                 else:
1258                         try:
1259                                 n = long(prefix)
1260                                 if n <= 0:
1261                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1262                                         return
1263                                 elif n > self._max_youtube_results:
1264                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1265                                         n = self._max_youtube_results
1266                                 self._download_n_results(query, n)
1267                                 return
1268                         except ValueError: # parsing prefix as integer fails
1269                                 self._download_n_results(query, 1)
1270                                 return
1271
1272         def _download_n_results(self, query, n):
1273                 """Downloads a specified number of results for a query"""
1274
1275                 video_ids = []
1276                 pagenum = 0
1277                 limit = n
1278
1279                 while (50 * pagenum) < limit:
1280                         self.report_download_page(query, pagenum+1)
1281                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1282                         request = urllib2.Request(result_url)
1283                         try:
1284                                 data = urllib2.urlopen(request).read()
1285                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1286                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1287                                 return
1288                         api_response = json.loads(data)['data']
1289
1290                         new_ids = list(video['id'] for video in api_response['items'])
1291                         video_ids += new_ids
1292
1293                         limit = min(n, api_response['totalItems'])
1294                         pagenum += 1
1295
1296                 if len(video_ids) > n:
1297                         video_ids = video_ids[:n]
1298                 for id in video_ids:
1299                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1300                 return
1301
1302
1303 class GoogleSearchIE(InfoExtractor):
1304         """Information Extractor for Google Video search queries."""
1305         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1306         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1307         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1308         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1309         _max_google_results = 1000
1310         IE_NAME = u'video.google:search'
1311
1312         def __init__(self, downloader=None):
1313                 InfoExtractor.__init__(self, downloader)
1314
1315         def report_download_page(self, query, pagenum):
1316                 """Report attempt to download playlist page with given number."""
1317                 query = query.decode(preferredencoding())
1318                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1319
1320         def _real_extract(self, query):
1321                 mobj = re.match(self._VALID_URL, query)
1322                 if mobj is None:
1323                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1324                         return
1325
1326                 prefix, query = query.split(':')
1327                 prefix = prefix[8:]
1328                 query = query.encode('utf-8')
1329                 if prefix == '':
1330                         self._download_n_results(query, 1)
1331                         return
1332                 elif prefix == 'all':
1333                         self._download_n_results(query, self._max_google_results)
1334                         return
1335                 else:
1336                         try:
1337                                 n = long(prefix)
1338                                 if n <= 0:
1339                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1340                                         return
1341                                 elif n > self._max_google_results:
1342                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1343                                         n = self._max_google_results
1344                                 self._download_n_results(query, n)
1345                                 return
1346                         except ValueError: # parsing prefix as integer fails
1347                                 self._download_n_results(query, 1)
1348                                 return
1349
1350         def _download_n_results(self, query, n):
1351                 """Downloads a specified number of results for a query"""
1352
1353                 video_ids = []
1354                 pagenum = 0
1355
1356                 while True:
1357                         self.report_download_page(query, pagenum)
1358                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1359                         request = urllib2.Request(result_url)
1360                         try:
1361                                 page = urllib2.urlopen(request).read()
1362                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1363                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1364                                 return
1365
1366                         # Extract video identifiers
1367                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1368                                 video_id = mobj.group(1)
1369                                 if video_id not in video_ids:
1370                                         video_ids.append(video_id)
1371                                         if len(video_ids) == n:
1372                                                 # Specified n videos reached
1373                                                 for id in video_ids:
1374                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1375                                                 return
1376
1377                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1378                                 for id in video_ids:
1379                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1380                                 return
1381
1382                         pagenum = pagenum + 1
1383
1384
1385 class YahooSearchIE(InfoExtractor):
1386         """Information Extractor for Yahoo! Video search queries."""
1387         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1388         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1389         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1390         _MORE_PAGES_INDICATOR = r'\s*Next'
1391         _max_yahoo_results = 1000
1392         IE_NAME = u'video.yahoo:search'
1393
1394         def __init__(self, downloader=None):
1395                 InfoExtractor.__init__(self, downloader)
1396
1397         def report_download_page(self, query, pagenum):
1398                 """Report attempt to download playlist page with given number."""
1399                 query = query.decode(preferredencoding())
1400                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1401
1402         def _real_extract(self, query):
1403                 mobj = re.match(self._VALID_URL, query)
1404                 if mobj is None:
1405                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1406                         return
1407
1408                 prefix, query = query.split(':')
1409                 prefix = prefix[8:]
1410                 query = query.encode('utf-8')
1411                 if prefix == '':
1412                         self._download_n_results(query, 1)
1413                         return
1414                 elif prefix == 'all':
1415                         self._download_n_results(query, self._max_yahoo_results)
1416                         return
1417                 else:
1418                         try:
1419                                 n = long(prefix)
1420                                 if n <= 0:
1421                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1422                                         return
1423                                 elif n > self._max_yahoo_results:
1424                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1425                                         n = self._max_yahoo_results
1426                                 self._download_n_results(query, n)
1427                                 return
1428                         except ValueError: # parsing prefix as integer fails
1429                                 self._download_n_results(query, 1)
1430                                 return
1431
1432         def _download_n_results(self, query, n):
1433                 """Downloads a specified number of results for a query"""
1434
1435                 video_ids = []
1436                 already_seen = set()
1437                 pagenum = 1
1438
1439                 while True:
1440                         self.report_download_page(query, pagenum)
1441                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1442                         request = urllib2.Request(result_url)
1443                         try:
1444                                 page = urllib2.urlopen(request).read()
1445                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1446                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1447                                 return
1448
1449                         # Extract video identifiers
1450                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1451                                 video_id = mobj.group(1)
1452                                 if video_id not in already_seen:
1453                                         video_ids.append(video_id)
1454                                         already_seen.add(video_id)
1455                                         if len(video_ids) == n:
1456                                                 # Specified n videos reached
1457                                                 for id in video_ids:
1458                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1459                                                 return
1460
1461                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1462                                 for id in video_ids:
1463                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1464                                 return
1465
1466                         pagenum = pagenum + 1
1467
1468
1469 class YoutubePlaylistIE(InfoExtractor):
1470         """Information Extractor for YouTube playlists."""
1471
1472         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1473         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1474         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=.*?%s'
1475         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1476         IE_NAME = u'youtube:playlist'
1477
1478         def __init__(self, downloader=None):
1479                 InfoExtractor.__init__(self, downloader)
1480
1481         def report_download_page(self, playlist_id, pagenum):
1482                 """Report attempt to download playlist page with given number."""
1483                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1484
1485         def _real_extract(self, url):
1486                 # Extract playlist id
1487                 mobj = re.match(self._VALID_URL, url)
1488                 if mobj is None:
1489                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1490                         return
1491
1492                 # Single video case
1493                 if mobj.group(3) is not None:
1494                         self._downloader.download([mobj.group(3)])
1495                         return
1496
1497                 # Download playlist pages
1498                 # prefix is 'p' as default for playlists but there are other types that need extra care
1499                 playlist_prefix = mobj.group(1)
1500                 if playlist_prefix == 'a':
1501                         playlist_access = 'artist'
1502                 else:
1503                         playlist_prefix = 'p'
1504                         playlist_access = 'view_play_list'
1505                 playlist_id = mobj.group(2)
1506                 video_ids = []
1507                 pagenum = 1
1508
1509                 while True:
1510                         self.report_download_page(playlist_id, pagenum)
1511                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1512                         request = urllib2.Request(url)
1513                         try:
1514                                 page = urllib2.urlopen(request).read()
1515                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1516                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1517                                 return
1518
1519                         # Extract video identifiers
1520                         ids_in_page = []
1521                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1522                                 if mobj.group(1) not in ids_in_page:
1523                                         ids_in_page.append(mobj.group(1))
1524                         video_ids.extend(ids_in_page)
1525
1526                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1527                                 break
1528                         pagenum = pagenum + 1
1529
1530                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1531                 playlistend = self._downloader.params.get('playlistend', -1)
1532                 if playlistend == -1:
1533                         video_ids = video_ids[playliststart:]
1534                 else:
1535                         video_ids = video_ids[playliststart:playlistend]
1536
1537                 for id in video_ids:
1538                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1539                 return
1540
1541
1542 class YoutubeUserIE(InfoExtractor):
1543         """Information Extractor for YouTube users."""
1544
1545         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1546         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1547         _GDATA_PAGE_SIZE = 50
1548         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1549         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1550         IE_NAME = u'youtube:user'
1551
1552         def __init__(self, downloader=None):
1553                 InfoExtractor.__init__(self, downloader)
1554
1555         def report_download_page(self, username, start_index):
1556                 """Report attempt to download user page."""
1557                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1558                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1559
1560         def _real_extract(self, url):
1561                 # Extract username
1562                 mobj = re.match(self._VALID_URL, url)
1563                 if mobj is None:
1564                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1565                         return
1566
1567                 username = mobj.group(1)
1568
1569                 # Download video ids using YouTube Data API. Result size per
1570                 # query is limited (currently to 50 videos) so we need to query
1571                 # page by page until there are no video ids - it means we got
1572                 # all of them.
1573
1574                 video_ids = []
1575                 pagenum = 0
1576
1577                 while True:
1578                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1579                         self.report_download_page(username, start_index)
1580
1581                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1582
1583                         try:
1584                                 page = urllib2.urlopen(request).read()
1585                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1586                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1587                                 return
1588
1589                         # Extract video identifiers
1590                         ids_in_page = []
1591
1592                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1593                                 if mobj.group(1) not in ids_in_page:
1594                                         ids_in_page.append(mobj.group(1))
1595
1596                         video_ids.extend(ids_in_page)
1597
1598                         # A little optimization - if current page is not
1599                         # "full", ie. does not contain PAGE_SIZE video ids then
1600                         # we can assume that this page is the last one - there
1601                         # are no more ids on further pages - no need to query
1602                         # again.
1603
1604                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1605                                 break
1606
1607                         pagenum += 1
1608
1609                 all_ids_count = len(video_ids)
1610                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1611                 playlistend = self._downloader.params.get('playlistend', -1)
1612
1613                 if playlistend == -1:
1614                         video_ids = video_ids[playliststart:]
1615                 else:
1616                         video_ids = video_ids[playliststart:playlistend]
1617
1618                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1619                                 (username, all_ids_count, len(video_ids)))
1620
1621                 for video_id in video_ids:
1622                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1623
1624
1625 class BlipTVUserIE(InfoExtractor):
1626         """Information Extractor for blip.tv users."""
1627
1628         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1629         _PAGE_SIZE = 12
1630         IE_NAME = u'blip.tv:user'
1631
1632         def __init__(self, downloader=None):
1633                 InfoExtractor.__init__(self, downloader)
1634
1635         def report_download_page(self, username, pagenum):
1636                 """Report attempt to download user page."""
1637                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1638                                 (self.IE_NAME, username, pagenum))
1639
1640         def _real_extract(self, url):
1641                 # Extract username
1642                 mobj = re.match(self._VALID_URL, url)
1643                 if mobj is None:
1644                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1645                         return
1646
1647                 username = mobj.group(1)
1648
1649                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1650
1651                 request = urllib2.Request(url)
1652
1653                 try:
1654                         page = urllib2.urlopen(request).read().decode('utf-8')
1655                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1656                         page_base = page_base % mobj.group(1)
1657                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1658                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1659                         return
1660
1661
1662                 # Download video ids using BlipTV Ajax calls. Result size per
1663                 # query is limited (currently to 12 videos) so we need to query
1664                 # page by page until there are no video ids - it means we got
1665                 # all of them.
1666
1667                 video_ids = []
1668                 pagenum = 1
1669
1670                 while True:
1671                         self.report_download_page(username, pagenum)
1672
1673                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1674
1675                         try:
1676                                 page = urllib2.urlopen(request).read().decode('utf-8')
1677                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1678                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1679                                 return
1680
1681                         # Extract video identifiers
1682                         ids_in_page = []
1683
1684                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1685                                 if mobj.group(1) not in ids_in_page:
1686                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1687
1688                         video_ids.extend(ids_in_page)
1689
1690                         # A little optimization - if current page is not
1691                         # "full", ie. does not contain PAGE_SIZE video ids then
1692                         # we can assume that this page is the last one - there
1693                         # are no more ids on further pages - no need to query
1694                         # again.
1695
1696                         if len(ids_in_page) < self._PAGE_SIZE:
1697                                 break
1698
1699                         pagenum += 1
1700
1701                 all_ids_count = len(video_ids)
1702                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1703                 playlistend = self._downloader.params.get('playlistend', -1)
1704
1705                 if playlistend == -1:
1706                         video_ids = video_ids[playliststart:]
1707                 else:
1708                         video_ids = video_ids[playliststart:playlistend]
1709
1710                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1711                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1712
1713                 for video_id in video_ids:
1714                         self._downloader.download([u'http://blip.tv/'+video_id])
1715
1716
1717 class DepositFilesIE(InfoExtractor):
1718         """Information extractor for depositfiles.com"""
1719
1720         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1721         IE_NAME = u'DepositFiles'
1722
1723         def __init__(self, downloader=None):
1724                 InfoExtractor.__init__(self, downloader)
1725
1726         def report_download_webpage(self, file_id):
1727                 """Report webpage download."""
1728                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1729
1730         def report_extraction(self, file_id):
1731                 """Report information extraction."""
1732                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1733
1734         def _real_extract(self, url):
1735                 file_id = url.split('/')[-1]
1736                 # Rebuild url in english locale
1737                 url = 'http://depositfiles.com/en/files/' + file_id
1738
1739                 # Retrieve file webpage with 'Free download' button pressed
1740                 free_download_indication = { 'gateway_result' : '1' }
1741                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1742                 try:
1743                         self.report_download_webpage(file_id)
1744                         webpage = urllib2.urlopen(request).read()
1745                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1746                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1747                         return
1748
1749                 # Search for the real file URL
1750                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1751                 if (mobj is None) or (mobj.group(1) is None):
1752                         # Try to figure out reason of the error.
1753                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1754                         if (mobj is not None) and (mobj.group(1) is not None):
1755                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1756                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1757                         else:
1758                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1759                         return
1760
1761                 file_url = mobj.group(1)
1762                 file_extension = os.path.splitext(file_url)[1][1:]
1763
1764                 # Search for file title
1765                 mobj = re.search(r'<b title="(.*?)">', webpage)
1766                 if mobj is None:
1767                         self._downloader.trouble(u'ERROR: unable to extract title')
1768                         return
1769                 file_title = mobj.group(1).decode('utf-8')
1770
1771                 return [{
1772                         'id':           file_id.decode('utf-8'),
1773                         'url':          file_url.decode('utf-8'),
1774                         'uploader':     u'NA',
1775                         'upload_date':  u'NA',
1776                         'title':        file_title,
1777                         'ext':          file_extension.decode('utf-8'),
1778                         'format':       u'NA',
1779                         'player_url':   None,
1780                 }]
1781
1782
1783 class FacebookIE(InfoExtractor):
1784         """Information Extractor for Facebook"""
1785
1786         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1787         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1788         _NETRC_MACHINE = 'facebook'
1789         _available_formats = ['video', 'highqual', 'lowqual']
1790         _video_extensions = {
1791                 'video': 'mp4',
1792                 'highqual': 'mp4',
1793                 'lowqual': 'mp4',
1794         }
1795         IE_NAME = u'facebook'
1796
1797         def __init__(self, downloader=None):
1798                 InfoExtractor.__init__(self, downloader)
1799
1800         def _reporter(self, message):
1801                 """Add header and report message."""
1802                 self._downloader.to_screen(u'[facebook] %s' % message)
1803
1804         def report_login(self):
1805                 """Report attempt to log in."""
1806                 self._reporter(u'Logging in')
1807
1808         def report_video_webpage_download(self, video_id):
1809                 """Report attempt to download video webpage."""
1810                 self._reporter(u'%s: Downloading video webpage' % video_id)
1811
1812         def report_information_extraction(self, video_id):
1813                 """Report attempt to extract video information."""
1814                 self._reporter(u'%s: Extracting video information' % video_id)
1815
1816         def _parse_page(self, video_webpage):
1817                 """Extract video information from page"""
1818                 # General data
1819                 data = {'title': r'\("video_title", "(.*?)"\)',
1820                         'description': r'<div class="datawrap">(.*?)</div>',
1821                         'owner': r'\("video_owner_name", "(.*?)"\)',
1822                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1823                         }
1824                 video_info = {}
1825                 for piece in data.keys():
1826                         mobj = re.search(data[piece], video_webpage)
1827                         if mobj is not None:
1828                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1829
1830                 # Video urls
1831                 video_urls = {}
1832                 for fmt in self._available_formats:
1833                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1834                         if mobj is not None:
1835                                 # URL is in a Javascript segment inside an escaped Unicode format within
1836                                 # the generally utf-8 page
1837                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1838                 video_info['video_urls'] = video_urls
1839
1840                 return video_info
1841
1842         def _real_initialize(self):
1843                 if self._downloader is None:
1844                         return
1845
1846                 useremail = None
1847                 password = None
1848                 downloader_params = self._downloader.params
1849
1850                 # Attempt to use provided username and password or .netrc data
1851                 if downloader_params.get('username', None) is not None:
1852                         useremail = downloader_params['username']
1853                         password = downloader_params['password']
1854                 elif downloader_params.get('usenetrc', False):
1855                         try:
1856                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1857                                 if info is not None:
1858                                         useremail = info[0]
1859                                         password = info[2]
1860                                 else:
1861                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1862                         except (IOError, netrc.NetrcParseError), err:
1863                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1864                                 return
1865
1866                 if useremail is None:
1867                         return
1868
1869                 # Log in
1870                 login_form = {
1871                         'email': useremail,
1872                         'pass': password,
1873                         'login': 'Log+In'
1874                         }
1875                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1876                 try:
1877                         self.report_login()
1878                         login_results = urllib2.urlopen(request).read()
1879                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1880                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1881                                 return
1882                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1883                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1884                         return
1885
1886         def _real_extract(self, url):
1887                 mobj = re.match(self._VALID_URL, url)
1888                 if mobj is None:
1889                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1890                         return
1891                 video_id = mobj.group('ID')
1892
1893                 # Get video webpage
1894                 self.report_video_webpage_download(video_id)
1895                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1896                 try:
1897                         page = urllib2.urlopen(request)
1898                         video_webpage = page.read()
1899                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1900                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1901                         return
1902
1903                 # Start extracting information
1904                 self.report_information_extraction(video_id)
1905
1906                 # Extract information
1907                 video_info = self._parse_page(video_webpage)
1908
1909                 # uploader
1910                 if 'owner' not in video_info:
1911                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1912                         return
1913                 video_uploader = video_info['owner']
1914
1915                 # title
1916                 if 'title' not in video_info:
1917                         self._downloader.trouble(u'ERROR: unable to extract video title')
1918                         return
1919                 video_title = video_info['title']
1920                 video_title = video_title.decode('utf-8')
1921
1922                 # thumbnail image
1923                 if 'thumbnail' not in video_info:
1924                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1925                         video_thumbnail = ''
1926                 else:
1927                         video_thumbnail = video_info['thumbnail']
1928
1929                 # upload date
1930                 upload_date = u'NA'
1931                 if 'upload_date' in video_info:
1932                         upload_time = video_info['upload_date']
1933                         timetuple = email.utils.parsedate_tz(upload_time)
1934                         if timetuple is not None:
1935                                 try:
1936                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1937                                 except:
1938                                         pass
1939
1940                 # description
1941                 video_description = video_info.get('description', 'No description available.')
1942
1943                 url_map = video_info['video_urls']
1944                 if len(url_map.keys()) > 0:
1945                         # Decide which formats to download
1946                         req_format = self._downloader.params.get('format', None)
1947                         format_limit = self._downloader.params.get('format_limit', None)
1948
1949                         if format_limit is not None and format_limit in self._available_formats:
1950                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1951                         else:
1952                                 format_list = self._available_formats
1953                         existing_formats = [x for x in format_list if x in url_map]
1954                         if len(existing_formats) == 0:
1955                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1956                                 return
1957                         if req_format is None:
1958                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1959                         elif req_format == 'worst':
1960                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1961                         elif req_format == '-1':
1962                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1963                         else:
1964                                 # Specific format
1965                                 if req_format not in url_map:
1966                                         self._downloader.trouble(u'ERROR: requested format not available')
1967                                         return
1968                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1969
1970                 results = []
1971                 for format_param, video_real_url in video_url_list:
1972                         # Extension
1973                         video_extension = self._video_extensions.get(format_param, 'mp4')
1974
1975                         results.append({
1976                                 'id':           video_id.decode('utf-8'),
1977                                 'url':          video_real_url.decode('utf-8'),
1978                                 'uploader':     video_uploader.decode('utf-8'),
1979                                 'upload_date':  upload_date,
1980                                 'title':        video_title,
1981                                 'ext':          video_extension.decode('utf-8'),
1982                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1983                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1984                                 'description':  video_description.decode('utf-8'),
1985                                 'player_url':   None,
1986                         })
1987                 return results
1988
1989 class BlipTVIE(InfoExtractor):
1990         """Information extractor for blip.tv"""
1991
1992         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1993         _URL_EXT = r'^.*\.([a-z0-9]+)$'
1994         IE_NAME = u'blip.tv'
1995
1996         def report_extraction(self, file_id):
1997                 """Report information extraction."""
1998                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1999
2000         def report_direct_download(self, title):
2001                 """Report information extraction."""
2002                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2003
2004         def _real_extract(self, url):
2005                 mobj = re.match(self._VALID_URL, url)
2006                 if mobj is None:
2007                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2008                         return
2009
2010                 if '?' in url:
2011                         cchar = '&'
2012                 else:
2013                         cchar = '?'
2014                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2015                 request = urllib2.Request(json_url.encode('utf-8'))
2016                 self.report_extraction(mobj.group(1))
2017                 info = None
2018                 try:
2019                         urlh = urllib2.urlopen(request)
2020                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2021                                 basename = url.split('/')[-1]
2022                                 title,ext = os.path.splitext(basename)
2023                                 title = title.decode('UTF-8')
2024                                 ext = ext.replace('.', '')
2025                                 self.report_direct_download(title)
2026                                 info = {
2027                                         'id': title,
2028                                         'url': url,
2029                                         'title': title,
2030                                         'ext': ext,
2031                                         'urlhandle': urlh
2032                                 }
2033                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2034                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2035                         return
2036                 if info is None: # Regular URL
2037                         try:
2038                                 json_code = urlh.read()
2039                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2040                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2041                                 return
2042
2043                         try:
2044                                 json_data = json.loads(json_code)
2045                                 if 'Post' in json_data:
2046                                         data = json_data['Post']
2047                                 else:
2048                                         data = json_data
2049
2050                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2051                                 video_url = data['media']['url']
2052                                 umobj = re.match(self._URL_EXT, video_url)
2053                                 if umobj is None:
2054                                         raise ValueError('Can not determine filename extension')
2055                                 ext = umobj.group(1)
2056
2057                                 info = {
2058                                         'id': data['item_id'],
2059                                         'url': video_url,
2060                                         'uploader': data['display_name'],
2061                                         'upload_date': upload_date,
2062                                         'title': data['title'],
2063                                         'ext': ext,
2064                                         'format': data['media']['mimeType'],
2065                                         'thumbnail': data['thumbnailUrl'],
2066                                         'description': data['description'],
2067                                         'player_url': data['embedUrl']
2068                                 }
2069                         except (ValueError,KeyError), err:
2070                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2071                                 return
2072
2073                 std_headers['User-Agent'] = 'iTunes/10.6.1'
2074                 return [info]
2075
2076
2077 class MyVideoIE(InfoExtractor):
2078         """Information Extractor for myvideo.de."""
2079
2080         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2081         IE_NAME = u'myvideo'
2082
2083         def __init__(self, downloader=None):
2084                 InfoExtractor.__init__(self, downloader)
2085
2086         def report_download_webpage(self, video_id):
2087                 """Report webpage download."""
2088                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2089
2090         def report_extraction(self, video_id):
2091                 """Report information extraction."""
2092                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2093
2094         def _real_extract(self,url):
2095                 mobj = re.match(self._VALID_URL, url)
2096                 if mobj is None:
2097                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2098                         return
2099
2100                 video_id = mobj.group(1)
2101
2102                 # Get video webpage
2103                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2104                 try:
2105                         self.report_download_webpage(video_id)
2106                         webpage = urllib2.urlopen(request).read()
2107                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2108                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2109                         return
2110
2111                 self.report_extraction(video_id)
2112                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2113                                  webpage)
2114                 if mobj is None:
2115                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2116                         return
2117                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2118
2119                 mobj = re.search('<title>([^<]+)</title>', webpage)
2120                 if mobj is None:
2121                         self._downloader.trouble(u'ERROR: unable to extract title')
2122                         return
2123
2124                 video_title = mobj.group(1)
2125
2126                 return [{
2127                         'id':           video_id,
2128                         'url':          video_url,
2129                         'uploader':     u'NA',
2130                         'upload_date':  u'NA',
2131                         'title':        video_title,
2132                         'ext':          u'flv',
2133                         'format':       u'NA',
2134                         'player_url':   None,
2135                 }]
2136
2137 class ComedyCentralIE(InfoExtractor):
2138         """Information extractor for The Daily Show and Colbert Report """
2139
2140         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2141         IE_NAME = u'comedycentral'
2142
2143         def report_extraction(self, episode_id):
2144                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2145
2146         def report_config_download(self, episode_id):
2147                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2148
2149         def report_index_download(self, episode_id):
2150                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2151
2152         def report_player_url(self, episode_id):
2153                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2154
2155         def _real_extract(self, url):
2156                 mobj = re.match(self._VALID_URL, url)
2157                 if mobj is None:
2158                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2159                         return
2160
2161                 if mobj.group('shortname'):
2162                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2163                                 url = u'http://www.thedailyshow.com/full-episodes/'
2164                         else:
2165                                 url = u'http://www.colbertnation.com/full-episodes/'
2166                         mobj = re.match(self._VALID_URL, url)
2167                         assert mobj is not None
2168
2169                 dlNewest = not mobj.group('episode')
2170                 if dlNewest:
2171                         epTitle = mobj.group('showname')
2172                 else:
2173                         epTitle = mobj.group('episode')
2174
2175                 req = urllib2.Request(url)
2176                 self.report_extraction(epTitle)
2177                 try:
2178                         htmlHandle = urllib2.urlopen(req)
2179                         html = htmlHandle.read()
2180                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2181                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2182                         return
2183                 if dlNewest:
2184                         url = htmlHandle.geturl()
2185                         mobj = re.match(self._VALID_URL, url)
2186                         if mobj is None:
2187                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2188                                 return
2189                         if mobj.group('episode') == '':
2190                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2191                                 return
2192                         epTitle = mobj.group('episode')
2193
2194                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2195                 if len(mMovieParams) == 0:
2196                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2197                         return
2198
2199                 playerUrl_raw = mMovieParams[0][0]
2200                 self.report_player_url(epTitle)
2201                 try:
2202                         urlHandle = urllib2.urlopen(playerUrl_raw)
2203                         playerUrl = urlHandle.geturl()
2204                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2205                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2206                         return
2207
2208                 uri = mMovieParams[0][1]
2209                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2210                 self.report_index_download(epTitle)
2211                 try:
2212                         indexXml = urllib2.urlopen(indexUrl).read()
2213                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2214                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2215                         return
2216
2217                 results = []
2218
2219                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2220                 itemEls = idoc.findall('.//item')
2221                 for itemEl in itemEls:
2222                         mediaId = itemEl.findall('./guid')[0].text
2223                         shortMediaId = mediaId.split(':')[-1]
2224                         showId = mediaId.split(':')[-2].replace('.com', '')
2225                         officialTitle = itemEl.findall('./title')[0].text
2226                         officialDate = itemEl.findall('./pubDate')[0].text
2227
2228                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2229                                                 urllib.urlencode({'uri': mediaId}))
2230                         configReq = urllib2.Request(configUrl)
2231                         self.report_config_download(epTitle)
2232                         try:
2233                                 configXml = urllib2.urlopen(configReq).read()
2234                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2235                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2236                                 return
2237
2238                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2239                         turls = []
2240                         for rendition in cdoc.findall('.//rendition'):
2241                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2242                                 turls.append(finfo)
2243
2244                         if len(turls) == 0:
2245                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2246                                 continue
2247
2248                         # For now, just pick the highest bitrate
2249                         format,video_url = turls[-1]
2250
2251                         effTitle = showId + u'-' + epTitle
2252                         info = {
2253                                 'id': shortMediaId,
2254                                 'url': video_url,
2255                                 'uploader': showId,
2256                                 'upload_date': officialDate,
2257                                 'title': effTitle,
2258                                 'ext': 'mp4',
2259                                 'format': format,
2260                                 'thumbnail': None,
2261                                 'description': officialTitle,
2262                                 'player_url': playerUrl
2263                         }
2264
2265                         results.append(info)
2266
2267                 return results
2268
2269
2270 class EscapistIE(InfoExtractor):
2271         """Information extractor for The Escapist """
2272
2273         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2274         IE_NAME = u'escapist'
2275
2276         def report_extraction(self, showName):
2277                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2278
2279         def report_config_download(self, showName):
2280                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2281
2282         def _real_extract(self, url):
2283                 mobj = re.match(self._VALID_URL, url)
2284                 if mobj is None:
2285                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2286                         return
2287                 showName = mobj.group('showname')
2288                 videoId = mobj.group('episode')
2289
2290                 self.report_extraction(showName)
2291                 try:
2292                         webPage = urllib2.urlopen(url)
2293                         webPageBytes = webPage.read()
2294                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2295                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2296                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2297                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2298                         return
2299
2300                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2301                 description = unescapeHTML(descMatch.group(1))
2302                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2303                 imgUrl = unescapeHTML(imgMatch.group(1))
2304                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2305                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2306                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2307                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2308
2309                 self.report_config_download(showName)
2310                 try:
2311                         configJSON = urllib2.urlopen(configUrl).read()
2312                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2313                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2314                         return
2315
2316                 # Technically, it's JavaScript, not JSON
2317                 configJSON = configJSON.replace("'", '"')
2318
2319                 try:
2320                         config = json.loads(configJSON)
2321                 except (ValueError,), err:
2322                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2323                         return
2324
2325                 playlist = config['playlist']
2326                 videoUrl = playlist[1]['url']
2327
2328                 info = {
2329                         'id': videoId,
2330                         'url': videoUrl,
2331                         'uploader': showName,
2332                         'upload_date': None,
2333                         'title': showName,
2334                         'ext': 'flv',
2335                         'format': 'flv',
2336                         'thumbnail': imgUrl,
2337                         'description': description,
2338                         'player_url': playerUrl,
2339                 }
2340
2341                 return [info]
2342
2343
2344 class CollegeHumorIE(InfoExtractor):
2345         """Information extractor for collegehumor.com"""
2346
2347         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2348         IE_NAME = u'collegehumor'
2349
2350         def report_webpage(self, video_id):
2351                 """Report information extraction."""
2352                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2353
2354         def report_extraction(self, video_id):
2355                 """Report information extraction."""
2356                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2357
2358         def _real_extract(self, url):
2359                 mobj = re.match(self._VALID_URL, url)
2360                 if mobj is None:
2361                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2362                         return
2363                 video_id = mobj.group('videoid')
2364
2365                 self.report_webpage(video_id)
2366                 request = urllib2.Request(url)
2367                 try:
2368                         webpage = urllib2.urlopen(request).read()
2369                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2370                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2371                         return
2372
2373                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2374                 if m is None:
2375                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2376                         return
2377                 internal_video_id = m.group('internalvideoid')
2378
2379                 info = {
2380                         'id': video_id,
2381                         'internal_id': internal_video_id,
2382                 }
2383
2384                 self.report_extraction(video_id)
2385                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2386                 try:
2387                         metaXml = urllib2.urlopen(xmlUrl).read()
2388                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2389                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2390                         return
2391
2392                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2393                 try:
2394                         videoNode = mdoc.findall('./video')[0]
2395                         info['description'] = videoNode.findall('./description')[0].text
2396                         info['title'] = videoNode.findall('./caption')[0].text
2397                         info['url'] = videoNode.findall('./file')[0].text
2398                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2399                         info['ext'] = info['url'].rpartition('.')[2]
2400                         info['format'] = info['ext']
2401                 except IndexError:
2402                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2403                         return
2404
2405                 return [info]
2406
2407
2408 class XVideosIE(InfoExtractor):
2409         """Information extractor for xvideos.com"""
2410
2411         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2412         IE_NAME = u'xvideos'
2413
2414         def report_webpage(self, video_id):
2415                 """Report information extraction."""
2416                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2417
2418         def report_extraction(self, video_id):
2419                 """Report information extraction."""
2420                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2421
2422         def _real_extract(self, url):
2423                 mobj = re.match(self._VALID_URL, url)
2424                 if mobj is None:
2425                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2426                         return
2427                 video_id = mobj.group(1).decode('utf-8')
2428
2429                 self.report_webpage(video_id)
2430
2431                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2432                 try:
2433                         webpage = urllib2.urlopen(request).read()
2434                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2435                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2436                         return
2437
2438                 self.report_extraction(video_id)
2439
2440
2441                 # Extract video URL
2442                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2443                 if mobj is None:
2444                         self._downloader.trouble(u'ERROR: unable to extract video url')
2445                         return
2446                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2447
2448
2449                 # Extract title
2450                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2451                 if mobj is None:
2452                         self._downloader.trouble(u'ERROR: unable to extract video title')
2453                         return
2454                 video_title = mobj.group(1).decode('utf-8')
2455
2456
2457                 # Extract video thumbnail
2458                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2459                 if mobj is None:
2460                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2461                         return
2462                 video_thumbnail = mobj.group(0).decode('utf-8')
2463
2464                 info = {
2465                         'id': video_id,
2466                         'url': video_url,
2467                         'uploader': None,
2468                         'upload_date': None,
2469                         'title': video_title,
2470                         'ext': 'flv',
2471                         'format': 'flv',
2472                         'thumbnail': video_thumbnail,
2473                         'description': None,
2474                         'player_url': None,
2475                 }
2476
2477                 return [info]
2478
2479
2480 class SoundcloudIE(InfoExtractor):
2481         """Information extractor for soundcloud.com
2482            To access the media, the uid of the song and a stream token
2483            must be extracted from the page source and the script must make
2484            a request to media.soundcloud.com/crossdomain.xml. Then
2485            the media can be grabbed by requesting from an url composed
2486            of the stream token and uid
2487          """
2488
2489         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2490         IE_NAME = u'soundcloud'
2491
2492         def __init__(self, downloader=None):
2493                 InfoExtractor.__init__(self, downloader)
2494
2495         def report_webpage(self, video_id):
2496                 """Report information extraction."""
2497                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2498
2499         def report_extraction(self, video_id):
2500                 """Report information extraction."""
2501                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2502
2503         def _real_extract(self, url):
2504                 mobj = re.match(self._VALID_URL, url)
2505                 if mobj is None:
2506                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2507                         return
2508
2509                 # extract uploader (which is in the url)
2510                 uploader = mobj.group(1).decode('utf-8')
2511                 # extract simple title (uploader + slug of song title)
2512                 slug_title =  mobj.group(2).decode('utf-8')
2513                 simple_title = uploader + u'-' + slug_title
2514
2515                 self.report_webpage('%s/%s' % (uploader, slug_title))
2516
2517                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2518                 try:
2519                         webpage = urllib2.urlopen(request).read()
2520                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2521                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2522                         return
2523
2524                 self.report_extraction('%s/%s' % (uploader, slug_title))
2525
2526                 # extract uid and stream token that soundcloud hands out for access
2527                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2528                 if mobj:
2529                         video_id = mobj.group(1)
2530                         stream_token = mobj.group(2)
2531
2532                 # extract unsimplified title
2533                 mobj = re.search('"title":"(.*?)",', webpage)
2534                 if mobj:
2535                         title = mobj.group(1).decode('utf-8')
2536                 else:
2537                         title = simple_title
2538
2539                 # construct media url (with uid/token)
2540                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2541                 mediaURL = mediaURL % (video_id, stream_token)
2542
2543                 # description
2544                 description = u'No description available'
2545                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2546                 if mobj:
2547                         description = mobj.group(1)
2548
2549                 # upload date
2550                 upload_date = None
2551                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2552                 if mobj:
2553                         try:
2554                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2555                         except Exception, e:
2556                                 self._downloader.to_stderr(str(e))
2557
2558                 # for soundcloud, a request to a cross domain is required for cookies
2559                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2560
2561                 return [{
2562                         'id':           video_id.decode('utf-8'),
2563                         'url':          mediaURL,
2564                         'uploader':     uploader.decode('utf-8'),
2565                         'upload_date':  upload_date,
2566                         'title':        title,
2567                         'ext':          u'mp3',
2568                         'format':       u'NA',
2569                         'player_url':   None,
2570                         'description': description.decode('utf-8')
2571                 }]
2572
2573
2574 class InfoQIE(InfoExtractor):
2575         """Information extractor for infoq.com"""
2576
2577         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2578         IE_NAME = u'infoq'
2579
2580         def report_webpage(self, video_id):
2581                 """Report information extraction."""
2582                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2583
2584         def report_extraction(self, video_id):
2585                 """Report information extraction."""
2586                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2587
2588         def _real_extract(self, url):
2589                 mobj = re.match(self._VALID_URL, url)
2590                 if mobj is None:
2591                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2592                         return
2593
2594                 self.report_webpage(url)
2595
2596                 request = urllib2.Request(url)
2597                 try:
2598                         webpage = urllib2.urlopen(request).read()
2599                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2600                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2601                         return
2602
2603                 self.report_extraction(url)
2604
2605
2606                 # Extract video URL
2607                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2608                 if mobj is None:
2609                         self._downloader.trouble(u'ERROR: unable to extract video url')
2610                         return
2611                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2612
2613
2614                 # Extract title
2615                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2616                 if mobj is None:
2617                         self._downloader.trouble(u'ERROR: unable to extract video title')
2618                         return
2619                 video_title = mobj.group(1).decode('utf-8')
2620
2621                 # Extract description
2622                 video_description = u'No description available.'
2623                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2624                 if mobj is not None:
2625                         video_description = mobj.group(1).decode('utf-8')
2626
2627                 video_filename = video_url.split('/')[-1]
2628                 video_id, extension = video_filename.split('.')
2629
2630                 info = {
2631                         'id': video_id,
2632                         'url': video_url,
2633                         'uploader': None,
2634                         'upload_date': None,
2635                         'title': video_title,
2636                         'ext': extension,
2637                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2638                         'thumbnail': None,
2639                         'description': video_description,
2640                         'player_url': None,
2641                 }
2642
2643                 return [info]
2644
2645 class MixcloudIE(InfoExtractor):
2646         """Information extractor for www.mixcloud.com"""
2647         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2648         IE_NAME = u'mixcloud'
2649
2650         def __init__(self, downloader=None):
2651                 InfoExtractor.__init__(self, downloader)
2652
2653         def report_download_json(self, file_id):
2654                 """Report JSON download."""
2655                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2656
2657         def report_extraction(self, file_id):
2658                 """Report information extraction."""
2659                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2660
2661         def get_urls(self, jsonData, fmt, bitrate='best'):
2662                 """Get urls from 'audio_formats' section in json"""
2663                 file_url = None
2664                 try:
2665                         bitrate_list = jsonData[fmt]
2666                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2667                                 bitrate = max(bitrate_list) # select highest
2668
2669                         url_list = jsonData[fmt][bitrate]
2670                 except TypeError: # we have no bitrate info.
2671                         url_list = jsonData[fmt]
2672                 return url_list
2673
2674         def check_urls(self, url_list):
2675                 """Returns 1st active url from list"""
2676                 for url in url_list:
2677                         try:
2678                                 urllib2.urlopen(url)
2679                                 return url
2680                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2681                                 url = None
2682
2683                 return None
2684
2685         def _print_formats(self, formats):
2686                 print 'Available formats:'
2687                 for fmt in formats.keys():
2688                         for b in formats[fmt]:
2689                                 try:
2690                                         ext = formats[fmt][b][0]
2691                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2692                                 except TypeError: # we have no bitrate info
2693                                         ext = formats[fmt][0]
2694                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2695                                         break
2696
2697         def _real_extract(self, url):
2698                 mobj = re.match(self._VALID_URL, url)
2699                 if mobj is None:
2700                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2701                         return
2702                 # extract uploader & filename from url
2703                 uploader = mobj.group(1).decode('utf-8')
2704                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2705
2706                 # construct API request
2707                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2708                 # retrieve .json file with links to files
2709                 request = urllib2.Request(file_url)
2710                 try:
2711                         self.report_download_json(file_url)
2712                         jsonData = urllib2.urlopen(request).read()
2713                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2714                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2715                         return
2716
2717                 # parse JSON
2718                 json_data = json.loads(jsonData)
2719                 player_url = json_data['player_swf_url']
2720                 formats = dict(json_data['audio_formats'])
2721
2722                 req_format = self._downloader.params.get('format', None)
2723                 bitrate = None
2724
2725                 if self._downloader.params.get('listformats', None):
2726                         self._print_formats(formats)
2727                         return
2728
2729                 if req_format is None or req_format == 'best':
2730                         for format_param in formats.keys():
2731                                 url_list = self.get_urls(formats, format_param)
2732                                 # check urls
2733                                 file_url = self.check_urls(url_list)
2734                                 if file_url is not None:
2735                                         break # got it!
2736                 else:
2737                         if req_format not in formats.keys():
2738                                 self._downloader.trouble(u'ERROR: format is not available')
2739                                 return
2740
2741                         url_list = self.get_urls(formats, req_format)
2742                         file_url = self.check_urls(url_list)
2743                         format_param = req_format
2744
2745                 return [{
2746                         'id': file_id.decode('utf-8'),
2747                         'url': file_url.decode('utf-8'),
2748                         'uploader':     uploader.decode('utf-8'),
2749                         'upload_date': u'NA',
2750                         'title': json_data['name'],
2751                         'ext': file_url.split('.')[-1].decode('utf-8'),
2752                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2753                         'thumbnail': json_data['thumbnail_url'],
2754                         'description': json_data['description'],
2755                         'player_url': player_url.decode('utf-8'),
2756                 }]
2757
2758 class StanfordOpenClassroomIE(InfoExtractor):
2759         """Information extractor for Stanford's Open ClassRoom"""
2760
2761         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2762         IE_NAME = u'stanfordoc'
2763
2764         def report_download_webpage(self, objid):
2765                 """Report information extraction."""
2766                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2767
2768         def report_extraction(self, video_id):
2769                 """Report information extraction."""
2770                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2771
2772         def _real_extract(self, url):
2773                 mobj = re.match(self._VALID_URL, url)
2774                 if mobj is None:
2775                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2776                         return
2777
2778                 if mobj.group('course') and mobj.group('video'): # A specific video
2779                         course = mobj.group('course')
2780                         video = mobj.group('video')
2781                         info = {
2782                                 'id': course + '_' + video,
2783                         }
2784
2785                         self.report_extraction(info['id'])
2786                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2787                         xmlUrl = baseUrl + video + '.xml'
2788                         try:
2789                                 metaXml = urllib2.urlopen(xmlUrl).read()
2790                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2791                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2792                                 return
2793                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2794                         try:
2795                                 info['title'] = mdoc.findall('./title')[0].text
2796                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2797                         except IndexError:
2798                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2799                                 return
2800                         info['ext'] = info['url'].rpartition('.')[2]
2801                         info['format'] = info['ext']
2802                         return [info]
2803                 elif mobj.group('course'): # A course page
2804                         course = mobj.group('course')
2805                         info = {
2806                                 'id': course,
2807                                 'type': 'playlist',
2808                         }
2809
2810                         self.report_download_webpage(info['id'])
2811                         try:
2812                                 coursepage = urllib2.urlopen(url).read()
2813                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2814                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2815                                 return
2816
2817                         m = re.search('<h1>([^<]+)</h1>', coursepage)
2818                         if m:
2819                                 info['title'] = unescapeHTML(m.group(1))
2820                         else:
2821                                 info['title'] = info['id']
2822
2823                         m = re.search('<description>([^<]+)</description>', coursepage)
2824                         if m:
2825                                 info['description'] = unescapeHTML(m.group(1))
2826
2827                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2828                         info['list'] = [
2829                                 {
2830                                         'type': 'reference',
2831                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2832                                 }
2833                                         for vpage in links]
2834                         results = []
2835                         for entry in info['list']:
2836                                 assert entry['type'] == 'reference'
2837                                 results += self.extract(entry['url'])
2838                         return results
2839
2840                 else: # Root page
2841                         info = {
2842                                 'id': 'Stanford OpenClassroom',
2843                                 'type': 'playlist',
2844                         }
2845
2846                         self.report_download_webpage(info['id'])
2847                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2848                         try:
2849                                 rootpage = urllib2.urlopen(rootURL).read()
2850                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2851                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2852                                 return
2853
2854                         info['title'] = info['id']
2855
2856                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2857                         info['list'] = [
2858                                 {
2859                                         'type': 'reference',
2860                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2861                                 }
2862                                         for cpage in links]
2863
2864                         results = []
2865                         for entry in info['list']:
2866                                 assert entry['type'] == 'reference'
2867                                 results += self.extract(entry['url'])
2868                         return results
2869
2870 class MTVIE(InfoExtractor):
2871         """Information extractor for MTV.com"""
2872
2873         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2874         IE_NAME = u'mtv'
2875
2876         def report_webpage(self, video_id):
2877                 """Report information extraction."""
2878                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2879
2880         def report_extraction(self, video_id):
2881                 """Report information extraction."""
2882                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2883
2884         def _real_extract(self, url):
2885                 mobj = re.match(self._VALID_URL, url)
2886                 if mobj is None:
2887                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2888                         return
2889                 if not mobj.group('proto'):
2890                         url = 'http://' + url
2891                 video_id = mobj.group('videoid')
2892                 self.report_webpage(video_id)
2893
2894                 request = urllib2.Request(url)
2895                 try:
2896                         webpage = urllib2.urlopen(request).read()
2897                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2898                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2899                         return
2900
2901                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2902                 if mobj is None:
2903                         self._downloader.trouble(u'ERROR: unable to extract song name')
2904                         return
2905                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2906                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2907                 if mobj is None:
2908                         self._downloader.trouble(u'ERROR: unable to extract performer')
2909                         return
2910                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2911                 video_title = performer + ' - ' + song_name
2912
2913                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2914                 if mobj is None:
2915                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2916                         return
2917                 mtvn_uri = mobj.group(1)
2918
2919                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2920                 if mobj is None:
2921                         self._downloader.trouble(u'ERROR: unable to extract content id')
2922                         return
2923                 content_id = mobj.group(1)
2924
2925                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2926                 self.report_extraction(video_id)
2927                 request = urllib2.Request(videogen_url)
2928                 try:
2929                         metadataXml = urllib2.urlopen(request).read()
2930                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2931                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2932                         return
2933
2934                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2935                 renditions = mdoc.findall('.//rendition')
2936
2937                 # For now, always pick the highest quality.
2938                 rendition = renditions[-1]
2939
2940                 try:
2941                         _,_,ext = rendition.attrib['type'].partition('/')
2942                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2943                         video_url = rendition.find('./src').text
2944                 except KeyError:
2945                         self._downloader.trouble('Invalid rendition field.')
2946                         return
2947
2948                 info = {
2949                         'id': video_id,
2950                         'url': video_url,
2951                         'uploader': performer,
2952                         'title': video_title,
2953                         'ext': ext,
2954                         'format': format,
2955                 }
2956
2957                 return [info]
2958
2959
2960 class YoukuIE(InfoExtractor):
2961
2962         _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2963         IE_NAME = u'Youku'
2964
2965         def __init__(self, downloader=None):
2966                 InfoExtractor.__init__(self, downloader)
2967
2968         def report_download_webpage(self, file_id):
2969                 """Report webpage download."""
2970                 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
2971
2972         def report_extraction(self, file_id):
2973                 """Report information extraction."""
2974                 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
2975
2976         def _gen_sid(self):
2977                 nowTime = int(time.time() * 1000)
2978                 random1 = random.randint(1000,1998)
2979                 random2 = random.randint(1000,9999)
2980
2981                 return "%d%d%d" %(nowTime,random1,random2)
2982
2983         def _get_file_ID_mix_string(self, seed):
2984                 mixed = []
2985                 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2986                 seed = float(seed)
2987                 for i in range(len(source)):
2988                         seed  =  (seed * 211 + 30031 ) % 65536
2989                         index  =  math.floor(seed / 65536 * len(source) )
2990                         mixed.append(source[int(index)])
2991                         source.remove(source[int(index)])
2992                 #return ''.join(mixed)
2993                 return mixed
2994
2995         def _get_file_id(self, fileId, seed):
2996                 mixed = self._get_file_ID_mix_string(seed)
2997                 ids = fileId.split('*')
2998                 realId = []
2999                 for ch in ids:
3000                         if ch:
3001                                 realId.append(mixed[int(ch)])
3002                 return ''.join(realId)
3003
3004         def _real_extract(self, url):
3005                 mobj = re.match(self._VALID_URL, url)
3006                 if mobj is None:
3007                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3008                         return
3009                 video_id = mobj.group('ID')
3010
3011                 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3012
3013                 request = urllib2.Request(info_url, None, std_headers)
3014                 try:
3015                         self.report_download_webpage(video_id)
3016                         jsondata = urllib2.urlopen(request).read()
3017                 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3018                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3019                         return
3020
3021                 self.report_extraction(video_id)
3022                 try:
3023                         config = json.loads(jsondata)
3024
3025                         video_title =  config['data'][0]['title']
3026                         seed = config['data'][0]['seed']
3027
3028                         format = self._downloader.params.get('format', None)
3029                         supported_format = config['data'][0]['streamfileids'].keys()
3030
3031                         if format is None or format == 'best':
3032                                 if 'hd2' in supported_format:
3033                                         format = 'hd2'
3034                                 else:
3035                                         format = 'flv'
3036                                 ext = u'flv'
3037                         elif format == 'worst':
3038                                 format = 'mp4'
3039                                 ext = u'mp4'
3040                         else:
3041                                 format = 'flv'
3042                                 ext = u'flv'
3043
3044
3045                         fileid = config['data'][0]['streamfileids'][format]
3046                         seg_number = len(config['data'][0]['segs'][format])
3047
3048                         keys=[]
3049                         for i in xrange(seg_number):
3050                                 keys.append(config['data'][0]['segs'][format][i]['k'])
3051
3052                         #TODO check error
3053                         #youku only could be viewed from mainland china
3054                 except:
3055                         self._downloader.trouble(u'ERROR: unable to extract info section')
3056                         return
3057
3058                 files_info=[]
3059                 sid = self._gen_sid()
3060                 fileid = self._get_file_id(fileid, seed)
3061
3062                 #column 8,9 of fileid represent the segment number
3063                 #fileid[7:9] should be changed
3064                 for index, key in enumerate(keys):
3065
3066                         temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3067                         download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3068
3069                         info = {
3070                                 'id': '%s_part%02d' % (video_id, index),
3071                                 'url': download_url,
3072                                 'uploader': None,
3073                                 'title': video_title,
3074                                 'ext': ext,
3075                                 'format': u'NA'
3076                         }
3077                         files_info.append(info)
3078
3079                 return files_info
3080
3081
3082 class XNXXIE(InfoExtractor):
3083         """Information extractor for xnxx.com"""
3084
3085         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3086         IE_NAME = u'xnxx'
3087         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3088         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3089         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3090
3091         def report_webpage(self, video_id):
3092                 """Report information extraction"""
3093                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3094
3095         def report_extraction(self, video_id):
3096                 """Report information extraction"""
3097                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3098
3099         def _real_extract(self, url):
3100                 mobj = re.match(self._VALID_URL, url)
3101                 if mobj is None:
3102                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3103                         return
3104                 video_id = mobj.group(1).decode('utf-8')
3105
3106                 self.report_webpage(video_id)
3107
3108                 # Get webpage content
3109                 try:
3110                         webpage = urllib2.urlopen(url).read()
3111                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3112                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3113                         return
3114
3115                 result = re.search(self.VIDEO_URL_RE, webpage)
3116                 if result is None:
3117                         self._downloader.trouble(u'ERROR: unable to extract video url')
3118                         return
3119                 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3120
3121                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3122                 if result is None:
3123                         self._downloader.trouble(u'ERROR: unable to extract video title')
3124                         return
3125                 video_title = result.group(1).decode('utf-8')
3126
3127                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3128                 if result is None:
3129                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3130                         return
3131                 video_thumbnail = result.group(1).decode('utf-8')
3132
3133                 info = {'id': video_id,
3134                                 'url': video_url,
3135                                 'uploader': None,
3136                                 'upload_date': None,
3137                                 'title': video_title,
3138                                 'ext': 'flv',
3139                                 'format': 'flv',
3140                                 'thumbnail': video_thumbnail,
3141                                 'description': None,
3142                                 'player_url': None}
3143
3144                 return [info]