DailymotionIE: some videos do not use the "hqURL", "sdURL", "ldURL" keywords. In...
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21         import cStringIO as StringIO
22 except ImportError:
23         import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
        """Base class for all information extractors.

        Given a URL, an extractor produces one dictionary per video with the
        real media URL and associated metadata, which the FileDownloader then
        consumes (and possibly downloads). Each dictionary must provide:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        Optionally (used only by the forced-printing helpers, e.g. for search
        front-ends such as youtube2mp3):

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should override _real_initialize() and _real_extract(),
        define a _VALID_URL regexp, and normally be registered in the list of
        extractors.
        """

        # Class-level defaults; instances overwrite both in __init__.
        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally wiring in a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        def suitable(self, url):
                """Return True when this extractor can handle *url*."""
                return re.match(self._VALID_URL, url) is not None

        def initialize(self):
                """Run one-time setup (authentication, etc.) at most once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed, then return the extracted info dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Attach the FileDownloader this extractor reports to."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
95
96
97 class YoutubeIE(InfoExtractor):
98         """Information extractor for youtube.com."""
99
100         _VALID_URL = r"""^
101                          (
102                              (?:https?://)?                                       # http(s):// (optional)
103                              (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
104                                 tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
105                              (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
106                              (?:                                                  # the various things that can precede the ID:
107                                  (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
108                                  |(?:                                             # or the v= param in all its forms
109                                      (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
110                                      (?:\?|\#!?)                                  # the params delimiter ? or # or #!
111                                      (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
112                                      v=
113                                  )
114                              )?                                                   # optional -> youtube.com/xxxx is OK
115                          )?                                                       # all until now is optional -> you can pass the naked ID
116                          ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
117                          (?(1).+)?                                                # if we found the ID, everything can follow
118                          $"""
119         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
120         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
121         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
122         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
123         _NETRC_MACHINE = 'youtube'
124         # Listed in order of quality
125         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
126         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
127         _video_extensions = {
128                 '13': '3gp',
129                 '17': 'mp4',
130                 '18': 'mp4',
131                 '22': 'mp4',
132                 '37': 'mp4',
133                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
134                 '43': 'webm',
135                 '44': 'webm',
136                 '45': 'webm',
137                 '46': 'webm',
138         }
139         _video_dimensions = {
140                 '5': '240x400',
141                 '6': '???',
142                 '13': '???',
143                 '17': '144x176',
144                 '18': '360x640',
145                 '22': '720x1280',
146                 '34': '360x640',
147                 '35': '480x854',
148                 '37': '1080x1920',
149                 '38': '3072x4096',
150                 '43': '360x640',
151                 '44': '480x854',
152                 '45': '720x1280',
153                 '46': '1080x1920',
154         }       
155         IE_NAME = u'youtube'
156
157         def suitable(self, url):
158                 """Receives a URL and returns True if suitable for this IE."""
159                 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
160
161         def report_lang(self):
162                 """Report attempt to set language."""
163                 self._downloader.to_screen(u'[youtube] Setting language')
164
165         def report_login(self):
166                 """Report attempt to log in."""
167                 self._downloader.to_screen(u'[youtube] Logging in')
168
169         def report_age_confirmation(self):
170                 """Report attempt to confirm age."""
171                 self._downloader.to_screen(u'[youtube] Confirming age')
172
173         def report_video_webpage_download(self, video_id):
174                 """Report attempt to download video webpage."""
175                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
176
177         def report_video_info_webpage_download(self, video_id):
178                 """Report attempt to download video info webpage."""
179                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
180
181         def report_video_subtitles_download(self, video_id):
182                 """Report attempt to download video info webpage."""
183                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
184
185         def report_information_extraction(self, video_id):
186                 """Report attempt to extract video information."""
187                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
188
189         def report_unavailable_format(self, video_id, format):
190                 """Report extracted video URL."""
191                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
192
193         def report_rtmp_download(self):
194                 """Indicate the download will use the RTMP protocol."""
195                 self._downloader.to_screen(u'[youtube] RTMP download detected')
196
197         def _closed_captions_xml_to_srt(self, xml_string):
198                 srt = ''
199                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
200                 # TODO parse xml instead of regex
201                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
202                         if not dur: dur = '4'
203                         start = float(start)
204                         end = start + float(dur)
205                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
206                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
207                         caption = unescapeHTML(caption)
208                         caption = unescapeHTML(caption) # double cycle, intentional
209                         srt += str(n+1) + '\n'
210                         srt += start + ' --> ' + end + '\n'
211                         srt += caption + '\n\n'
212                 return srt
213
214         def _print_formats(self, formats):
215                 print 'Available formats:'
216                 for x in formats:
217                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
218
219         def _real_initialize(self):
220                 if self._downloader is None:
221                         return
222
223                 username = None
224                 password = None
225                 downloader_params = self._downloader.params
226
227                 # Attempt to use provided username and password or .netrc data
228                 if downloader_params.get('username', None) is not None:
229                         username = downloader_params['username']
230                         password = downloader_params['password']
231                 elif downloader_params.get('usenetrc', False):
232                         try:
233                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
234                                 if info is not None:
235                                         username = info[0]
236                                         password = info[2]
237                                 else:
238                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
239                         except (IOError, netrc.NetrcParseError), err:
240                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
241                                 return
242
243                 # Set language
244                 request = urllib2.Request(self._LANG_URL)
245                 try:
246                         self.report_lang()
247                         urllib2.urlopen(request).read()
248                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
249                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
250                         return
251
252                 # No authentication to be performed
253                 if username is None:
254                         return
255
256                 # Log in
257                 login_form = {
258                                 'current_form': 'loginForm',
259                                 'next':         '/',
260                                 'action_login': 'Log In',
261                                 'username':     username,
262                                 'password':     password,
263                                 }
264                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
265                 try:
266                         self.report_login()
267                         login_results = urllib2.urlopen(request).read()
268                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
269                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
270                                 return
271                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
272                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
273                         return
274
275                 # Confirm age
276                 age_form = {
277                                 'next_url':             '/',
278                                 'action_confirm':       'Confirm',
279                                 }
280                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
281                 try:
282                         self.report_age_confirmation()
283                         age_results = urllib2.urlopen(request).read()
284                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
285                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
286                         return
287
288         def _real_extract(self, url):
289                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
290                 mobj = re.search(self._NEXT_URL_RE, url)
291                 if mobj:
292                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
293
294                 # Extract video id from URL
295                 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
296                 if mobj is None:
297                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
298                         return
299                 video_id = mobj.group(2)
300
301                 # Get video webpage
302                 self.report_video_webpage_download(video_id)
303                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
304                 try:
305                         video_webpage = urllib2.urlopen(request).read()
306                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
307                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
308                         return
309
310                 # Attempt to extract SWF player URL
311                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
312                 if mobj is not None:
313                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
314                 else:
315                         player_url = None
316
317                 # Get video info
318                 self.report_video_info_webpage_download(video_id)
319                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
320                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
321                                         % (video_id, el_type))
322                         request = urllib2.Request(video_info_url)
323                         try:
324                                 video_info_webpage = urllib2.urlopen(request).read()
325                                 video_info = parse_qs(video_info_webpage)
326                                 if 'token' in video_info:
327                                         break
328                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
329                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
330                                 return
331                 if 'token' not in video_info:
332                         if 'reason' in video_info:
333                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
334                         else:
335                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
336                         return
337
338                 # Check for "rental" videos
339                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
340                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
341                         return
342
343                 # Start extracting information
344                 self.report_information_extraction(video_id)
345
346                 # uploader
347                 if 'author' not in video_info:
348                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
349                         return
350                 video_uploader = urllib.unquote_plus(video_info['author'][0])
351
352                 # title
353                 if 'title' not in video_info:
354                         self._downloader.trouble(u'ERROR: unable to extract video title')
355                         return
356                 video_title = urllib.unquote_plus(video_info['title'][0])
357                 video_title = video_title.decode('utf-8')
358
359                 # thumbnail image
360                 if 'thumbnail_url' not in video_info:
361                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
362                         video_thumbnail = ''
363                 else:   # don't panic if we can't find it
364                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
365
366                 # upload date
367                 upload_date = u'NA'
368                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
369                 if mobj is not None:
370                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
371                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
372                         for expression in format_expressions:
373                                 try:
374                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
375                                 except:
376                                         pass
377
378                 # description
379                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
380                 if video_description: video_description = clean_html(video_description)
381                 else: video_description = ''
382                         
383                 # closed captions
384                 video_subtitles = None
385                 if self._downloader.params.get('writesubtitles', False):
386                         try:
387                                 self.report_video_subtitles_download(video_id)
388                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
389                                 try:
390                                         srt_list = urllib2.urlopen(request).read()
391                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
392                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
393                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
394                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
395                                 if not srt_lang_list:
396                                         raise Trouble(u'WARNING: video has no closed captions')
397                                 if self._downloader.params.get('subtitleslang', False):
398                                         srt_lang = self._downloader.params.get('subtitleslang')
399                                 elif 'en' in srt_lang_list:
400                                         srt_lang = 'en'
401                                 else:
402                                         srt_lang = srt_lang_list.keys()[0]
403                                 if not srt_lang in srt_lang_list:
404                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
405                                 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
406                                 try:
407                                         srt_xml = urllib2.urlopen(request).read()
408                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
409                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
410                                 if not srt_xml:
411                                         raise Trouble(u'WARNING: unable to download video subtitles')
412                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
413                         except Trouble as trouble:
414                                 self._downloader.trouble(trouble[0])
415
416                 # token
417                 video_token = urllib.unquote_plus(video_info['token'][0])
418
419                 # Decide which formats to download
420                 req_format = self._downloader.params.get('format', None)
421
422                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
423                         self.report_rtmp_download()
424                         video_url_list = [(None, video_info['conn'][0])]
425                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
426                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
427                         url_data = [parse_qs(uds) for uds in url_data_strs]
428                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
429                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
430
431                         format_limit = self._downloader.params.get('format_limit', None)
432                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
433                         if format_limit is not None and format_limit in available_formats:
434                                 format_list = available_formats[available_formats.index(format_limit):]
435                         else:
436                                 format_list = available_formats
437                         existing_formats = [x for x in format_list if x in url_map]
438                         if len(existing_formats) == 0:
439                                 self._downloader.trouble(u'ERROR: no known formats available for video')
440                                 return
441                         if self._downloader.params.get('listformats', None):
442                                 self._print_formats(existing_formats)
443                                 return
444                         if req_format is None or req_format == 'best':
445                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
446                         elif req_format == 'worst':
447                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
448                         elif req_format in ('-1', 'all'):
449                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
450                         else:
451                                 # Specific formats. We pick the first in a slash-delimeted sequence.
452                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
453                                 req_formats = req_format.split('/')
454                                 video_url_list = None
455                                 for rf in req_formats:
456                                         if rf in url_map:
457                                                 video_url_list = [(rf, url_map[rf])]
458                                                 break
459                                 if video_url_list is None:
460                                         self._downloader.trouble(u'ERROR: requested format not available')
461                                         return
462                 else:
463                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
464                         return
465
466                 results = []
467                 for format_param, video_real_url in video_url_list:
468                         # Extension
469                         video_extension = self._video_extensions.get(format_param, 'flv')
470
471                         results.append({
472                                 'id':           video_id.decode('utf-8'),
473                                 'url':          video_real_url.decode('utf-8'),
474                                 'uploader':     video_uploader.decode('utf-8'),
475                                 'upload_date':  upload_date,
476                                 'title':        video_title,
477                                 'ext':          video_extension.decode('utf-8'),
478                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
479                                 'thumbnail':    video_thumbnail.decode('utf-8'),
480                                 'description':  video_description,
481                                 'player_url':   player_url,
482                                 'subtitles':    video_subtitles
483                         })
484                 return results
485
486
487 class MetacafeIE(InfoExtractor):
488         """Information Extractor for metacafe.com."""
489
490         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
491         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
492         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
493         IE_NAME = u'metacafe'
494
495         def __init__(self, downloader=None):
496                 InfoExtractor.__init__(self, downloader)
497
498         def report_disclaimer(self):
499                 """Report disclaimer retrieval."""
500                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
501
502         def report_age_confirmation(self):
503                 """Report attempt to confirm age."""
504                 self._downloader.to_screen(u'[metacafe] Confirming age')
505
506         def report_download_webpage(self, video_id):
507                 """Report webpage download."""
508                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
509
510         def report_extraction(self, video_id):
511                 """Report information extraction."""
512                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
513
514         def _real_initialize(self):
515                 # Retrieve disclaimer
516                 request = urllib2.Request(self._DISCLAIMER)
517                 try:
518                         self.report_disclaimer()
519                         disclaimer = urllib2.urlopen(request).read()
520                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
521                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
522                         return
523
524                 # Confirm age
525                 disclaimer_form = {
526                         'filters': '0',
527                         'submit': "Continue - I'm over 18",
528                         }
529                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
530                 try:
531                         self.report_age_confirmation()
532                         disclaimer = urllib2.urlopen(request).read()
533                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
534                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
535                         return
536
	def _real_extract(self, url):
		"""Extract video information from a Metacafe watch page.

		Returns a one-element list containing the standard info
		dictionary, or None after reporting trouble on failure.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		# ("yt-<id>" identifiers are YouTube mirrors; hand the canonical
		# YouTube URL back to the downloader so its extractor handles it)
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
			return

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			# Old-style page: the media URL appears directly in the markup.
			mediaURL = urllib.unquote(mobj.group(1))
			# The last three characters of the URL serve as the extension.
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			# (an access token some media URLs require, passed as __gda__)
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Newer pages keep the media data inside the player's
			# flashvars parameter, URL-encoded.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# The URL is JSON-escaped; undo the escaped forward slashes.
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
			'format':	u'NA',
			'player_url':	None,
		}]
614
615
616 class DailymotionIE(InfoExtractor):
617         """Information Extractor for Dailymotion"""
618
619         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
620         IE_NAME = u'dailymotion'
621
622         def __init__(self, downloader=None):
623                 InfoExtractor.__init__(self, downloader)
624
625         def report_download_webpage(self, video_id):
626                 """Report webpage download."""
627                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
628
629         def report_extraction(self, video_id):
630                 """Report information extraction."""
631                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
632
633         def _real_extract(self, url):
634                 # Extract id and simplified title from URL
635                 mobj = re.match(self._VALID_URL, url)
636                 if mobj is None:
637                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
638                         return
639
640                 video_id = mobj.group(1)
641
642                 video_extension = 'mp4'
643
644                 # Retrieve video webpage to extract further information
645                 request = urllib2.Request(url)
646                 request.add_header('Cookie', 'family_filter=off')
647                 try:
648                         self.report_download_webpage(video_id)
649                         webpage = urllib2.urlopen(request).read()
650                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
651                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
652                         return
653
654                 # Extract URL, uploader and title from webpage
655                 self.report_extraction(video_id)
656                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
657                 if mobj is None:
658                         self._downloader.trouble(u'ERROR: unable to extract media URL')
659                         return
660                 flashvars = urllib.unquote(mobj.group(1))
661                 if 'hqURL' in flashvars: max_quality = 'hqURL'
662                 elif 'sdURL' in flashvars: max_quality = 'sdURL'
663                 else: max_quality = 'ldURL'
664                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
665                 if mobj is None:
666                         mobj = re.search(r'"video_url":"(.*?)",', urllib.unquote(webpage))
667                 if mobj is None:
668                         self._downloader.trouble(u'ERROR: unable to extract media URL')
669                         return
670                 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
671
672                 # TODO: support choosing qualities
673
674                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
675                 if mobj is None:
676                         self._downloader.trouble(u'ERROR: unable to extract title')
677                         return
678                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
679
680                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
681                 if mobj is None:
682                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
683                         return
684                 video_uploader = mobj.group(1)
685
686                 return [{
687                         'id':           video_id.decode('utf-8'),
688                         'url':          video_url.decode('utf-8'),
689                         'uploader':     video_uploader.decode('utf-8'),
690                         'upload_date':  u'NA',
691                         'title':        video_title,
692                         'ext':          video_extension.decode('utf-8'),
693                         'format':       u'NA',
694                         'player_url':   None,
695                 }]
696
697
698 class GoogleIE(InfoExtractor):
699         """Information extractor for video.google.com."""
700
701         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
702         IE_NAME = u'video.google'
703
704         def __init__(self, downloader=None):
705                 InfoExtractor.__init__(self, downloader)
706
707         def report_download_webpage(self, video_id):
708                 """Report webpage download."""
709                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
710
711         def report_extraction(self, video_id):
712                 """Report information extraction."""
713                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
714
715         def _real_extract(self, url):
716                 # Extract id from URL
717                 mobj = re.match(self._VALID_URL, url)
718                 if mobj is None:
719                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
720                         return
721
722                 video_id = mobj.group(1)
723
724                 video_extension = 'mp4'
725
726                 # Retrieve video webpage to extract further information
727                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
728                 try:
729                         self.report_download_webpage(video_id)
730                         webpage = urllib2.urlopen(request).read()
731                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
732                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
733                         return
734
735                 # Extract URL, uploader, and title from webpage
736                 self.report_extraction(video_id)
737                 mobj = re.search(r"download_url:'([^']+)'", webpage)
738                 if mobj is None:
739                         video_extension = 'flv'
740                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
741                 if mobj is None:
742                         self._downloader.trouble(u'ERROR: unable to extract media URL')
743                         return
744                 mediaURL = urllib.unquote(mobj.group(1))
745                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
746                 mediaURL = mediaURL.replace('\\x26', '\x26')
747
748                 video_url = mediaURL
749
750                 mobj = re.search(r'<title>(.*)</title>', webpage)
751                 if mobj is None:
752                         self._downloader.trouble(u'ERROR: unable to extract title')
753                         return
754                 video_title = mobj.group(1).decode('utf-8')
755
756                 # Extract video description
757                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
758                 if mobj is None:
759                         self._downloader.trouble(u'ERROR: unable to extract video description')
760                         return
761                 video_description = mobj.group(1).decode('utf-8')
762                 if not video_description:
763                         video_description = 'No description available.'
764
765                 # Extract video thumbnail
766                 if self._downloader.params.get('forcethumbnail', False):
767                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
768                         try:
769                                 webpage = urllib2.urlopen(request).read()
770                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
771                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
772                                 return
773                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
774                         if mobj is None:
775                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
776                                 return
777                         video_thumbnail = mobj.group(1)
778                 else:   # we need something to pass to process_info
779                         video_thumbnail = ''
780
781                 return [{
782                         'id':           video_id.decode('utf-8'),
783                         'url':          video_url.decode('utf-8'),
784                         'uploader':     u'NA',
785                         'upload_date':  u'NA',
786                         'title':        video_title,
787                         'ext':          video_extension.decode('utf-8'),
788                         'format':       u'NA',
789                         'player_url':   None,
790                 }]
791
792
793 class PhotobucketIE(InfoExtractor):
794         """Information extractor for photobucket.com."""
795
796         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
797         IE_NAME = u'photobucket'
798
799         def __init__(self, downloader=None):
800                 InfoExtractor.__init__(self, downloader)
801
802         def report_download_webpage(self, video_id):
803                 """Report webpage download."""
804                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
805
806         def report_extraction(self, video_id):
807                 """Report information extraction."""
808                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
809
810         def _real_extract(self, url):
811                 # Extract id from URL
812                 mobj = re.match(self._VALID_URL, url)
813                 if mobj is None:
814                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
815                         return
816
817                 video_id = mobj.group(1)
818
819                 video_extension = 'flv'
820
821                 # Retrieve video webpage to extract further information
822                 request = urllib2.Request(url)
823                 try:
824                         self.report_download_webpage(video_id)
825                         webpage = urllib2.urlopen(request).read()
826                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
827                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
828                         return
829
830                 # Extract URL, uploader, and title from webpage
831                 self.report_extraction(video_id)
832                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
833                 if mobj is None:
834                         self._downloader.trouble(u'ERROR: unable to extract media URL')
835                         return
836                 mediaURL = urllib.unquote(mobj.group(1))
837
838                 video_url = mediaURL
839
840                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
841                 if mobj is None:
842                         self._downloader.trouble(u'ERROR: unable to extract title')
843                         return
844                 video_title = mobj.group(1).decode('utf-8')
845
846                 video_uploader = mobj.group(2).decode('utf-8')
847
848                 return [{
849                         'id':           video_id.decode('utf-8'),
850                         'url':          video_url.decode('utf-8'),
851                         'uploader':     video_uploader,
852                         'upload_date':  u'NA',
853                         'title':        video_title,
854                         'ext':          video_extension.decode('utf-8'),
855                         'format':       u'NA',
856                         'player_url':   None,
857                 }]
858
859
860 class YahooIE(InfoExtractor):
861         """Information extractor for video.yahoo.com."""
862
863         # _VALID_URL matches all Yahoo! Video URLs
864         # _VPAGE_URL matches only the extractable '/watch/' URLs
865         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
866         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
867         IE_NAME = u'video.yahoo'
868
869         def __init__(self, downloader=None):
870                 InfoExtractor.__init__(self, downloader)
871
872         def report_download_webpage(self, video_id):
873                 """Report webpage download."""
874                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
875
876         def report_extraction(self, video_id):
877                 """Report information extraction."""
878                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
879
880         def _real_extract(self, url, new_video=True):
881                 # Extract ID from URL
882                 mobj = re.match(self._VALID_URL, url)
883                 if mobj is None:
884                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
885                         return
886
887                 video_id = mobj.group(2)
888                 video_extension = 'flv'
889
890                 # Rewrite valid but non-extractable URLs as
891                 # extractable English language /watch/ URLs
892                 if re.match(self._VPAGE_URL, url) is None:
893                         request = urllib2.Request(url)
894                         try:
895                                 webpage = urllib2.urlopen(request).read()
896                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
897                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
898                                 return
899
900                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
901                         if mobj is None:
902                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
903                                 return
904                         yahoo_id = mobj.group(1)
905
906                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
907                         if mobj is None:
908                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
909                                 return
910                         yahoo_vid = mobj.group(1)
911
912                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
913                         return self._real_extract(url, new_video=False)
914
915                 # Retrieve video webpage to extract further information
916                 request = urllib2.Request(url)
917                 try:
918                         self.report_download_webpage(video_id)
919                         webpage = urllib2.urlopen(request).read()
920                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
921                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
922                         return
923
924                 # Extract uploader and title from webpage
925                 self.report_extraction(video_id)
926                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
927                 if mobj is None:
928                         self._downloader.trouble(u'ERROR: unable to extract video title')
929                         return
930                 video_title = mobj.group(1).decode('utf-8')
931
932                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
933                 if mobj is None:
934                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
935                         return
936                 video_uploader = mobj.group(1).decode('utf-8')
937
938                 # Extract video thumbnail
939                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
940                 if mobj is None:
941                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
942                         return
943                 video_thumbnail = mobj.group(1).decode('utf-8')
944
945                 # Extract video description
946                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
947                 if mobj is None:
948                         self._downloader.trouble(u'ERROR: unable to extract video description')
949                         return
950                 video_description = mobj.group(1).decode('utf-8')
951                 if not video_description:
952                         video_description = 'No description available.'
953
954                 # Extract video height and width
955                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
956                 if mobj is None:
957                         self._downloader.trouble(u'ERROR: unable to extract video height')
958                         return
959                 yv_video_height = mobj.group(1)
960
961                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
962                 if mobj is None:
963                         self._downloader.trouble(u'ERROR: unable to extract video width')
964                         return
965                 yv_video_width = mobj.group(1)
966
967                 # Retrieve video playlist to extract media URL
968                 # I'm not completely sure what all these options are, but we
969                 # seem to need most of them, otherwise the server sends a 401.
970                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
971                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
972                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
973                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
974                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
975                 try:
976                         self.report_download_webpage(video_id)
977                         webpage = urllib2.urlopen(request).read()
978                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
979                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
980                         return
981
982                 # Extract media URL from playlist XML
983                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
984                 if mobj is None:
985                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
986                         return
987                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
988                 video_url = unescapeHTML(video_url)
989
990                 return [{
991                         'id':           video_id.decode('utf-8'),
992                         'url':          video_url,
993                         'uploader':     video_uploader,
994                         'upload_date':  u'NA',
995                         'title':        video_title,
996                         'ext':          video_extension.decode('utf-8'),
997                         'thumbnail':    video_thumbnail.decode('utf-8'),
998                         'description':  video_description,
999                         'thumbnail':    video_thumbnail,
1000                         'player_url':   None,
1001                 }]
1002
1003
1004 class VimeoIE(InfoExtractor):
1005         """Information extractor for vimeo.com."""
1006
1007         # _VALID_URL matches Vimeo URLs
1008         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1009         IE_NAME = u'vimeo'
1010
1011         def __init__(self, downloader=None):
1012                 InfoExtractor.__init__(self, downloader)
1013
1014         def report_download_webpage(self, video_id):
1015                 """Report webpage download."""
1016                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1017
1018         def report_extraction(self, video_id):
1019                 """Report information extraction."""
1020                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1021
1022         def _real_extract(self, url, new_video=True):
1023                 # Extract ID from URL
1024                 mobj = re.match(self._VALID_URL, url)
1025                 if mobj is None:
1026                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1027                         return
1028
1029                 video_id = mobj.group(1)
1030
1031                 # Retrieve video webpage to extract further information
1032                 request = urllib2.Request(url, None, std_headers)
1033                 try:
1034                         self.report_download_webpage(video_id)
1035                         webpage = urllib2.urlopen(request).read()
1036                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1037                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1038                         return
1039
1040                 # Now we begin extracting as much information as we can from what we
1041                 # retrieved. First we extract the information common to all extractors,
1042                 # and latter we extract those that are Vimeo specific.
1043                 self.report_extraction(video_id)
1044
1045                 # Extract the config JSON
1046                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1047                 try:
1048                         config = json.loads(config)
1049                 except:
1050                         self._downloader.trouble(u'ERROR: unable to extract info section')
1051                         return
1052                 
1053                 # Extract title
1054                 video_title = config["video"]["title"]
1055
1056                 # Extract uploader
1057                 video_uploader = config["video"]["owner"]["name"]
1058
1059                 # Extract video thumbnail
1060                 video_thumbnail = config["video"]["thumbnail"]
1061
1062                 # Extract video description
1063                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1064                 if video_description: video_description = clean_html(video_description)
1065                 else: video_description = ''
1066
1067                 # Extract upload date
1068                 video_upload_date = u'NA'
1069                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1070                 if mobj is not None:
1071                         video_upload_date = mobj.group(1)
1072
1073                 # Vimeo specific: extract request signature and timestamp
1074                 sig = config['request']['signature']
1075                 timestamp = config['request']['timestamp']
1076
1077                 # Vimeo specific: extract video codec and quality information
1078                 # TODO bind to format param
1079                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1080                 for codec in codecs:
1081                         if codec[0] in config["video"]["files"]:
1082                                 video_codec = codec[0]
1083                                 video_extension = codec[1]
1084                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1085                                 else: quality = 'sd'
1086                                 break
1087                 else:
1088                         self._downloader.trouble(u'ERROR: no known codec found')
1089                         return
1090
1091                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1092                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1093
1094                 return [{
1095                         'id':           video_id,
1096                         'url':          video_url,
1097                         'uploader':     video_uploader,
1098                         'upload_date':  video_upload_date,
1099                         'title':        video_title,
1100                         'ext':          video_extension,
1101                         'thumbnail':    video_thumbnail,
1102                         'description':  video_description,
1103                         'player_url':   None,
1104                 }]
1105
1106
1107 class GenericIE(InfoExtractor):
1108         """Generic last-resort information extractor."""
1109
1110         _VALID_URL = r'.*'
1111         IE_NAME = u'generic'
1112
	def __init__(self, downloader=None):
		# Delegate to the base class; GenericIE adds no state of its own.
		InfoExtractor.__init__(self, downloader)
1115
	def report_download_webpage(self, video_id):
		"""Report webpage download, warning that this is the last-resort extractor."""
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1120
	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1124
	def report_following_redirect(self, new_url):
		"""Report that a redirect to new_url is being followed."""
		self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1128                 
	def _test_redirect(self, url):
		"""Check if it is a redirect, like url shorteners, in case restart chain.

		Issues a HEAD request so the final URL can be discovered without
		downloading the body.  Returns True (and re-dispatches the new
		URL through the downloader) when the server redirects elsewhere,
		False when the URL resolves to itself.
		"""
		class HeadRequest(urllib2.Request):
			# urllib2.Request subclass that performs HEAD instead of GET.
			def get_method(self):
				return "HEAD"

		class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
			"""
			Subclass the HTTPRedirectHandler to make it use our 
			HeadRequest also on the redirected URL
			"""
			def redirect_request(self, req, fp, code, msg, headers, newurl): 
				if code in (301, 302, 303, 307):
					newurl = newurl.replace(' ', '%20') 
					# Drop body-related headers: a HEAD request has no body.
					newheaders = dict((k,v) for k,v in req.headers.items()
									  if k.lower() not in ("content-length", "content-type"))
					return HeadRequest(newurl, 
									   headers=newheaders,
									   origin_req_host=req.get_origin_req_host(), 
									   unverifiable=True) 
				else: 
					raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 

		class HTTPMethodFallback(urllib2.BaseHandler):
			"""
			Fallback to GET if HEAD is not allowed (405 HTTP error)
			"""
			def http_error_405(self, req, fp, code, msg, headers): 
				# Consume and close the failed response before retrying.
				fp.read()
				fp.close()

				newheaders = dict((k,v) for k,v in req.headers.items()
								  if k.lower() not in ("content-length", "content-type"))
				return self.parent.open(urllib2.Request(req.get_full_url(), 
												 headers=newheaders, 
												 origin_req_host=req.get_origin_req_host(), 
												 unverifiable=True))

		# Build our opener
		# (a minimal OpenerDirector so our custom handlers are the only
		# redirect/error machinery involved)
		opener = urllib2.OpenerDirector() 
		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
						HTTPMethodFallback, HEADRedirectHandler,
						urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
			opener.add_handler(handler())

		response = opener.open(HeadRequest(url))
		new_url = response.geturl()
		
		if url == new_url: return False
		
		self.report_following_redirect(new_url)
		self._downloader.download([new_url])
		return True
1182
1183         def _real_extract(self, url):
1184                 if self._test_redirect(url): return
1185
1186                 video_id = url.split('/')[-1]
1187                 request = urllib2.Request(url)
1188                 try:
1189                         self.report_download_webpage(video_id)
1190                         webpage = urllib2.urlopen(request).read()
1191                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1192                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1193                         return
1194                 except ValueError, err:
1195                         # since this is the last-resort InfoExtractor, if
1196                         # this error is thrown, it'll be thrown here
1197                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1198                         return
1199
1200                 self.report_extraction(video_id)
1201                 # Start with something easy: JW Player in SWFObject
1202                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1203                 if mobj is None:
1204                         # Broaden the search a little bit
1205                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1206                 if mobj is None:
1207                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1208                         return
1209
1210                 # It's possible that one of the regexes
1211                 # matched, but returned an empty group:
1212                 if mobj.group(1) is None:
1213                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1214                         return
1215
1216                 video_url = urllib.unquote(mobj.group(1))
1217                 video_id = os.path.basename(video_url)
1218
1219                 # here's a fun little line of code for you:
1220                 video_extension = os.path.splitext(video_id)[1][1:]
1221                 video_id = os.path.splitext(video_id)[0]
1222
1223                 # it's tempting to parse this further, but you would
1224                 # have to take into account all the variations like
1225                 #   Video Title - Site Name
1226                 #   Site Name | Video Title
1227                 #   Video Title - Tagline | Site Name
1228                 # and so on and so forth; it's just not practical
1229                 mobj = re.search(r'<title>(.*)</title>', webpage)
1230                 if mobj is None:
1231                         self._downloader.trouble(u'ERROR: unable to extract title')
1232                         return
1233                 video_title = mobj.group(1).decode('utf-8')
1234
1235                 # video uploader is domain name
1236                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1237                 if mobj is None:
1238                         self._downloader.trouble(u'ERROR: unable to extract title')
1239                         return
1240                 video_uploader = mobj.group(1).decode('utf-8')
1241
1242                 return [{
1243                         'id':           video_id.decode('utf-8'),
1244                         'url':          video_url.decode('utf-8'),
1245                         'uploader':     video_uploader,
1246                         'upload_date':  u'NA',
1247                         'title':        video_title,
1248                         'ext':          video_extension.decode('utf-8'),
1249                         'format':       u'NA',
1250                         'player_url':   None,
1251                 }]
1252
1253
1254 class YoutubeSearchIE(InfoExtractor):
1255         """Information Extractor for YouTube search queries."""
1256         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1257         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1258         _max_youtube_results = 1000
1259         IE_NAME = u'youtube:search'
1260
1261         def __init__(self, downloader=None):
1262                 InfoExtractor.__init__(self, downloader)
1263
1264         def report_download_page(self, query, pagenum):
1265                 """Report attempt to download search page with given number."""
1266                 query = query.decode(preferredencoding())
1267                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1268
1269         def _real_extract(self, query):
1270                 mobj = re.match(self._VALID_URL, query)
1271                 if mobj is None:
1272                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1273                         return
1274
1275                 prefix, query = query.split(':')
1276                 prefix = prefix[8:]
1277                 query = query.encode('utf-8')
1278                 if prefix == '':
1279                         self._download_n_results(query, 1)
1280                         return
1281                 elif prefix == 'all':
1282                         self._download_n_results(query, self._max_youtube_results)
1283                         return
1284                 else:
1285                         try:
1286                                 n = long(prefix)
1287                                 if n <= 0:
1288                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1289                                         return
1290                                 elif n > self._max_youtube_results:
1291                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1292                                         n = self._max_youtube_results
1293                                 self._download_n_results(query, n)
1294                                 return
1295                         except ValueError: # parsing prefix as integer fails
1296                                 self._download_n_results(query, 1)
1297                                 return
1298
1299         def _download_n_results(self, query, n):
1300                 """Downloads a specified number of results for a query"""
1301
1302                 video_ids = []
1303                 pagenum = 0
1304                 limit = n
1305
1306                 while (50 * pagenum) < limit:
1307                         self.report_download_page(query, pagenum+1)
1308                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1309                         request = urllib2.Request(result_url)
1310                         try:
1311                                 data = urllib2.urlopen(request).read()
1312                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1313                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1314                                 return
1315                         api_response = json.loads(data)['data']
1316
1317                         new_ids = list(video['id'] for video in api_response['items'])
1318                         video_ids += new_ids
1319
1320                         limit = min(n, api_response['totalItems'])
1321                         pagenum += 1
1322
1323                 if len(video_ids) > n:
1324                         video_ids = video_ids[:n]
1325                 for id in video_ids:
1326                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1327                 return
1328
1329
1330 class GoogleSearchIE(InfoExtractor):
1331         """Information Extractor for Google Video search queries."""
1332         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1333         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1334         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1335         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1336         _max_google_results = 1000
1337         IE_NAME = u'video.google:search'
1338
1339         def __init__(self, downloader=None):
1340                 InfoExtractor.__init__(self, downloader)
1341
1342         def report_download_page(self, query, pagenum):
1343                 """Report attempt to download playlist page with given number."""
1344                 query = query.decode(preferredencoding())
1345                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1346
1347         def _real_extract(self, query):
1348                 mobj = re.match(self._VALID_URL, query)
1349                 if mobj is None:
1350                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1351                         return
1352
1353                 prefix, query = query.split(':')
1354                 prefix = prefix[8:]
1355                 query = query.encode('utf-8')
1356                 if prefix == '':
1357                         self._download_n_results(query, 1)
1358                         return
1359                 elif prefix == 'all':
1360                         self._download_n_results(query, self._max_google_results)
1361                         return
1362                 else:
1363                         try:
1364                                 n = long(prefix)
1365                                 if n <= 0:
1366                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1367                                         return
1368                                 elif n > self._max_google_results:
1369                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1370                                         n = self._max_google_results
1371                                 self._download_n_results(query, n)
1372                                 return
1373                         except ValueError: # parsing prefix as integer fails
1374                                 self._download_n_results(query, 1)
1375                                 return
1376
1377         def _download_n_results(self, query, n):
1378                 """Downloads a specified number of results for a query"""
1379
1380                 video_ids = []
1381                 pagenum = 0
1382
1383                 while True:
1384                         self.report_download_page(query, pagenum)
1385                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1386                         request = urllib2.Request(result_url)
1387                         try:
1388                                 page = urllib2.urlopen(request).read()
1389                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1390                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1391                                 return
1392
1393                         # Extract video identifiers
1394                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1395                                 video_id = mobj.group(1)
1396                                 if video_id not in video_ids:
1397                                         video_ids.append(video_id)
1398                                         if len(video_ids) == n:
1399                                                 # Specified n videos reached
1400                                                 for id in video_ids:
1401                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1402                                                 return
1403
1404                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1405                                 for id in video_ids:
1406                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1407                                 return
1408
1409                         pagenum = pagenum + 1
1410
1411
1412 class YahooSearchIE(InfoExtractor):
1413         """Information Extractor for Yahoo! Video search queries."""
1414         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1415         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1416         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1417         _MORE_PAGES_INDICATOR = r'\s*Next'
1418         _max_yahoo_results = 1000
1419         IE_NAME = u'video.yahoo:search'
1420
1421         def __init__(self, downloader=None):
1422                 InfoExtractor.__init__(self, downloader)
1423
1424         def report_download_page(self, query, pagenum):
1425                 """Report attempt to download playlist page with given number."""
1426                 query = query.decode(preferredencoding())
1427                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1428
1429         def _real_extract(self, query):
1430                 mobj = re.match(self._VALID_URL, query)
1431                 if mobj is None:
1432                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1433                         return
1434
1435                 prefix, query = query.split(':')
1436                 prefix = prefix[8:]
1437                 query = query.encode('utf-8')
1438                 if prefix == '':
1439                         self._download_n_results(query, 1)
1440                         return
1441                 elif prefix == 'all':
1442                         self._download_n_results(query, self._max_yahoo_results)
1443                         return
1444                 else:
1445                         try:
1446                                 n = long(prefix)
1447                                 if n <= 0:
1448                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1449                                         return
1450                                 elif n > self._max_yahoo_results:
1451                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1452                                         n = self._max_yahoo_results
1453                                 self._download_n_results(query, n)
1454                                 return
1455                         except ValueError: # parsing prefix as integer fails
1456                                 self._download_n_results(query, 1)
1457                                 return
1458
1459         def _download_n_results(self, query, n):
1460                 """Downloads a specified number of results for a query"""
1461
1462                 video_ids = []
1463                 already_seen = set()
1464                 pagenum = 1
1465
1466                 while True:
1467                         self.report_download_page(query, pagenum)
1468                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1469                         request = urllib2.Request(result_url)
1470                         try:
1471                                 page = urllib2.urlopen(request).read()
1472                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1473                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1474                                 return
1475
1476                         # Extract video identifiers
1477                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1478                                 video_id = mobj.group(1)
1479                                 if video_id not in already_seen:
1480                                         video_ids.append(video_id)
1481                                         already_seen.add(video_id)
1482                                         if len(video_ids) == n:
1483                                                 # Specified n videos reached
1484                                                 for id in video_ids:
1485                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1486                                                 return
1487
1488                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1489                                 for id in video_ids:
1490                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1491                                 return
1492
1493                         pagenum = pagenum + 1
1494
1495
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	# group(1): list-type query parameter ('p', 'a' or 'list'),
	# group(2): the playlist id (optional 'PL' prefix stripped),
	# group(3): an optional single-video id embedded in the URL.
	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	# Filled in as (access_type, prefix, playlist_id, page_number).
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=.*?%s'
	# Presence of this marker in a page means more pages follow.
	_MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
	IE_NAME = u'youtube:playlist'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_extract(self, url):
		"""Collect all video ids from the playlist pages and queue them."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case
		if mobj.group(3) is not None:
			self._downloader.download([mobj.group(3)])
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		else:
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)
		video_ids = []
		pagenum = 1

		# Fetch page after page until the "more pages" marker disappears.
		while True:
			self.report_download_page(playlist_id, pagenum)
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			request = urllib2.Request(url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated per page)
			ids_in_page = []
			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum = pagenum + 1

		# Honour --playlist-start / --playlist-end (1-based; -1 = no end).
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
		return
1567
1568
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users."""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# Maximum result count per GData query.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	IE_NAME = u'youtube:user'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_extract(self, url):
		"""Queue every upload of a YouTube user for download."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated per page)
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		# Honour --playlist-start / --playlist-end (1-based; -1 = no end).
		all_ids_count = len(video_ids)
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1651
1652 class BlipTVUserIE(InfoExtractor):
1653         """Information Extractor for blip.tv users."""
1654
1655         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1656         _PAGE_SIZE = 12
1657         IE_NAME = u'blip.tv:user'
1658
1659         def __init__(self, downloader=None):
1660                 InfoExtractor.__init__(self, downloader)
1661
1662         def report_download_page(self, username, pagenum):
1663                 """Report attempt to download user page."""
1664                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1665                                 (self.IE_NAME, username, pagenum))
1666
1667         def _real_extract(self, url):
1668                 # Extract username
1669                 mobj = re.match(self._VALID_URL, url)
1670                 if mobj is None:
1671                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1672                         return
1673
1674                 username = mobj.group(1)
1675
1676                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1677
1678                 request = urllib2.Request(url)
1679
1680                 try:
1681                         page = urllib2.urlopen(request).read().decode('utf-8')
1682                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1683                         page_base = page_base % mobj.group(1)
1684                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1685                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1686                         return
1687
1688
1689                 # Download video ids using BlipTV Ajax calls. Result size per
1690                 # query is limited (currently to 12 videos) so we need to query
1691                 # page by page until there are no video ids - it means we got
1692                 # all of them.
1693
1694                 video_ids = []
1695                 pagenum = 1
1696
1697                 while True:
1698                         self.report_download_page(username, pagenum)
1699
1700                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1701
1702                         try:
1703                                 page = urllib2.urlopen(request).read().decode('utf-8')
1704                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1705                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1706                                 return
1707
1708                         # Extract video identifiers
1709                         ids_in_page = []
1710
1711                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1712                                 if mobj.group(1) not in ids_in_page:
1713                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1714
1715                         video_ids.extend(ids_in_page)
1716
1717                         # A little optimization - if current page is not
1718                         # "full", ie. does not contain PAGE_SIZE video ids then
1719                         # we can assume that this page is the last one - there
1720                         # are no more ids on further pages - no need to query
1721                         # again.
1722
1723                         if len(ids_in_page) < self._PAGE_SIZE:
1724                                 break
1725
1726                         pagenum += 1
1727
1728                 all_ids_count = len(video_ids)
1729                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1730                 playlistend = self._downloader.params.get('playlistend', -1)
1731
1732                 if playlistend == -1:
1733                         video_ids = video_ids[playliststart:]
1734                 else:
1735                         video_ids = video_ids[playliststart:playlistend]
1736
1737                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1738                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1739
1740                 for video_id in video_ids:
1741                         self._downloader.download([u'http://blip.tv/'+video_id])
1742
1743
1744 class DepositFilesIE(InfoExtractor):
1745         """Information extractor for depositfiles.com"""
1746
1747         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1748         IE_NAME = u'DepositFiles'
1749
1750         def __init__(self, downloader=None):
1751                 InfoExtractor.__init__(self, downloader)
1752
1753         def report_download_webpage(self, file_id):
1754                 """Report webpage download."""
1755                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1756
1757         def report_extraction(self, file_id):
1758                 """Report information extraction."""
1759                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1760
1761         def _real_extract(self, url):
1762                 file_id = url.split('/')[-1]
1763                 # Rebuild url in english locale
1764                 url = 'http://depositfiles.com/en/files/' + file_id
1765
1766                 # Retrieve file webpage with 'Free download' button pressed
1767                 free_download_indication = { 'gateway_result' : '1' }
1768                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1769                 try:
1770                         self.report_download_webpage(file_id)
1771                         webpage = urllib2.urlopen(request).read()
1772                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1773                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1774                         return
1775
1776                 # Search for the real file URL
1777                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1778                 if (mobj is None) or (mobj.group(1) is None):
1779                         # Try to figure out reason of the error.
1780                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1781                         if (mobj is not None) and (mobj.group(1) is not None):
1782                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1783                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1784                         else:
1785                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1786                         return
1787
1788                 file_url = mobj.group(1)
1789                 file_extension = os.path.splitext(file_url)[1][1:]
1790
1791                 # Search for file title
1792                 mobj = re.search(r'<b title="(.*?)">', webpage)
1793                 if mobj is None:
1794                         self._downloader.trouble(u'ERROR: unable to extract title')
1795                         return
1796                 file_title = mobj.group(1).decode('utf-8')
1797
1798                 return [{
1799                         'id':           file_id.decode('utf-8'),
1800                         'url':          file_url.decode('utf-8'),
1801                         'uploader':     u'NA',
1802                         'upload_date':  u'NA',
1803                         'title':        file_title,
1804                         'ext':          file_extension.decode('utf-8'),
1805                         'format':       u'NA',
1806                         'player_url':   None,
1807                 }]
1808
1809
1810 class FacebookIE(InfoExtractor):
1811         """Information Extractor for Facebook"""
1812
1813         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1814         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1815         _NETRC_MACHINE = 'facebook'
1816         _available_formats = ['video', 'highqual', 'lowqual']
1817         _video_extensions = {
1818                 'video': 'mp4',
1819                 'highqual': 'mp4',
1820                 'lowqual': 'mp4',
1821         }
1822         IE_NAME = u'facebook'
1823
1824         def __init__(self, downloader=None):
1825                 InfoExtractor.__init__(self, downloader)
1826
1827         def _reporter(self, message):
1828                 """Add header and report message."""
1829                 self._downloader.to_screen(u'[facebook] %s' % message)
1830
1831         def report_login(self):
1832                 """Report attempt to log in."""
1833                 self._reporter(u'Logging in')
1834
1835         def report_video_webpage_download(self, video_id):
1836                 """Report attempt to download video webpage."""
1837                 self._reporter(u'%s: Downloading video webpage' % video_id)
1838
1839         def report_information_extraction(self, video_id):
1840                 """Report attempt to extract video information."""
1841                 self._reporter(u'%s: Extracting video information' % video_id)
1842
1843         def _parse_page(self, video_webpage):
1844                 """Extract video information from page"""
1845                 # General data
1846                 data = {'title': r'\("video_title", "(.*?)"\)',
1847                         'description': r'<div class="datawrap">(.*?)</div>',
1848                         'owner': r'\("video_owner_name", "(.*?)"\)',
1849                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1850                         }
1851                 video_info = {}
1852                 for piece in data.keys():
1853                         mobj = re.search(data[piece], video_webpage)
1854                         if mobj is not None:
1855                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1856
1857                 # Video urls
1858                 video_urls = {}
1859                 for fmt in self._available_formats:
1860                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1861                         if mobj is not None:
1862                                 # URL is in a Javascript segment inside an escaped Unicode format within
1863                                 # the generally utf-8 page
1864                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1865                 video_info['video_urls'] = video_urls
1866
1867                 return video_info
1868
1869         def _real_initialize(self):
1870                 if self._downloader is None:
1871                         return
1872
1873                 useremail = None
1874                 password = None
1875                 downloader_params = self._downloader.params
1876
1877                 # Attempt to use provided username and password or .netrc data
1878                 if downloader_params.get('username', None) is not None:
1879                         useremail = downloader_params['username']
1880                         password = downloader_params['password']
1881                 elif downloader_params.get('usenetrc', False):
1882                         try:
1883                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1884                                 if info is not None:
1885                                         useremail = info[0]
1886                                         password = info[2]
1887                                 else:
1888                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1889                         except (IOError, netrc.NetrcParseError), err:
1890                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1891                                 return
1892
1893                 if useremail is None:
1894                         return
1895
1896                 # Log in
1897                 login_form = {
1898                         'email': useremail,
1899                         'pass': password,
1900                         'login': 'Log+In'
1901                         }
1902                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1903                 try:
1904                         self.report_login()
1905                         login_results = urllib2.urlopen(request).read()
1906                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1907                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1908                                 return
1909                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1910                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1911                         return
1912
1913         def _real_extract(self, url):
1914                 mobj = re.match(self._VALID_URL, url)
1915                 if mobj is None:
1916                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1917                         return
1918                 video_id = mobj.group('ID')
1919
1920                 # Get video webpage
1921                 self.report_video_webpage_download(video_id)
1922                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1923                 try:
1924                         page = urllib2.urlopen(request)
1925                         video_webpage = page.read()
1926                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1927                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1928                         return
1929
1930                 # Start extracting information
1931                 self.report_information_extraction(video_id)
1932
1933                 # Extract information
1934                 video_info = self._parse_page(video_webpage)
1935
1936                 # uploader
1937                 if 'owner' not in video_info:
1938                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1939                         return
1940                 video_uploader = video_info['owner']
1941
1942                 # title
1943                 if 'title' not in video_info:
1944                         self._downloader.trouble(u'ERROR: unable to extract video title')
1945                         return
1946                 video_title = video_info['title']
1947                 video_title = video_title.decode('utf-8')
1948
1949                 # thumbnail image
1950                 if 'thumbnail' not in video_info:
1951                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1952                         video_thumbnail = ''
1953                 else:
1954                         video_thumbnail = video_info['thumbnail']
1955
1956                 # upload date
1957                 upload_date = u'NA'
1958                 if 'upload_date' in video_info:
1959                         upload_time = video_info['upload_date']
1960                         timetuple = email.utils.parsedate_tz(upload_time)
1961                         if timetuple is not None:
1962                                 try:
1963                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1964                                 except:
1965                                         pass
1966
1967                 # description
1968                 video_description = video_info.get('description', 'No description available.')
1969
1970                 url_map = video_info['video_urls']
1971                 if len(url_map.keys()) > 0:
1972                         # Decide which formats to download
1973                         req_format = self._downloader.params.get('format', None)
1974                         format_limit = self._downloader.params.get('format_limit', None)
1975
1976                         if format_limit is not None and format_limit in self._available_formats:
1977                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1978                         else:
1979                                 format_list = self._available_formats
1980                         existing_formats = [x for x in format_list if x in url_map]
1981                         if len(existing_formats) == 0:
1982                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1983                                 return
1984                         if req_format is None:
1985                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1986                         elif req_format == 'worst':
1987                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1988                         elif req_format == '-1':
1989                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1990                         else:
1991                                 # Specific format
1992                                 if req_format not in url_map:
1993                                         self._downloader.trouble(u'ERROR: requested format not available')
1994                                         return
1995                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1996
1997                 results = []
1998                 for format_param, video_real_url in video_url_list:
1999                         # Extension
2000                         video_extension = self._video_extensions.get(format_param, 'mp4')
2001
2002                         results.append({
2003                                 'id':           video_id.decode('utf-8'),
2004                                 'url':          video_real_url.decode('utf-8'),
2005                                 'uploader':     video_uploader.decode('utf-8'),
2006                                 'upload_date':  upload_date,
2007                                 'title':        video_title,
2008                                 'ext':          video_extension.decode('utf-8'),
2009                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2010                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2011                                 'description':  video_description.decode('utf-8'),
2012                                 'player_url':   None,
2013                         })
2014                 return results
2015
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Any blip.tv path is accepted; group(1) captures the path portion
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures the filename extension at the end of a media URL
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Return a one-element list with the info dict for *url*.

		Requests the page with skin=json appended so blip.tv answers with
		JSON metadata; if the server instead responds with a video/*
		Content-Type, the URL is treated as a direct media download.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Append the JSON-skin parameters with the appropriate separator
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url.encode('utf-8'))
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# Derive id/title/extension from the URL's basename; keep
				# the open handle so the downloader can reuse the response
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'ext': ext,
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# Responses may or may not be wrapped in a 'Post' object
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		# NOTE(review): global User-Agent override, presumably required so
		# the media servers serve the actual file — confirm before moving.
		std_headers['User-Agent'] = 'iTunes/10.6.1'
		return [info]
2102
2103
2104 class MyVideoIE(InfoExtractor):
2105         """Information Extractor for myvideo.de."""
2106
2107         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2108         IE_NAME = u'myvideo'
2109
2110         def __init__(self, downloader=None):
2111                 InfoExtractor.__init__(self, downloader)
2112         
2113         def report_download_webpage(self, video_id):
2114                 """Report webpage download."""
2115                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2116
2117         def report_extraction(self, video_id):
2118                 """Report information extraction."""
2119                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2120
2121         def _real_extract(self,url):
2122                 mobj = re.match(self._VALID_URL, url)
2123                 if mobj is None:
2124                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2125                         return
2126
2127                 video_id = mobj.group(1)
2128
2129                 # Get video webpage
2130                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2131                 try:
2132                         self.report_download_webpage(video_id)
2133                         webpage = urllib2.urlopen(request).read()
2134                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2135                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2136                         return
2137
2138                 self.report_extraction(video_id)
2139                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2140                                  webpage)
2141                 if mobj is None:
2142                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2143                         return
2144                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2145
2146                 mobj = re.search('<title>([^<]+)</title>', webpage)
2147                 if mobj is None:
2148                         self._downloader.trouble(u'ERROR: unable to extract title')
2149                         return
2150
2151                 video_title = mobj.group(1)
2152
2153                 return [{
2154                         'id':           video_id,
2155                         'url':          video_url,
2156                         'uploader':     u'NA',
2157                         'upload_date':  u'NA',
2158                         'title':        video_title,
2159                         'ext':          u'flv',
2160                         'format':       u'NA',
2161                         'player_url':   None,
2162                 }]
2163
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ":shortname" alias or a full-episodes page URL
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of a part's media configuration."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's part index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Extract one info dict per media part of the requested episode.

		Shortname URLs (":tds", ":colbert", ...) are mapped to the show's
		full-episodes page, which redirects to the newest episode. The
		episode page yields an mtvnservices URI, used to fetch an mrss
		index of the episode's parts; each part's configuration XML lists
		the available renditions.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('shortname'):
			# Map shortname aliases to the corresponding full-episodes page
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty episode group means "download the newest episode"
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# The full-episodes page redirected to a concrete episode;
			# re-parse the final URL to obtain its title
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# Find the Flash player URL (and the embedded media URI) in the page
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to obtain the canonical player URL
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		results = []

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# The guid is colon-separated: the last field is the media id,
			# the second-to-last the show's domain (".com" stripped below)
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for every available rendition
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			results.append(info)

		return results
2295
2296
2297 class EscapistIE(InfoExtractor):
2298         """Information extractor for The Escapist """
2299
2300         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2301         IE_NAME = u'escapist'
2302
2303         def report_extraction(self, showName):
2304                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2305
2306         def report_config_download(self, showName):
2307                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2308
2309         def _real_extract(self, url):
2310                 mobj = re.match(self._VALID_URL, url)
2311                 if mobj is None:
2312                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2313                         return
2314                 showName = mobj.group('showname')
2315                 videoId = mobj.group('episode')
2316
2317                 self.report_extraction(showName)
2318                 try:
2319                         webPage = urllib2.urlopen(url)
2320                         webPageBytes = webPage.read()
2321                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2322                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2323                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2324                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2325                         return
2326
2327                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2328                 description = unescapeHTML(descMatch.group(1))
2329                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2330                 imgUrl = unescapeHTML(imgMatch.group(1))
2331                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2332                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2333                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2334                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2335
2336                 self.report_config_download(showName)
2337                 try:
2338                         configJSON = urllib2.urlopen(configUrl).read()
2339                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2340                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2341                         return
2342
2343                 # Technically, it's JavaScript, not JSON
2344                 configJSON = configJSON.replace("'", '"')
2345
2346                 try:
2347                         config = json.loads(configJSON)
2348                 except (ValueError,), err:
2349                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2350                         return
2351
2352                 playlist = config['playlist']
2353                 videoUrl = playlist[1]['url']
2354
2355                 info = {
2356                         'id': videoId,
2357                         'url': videoUrl,
2358                         'uploader': showName,
2359                         'upload_date': None,
2360                         'title': showName,
2361                         'ext': 'flv',
2362                         'format': 'flv',
2363                         'thumbnail': imgUrl,
2364                         'description': description,
2365                         'player_url': playerUrl,
2366                 }
2367
2368                 return [info]
2369
2370
class CollegeHumorIE(InfoExtractor):
	"""Information extractor for collegehumor.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
	IE_NAME = u'collegehumor'

	def report_webpage(self, video_id):
		"""Announce that the video webpage is being downloaded."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Announce that metadata extraction has started."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Resolve a collegehumor.com video page to its media metadata."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('videoid')

		self.report_webpage(video_id)
		try:
			webpage = urllib2.urlopen(urllib2.Request(url)).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# The page embeds an internal numeric id used by the metadata endpoint.
		id_match = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
		if id_match is None:
			self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
			return
		internal_video_id = id_match.group('internalvideoid')

		info = {
			'id': video_id,
			'internal_id': internal_video_id,
		}

		self.report_extraction(video_id)
		xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
		try:
			metaXml = urllib2.urlopen(xmlUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
			return

		mdoc = xml.etree.ElementTree.fromstring(metaXml)
		try:
			videoNode = mdoc.findall('./video')[0]
			# Pull each field from the metadata document in turn.
			for field, tag in (('description', './description'),
					('title', './caption'),
					('url', './file'),
					('thumbnail', './thumbnail')):
				info[field] = videoNode.findall(tag)[0].text
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
		except IndexError:
			self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
			return

		return [info]
2433
2434
class XVideosIE(InfoExtractor):
	"""Information extractor for xvideos.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
	IE_NAME = u'xvideos'

	def report_webpage(self, video_id):
		"""Announce that the video webpage is being downloaded."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Announce that metadata extraction has started."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Resolve an xvideos video page to its flv URL, title and thumbnail."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(1).decode('utf-8')

		self.report_webpage(video_id)

		try:
			webpage = urllib2.urlopen(urllib2.Request(r'http://www.xvideos.com/video' + video_id)).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		self.report_extraction(video_id)

		# The flv URL is stored percent-encoded in a flashvars-style parameter.
		match = re.search(r'flv_url=(.+?)&', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		video_url = urllib2.unquote(match.group(1).decode('utf-8'))

		# The title comes from the <title> tag, minus the site suffix.
		match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = match.group(1).decode('utf-8')

		# Thumbnail: first matching image URL on the site's thumbnail CDN.
		match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			return
		video_thumbnail = match.group(0).decode('utf-8')

		return [{
			'id': video_id,
			'url': video_url,
			'uploader': None,
			'upload_date': None,
			'title': video_title,
			'ext': 'flv',
			'format': 'flv',
			'thumbnail': video_thumbnail,
			'description': None,
			'player_url': None,
		}]
2505
2506
2507 class SoundcloudIE(InfoExtractor):
2508         """Information extractor for soundcloud.com
2509            To access the media, the uid of the song and a stream token
2510            must be extracted from the page source and the script must make
2511            a request to media.soundcloud.com/crossdomain.xml. Then
2512            the media can be grabbed by requesting from an url composed
2513            of the stream token and uid
2514          """
2515
2516         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2517         IE_NAME = u'soundcloud'
2518
2519         def __init__(self, downloader=None):
2520                 InfoExtractor.__init__(self, downloader)
2521
2522         def report_webpage(self, video_id):
2523                 """Report information extraction."""
2524                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2525
2526         def report_extraction(self, video_id):
2527                 """Report information extraction."""
2528                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2529
2530         def _real_extract(self, url):
2531                 mobj = re.match(self._VALID_URL, url)
2532                 if mobj is None:
2533                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2534                         return
2535
2536                 # extract uploader (which is in the url)
2537                 uploader = mobj.group(1).decode('utf-8')
2538                 # extract simple title (uploader + slug of song title)
2539                 slug_title =  mobj.group(2).decode('utf-8')
2540                 simple_title = uploader + u'-' + slug_title
2541
2542                 self.report_webpage('%s/%s' % (uploader, slug_title))
2543
2544                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2545                 try:
2546                         webpage = urllib2.urlopen(request).read()
2547                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2548                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2549                         return
2550
2551                 self.report_extraction('%s/%s' % (uploader, slug_title))
2552
2553                 # extract uid and stream token that soundcloud hands out for access
2554                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2555                 if mobj:
2556                         video_id = mobj.group(1)
2557                         stream_token = mobj.group(2)
2558
2559                 # extract unsimplified title
2560                 mobj = re.search('"title":"(.*?)",', webpage)
2561                 if mobj:
2562                         title = mobj.group(1).decode('utf-8')
2563                 else:
2564                         title = simple_title
2565
2566                 # construct media url (with uid/token)
2567                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2568                 mediaURL = mediaURL % (video_id, stream_token)
2569
2570                 # description
2571                 description = u'No description available'
2572                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2573                 if mobj:
2574                         description = mobj.group(1)
2575                 
2576                 # upload date
2577                 upload_date = None
2578                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2579                 if mobj:
2580                         try:
2581                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2582                         except Exception, e:
2583                                 self._downloader.to_stderr(str(e))
2584
2585                 # for soundcloud, a request to a cross domain is required for cookies
2586                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2587
2588                 return [{
2589                         'id':           video_id.decode('utf-8'),
2590                         'url':          mediaURL,
2591                         'uploader':     uploader.decode('utf-8'),
2592                         'upload_date':  upload_date,
2593                         'title':        title,
2594                         'ext':          u'mp3',
2595                         'format':       u'NA',
2596                         'player_url':   None,
2597                         'description': description.decode('utf-8')
2598                 }]
2599
2600
2601 class InfoQIE(InfoExtractor):
2602         """Information extractor for infoq.com"""
2603
2604         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2605         IE_NAME = u'infoq'
2606
2607         def report_webpage(self, video_id):
2608                 """Report information extraction."""
2609                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2610
2611         def report_extraction(self, video_id):
2612                 """Report information extraction."""
2613                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2614
2615         def _real_extract(self, url):
2616                 mobj = re.match(self._VALID_URL, url)
2617                 if mobj is None:
2618                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2619                         return
2620
2621                 self.report_webpage(url)
2622
2623                 request = urllib2.Request(url)
2624                 try:
2625                         webpage = urllib2.urlopen(request).read()
2626                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2627                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2628                         return
2629
2630                 self.report_extraction(url)
2631
2632
2633                 # Extract video URL
2634                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2635                 if mobj is None:
2636                         self._downloader.trouble(u'ERROR: unable to extract video url')
2637                         return
2638                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2639
2640
2641                 # Extract title
2642                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2643                 if mobj is None:
2644                         self._downloader.trouble(u'ERROR: unable to extract video title')
2645                         return
2646                 video_title = mobj.group(1).decode('utf-8')
2647
2648                 # Extract description
2649                 video_description = u'No description available.'
2650                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2651                 if mobj is not None:
2652                         video_description = mobj.group(1).decode('utf-8')
2653
2654                 video_filename = video_url.split('/')[-1]
2655                 video_id, extension = video_filename.split('.')
2656
2657                 info = {
2658                         'id': video_id,
2659                         'url': video_url,
2660                         'uploader': None,
2661                         'upload_date': None,
2662                         'title': video_title,
2663                         'ext': extension,
2664                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2665                         'thumbnail': None,
2666                         'description': video_description,
2667                         'player_url': None,
2668                 }
2669
2670                 return [info]
2671
2672 class MixcloudIE(InfoExtractor):
2673         """Information extractor for www.mixcloud.com"""
2674         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2675         IE_NAME = u'mixcloud'
2676
2677         def __init__(self, downloader=None):
2678                 InfoExtractor.__init__(self, downloader)
2679
2680         def report_download_json(self, file_id):
2681                 """Report JSON download."""
2682                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2683
2684         def report_extraction(self, file_id):
2685                 """Report information extraction."""
2686                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2687
2688         def get_urls(self, jsonData, fmt, bitrate='best'):
2689                 """Get urls from 'audio_formats' section in json"""
2690                 file_url = None
2691                 try:
2692                         bitrate_list = jsonData[fmt]
2693                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2694                                 bitrate = max(bitrate_list) # select highest
2695
2696                         url_list = jsonData[fmt][bitrate]
2697                 except TypeError: # we have no bitrate info.
2698                         url_list = jsonData[fmt]
2699                 return url_list
2700
2701         def check_urls(self, url_list):
2702                 """Returns 1st active url from list"""
2703                 for url in url_list:
2704                         try:
2705                                 urllib2.urlopen(url)
2706                                 return url
2707                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2708                                 url = None
2709
2710                 return None
2711
2712         def _print_formats(self, formats):
2713                 print 'Available formats:'
2714                 for fmt in formats.keys():
2715                         for b in formats[fmt]:
2716                                 try:
2717                                         ext = formats[fmt][b][0]
2718                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2719                                 except TypeError: # we have no bitrate info
2720                                         ext = formats[fmt][0]
2721                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2722                                         break
2723
2724         def _real_extract(self, url):
2725                 mobj = re.match(self._VALID_URL, url)
2726                 if mobj is None:
2727                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2728                         return
2729                 # extract uploader & filename from url
2730                 uploader = mobj.group(1).decode('utf-8')
2731                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2732
2733                 # construct API request
2734                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2735                 # retrieve .json file with links to files
2736                 request = urllib2.Request(file_url)
2737                 try:
2738                         self.report_download_json(file_url)
2739                         jsonData = urllib2.urlopen(request).read()
2740                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2741                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2742                         return
2743
2744                 # parse JSON
2745                 json_data = json.loads(jsonData)
2746                 player_url = json_data['player_swf_url']
2747                 formats = dict(json_data['audio_formats'])
2748
2749                 req_format = self._downloader.params.get('format', None)
2750                 bitrate = None
2751
2752                 if self._downloader.params.get('listformats', None):
2753                         self._print_formats(formats)
2754                         return
2755
2756                 if req_format is None or req_format == 'best':
2757                         for format_param in formats.keys():
2758                                 url_list = self.get_urls(formats, format_param)
2759                                 # check urls
2760                                 file_url = self.check_urls(url_list)
2761                                 if file_url is not None:
2762                                         break # got it!
2763                 else:
2764                         if req_format not in formats.keys():
2765                                 self._downloader.trouble(u'ERROR: format is not available')
2766                                 return
2767
2768                         url_list = self.get_urls(formats, req_format)
2769                         file_url = self.check_urls(url_list)
2770                         format_param = req_format
2771
2772                 return [{
2773                         'id': file_id.decode('utf-8'),
2774                         'url': file_url.decode('utf-8'),
2775                         'uploader':     uploader.decode('utf-8'),
2776                         'upload_date': u'NA',
2777                         'title': json_data['name'],
2778                         'ext': file_url.split('.')[-1].decode('utf-8'),
2779                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2780                         'thumbnail': json_data['thumbnail_url'],
2781                         'description': json_data['description'],
2782                         'player_url': player_url.decode('utf-8'),
2783                 }]
2784
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Matches the root page, a course page (course=...), or a single video
	# page (course=...&video=...); the named groups drive the dispatch below.
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on the URL kind: a single video, a course page, or the root.

		Course and root pages act as playlists: the links they contain are
		collected and each one is recursively fed back through self.extract().
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
			}

			self.report_extraction(info['id'])
			# Each video has a sibling .xml document with title and file name.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				# videoFile is relative to the course's videos/ directory.
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				# Fall back to the course id when no heading is present.
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Deduplicate video links while preserving page order.
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			# Recurse into each video page and flatten the results.
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']

			# Deduplicate course links while preserving page order.
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			# Recurse into each course page and flatten the results.
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
2896
2897 class MTVIE(InfoExtractor):
2898         """Information extractor for MTV.com"""
2899
2900         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2901         IE_NAME = u'mtv'
2902
2903         def report_webpage(self, video_id):
2904                 """Report information extraction."""
2905                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2906
2907         def report_extraction(self, video_id):
2908                 """Report information extraction."""
2909                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2910
2911         def _real_extract(self, url):
2912                 mobj = re.match(self._VALID_URL, url)
2913                 if mobj is None:
2914                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2915                         return
2916                 if not mobj.group('proto'):
2917                         url = 'http://' + url
2918                 video_id = mobj.group('videoid')
2919                 self.report_webpage(video_id)
2920
2921                 request = urllib2.Request(url)
2922                 try:
2923                         webpage = urllib2.urlopen(request).read()
2924                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2925                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2926                         return
2927
2928                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2929                 if mobj is None:
2930                         self._downloader.trouble(u'ERROR: unable to extract song name')
2931                         return
2932                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2933                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2934                 if mobj is None:
2935                         self._downloader.trouble(u'ERROR: unable to extract performer')
2936                         return
2937                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2938                 video_title = performer + ' - ' + song_name 
2939
2940                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2941                 if mobj is None:
2942                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2943                         return
2944                 mtvn_uri = mobj.group(1)
2945
2946                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2947                 if mobj is None:
2948                         self._downloader.trouble(u'ERROR: unable to extract content id')
2949                         return
2950                 content_id = mobj.group(1)
2951
2952                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2953                 self.report_extraction(video_id)
2954                 request = urllib2.Request(videogen_url)
2955                 try:
2956                         metadataXml = urllib2.urlopen(request).read()
2957                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2958                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2959                         return
2960
2961                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2962                 renditions = mdoc.findall('.//rendition')
2963
2964                 # For now, always pick the highest quality.
2965                 rendition = renditions[-1]
2966
2967                 try:
2968                         _,_,ext = rendition.attrib['type'].partition('/')
2969                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2970                         video_url = rendition.find('./src').text
2971                 except KeyError:
2972                         self._downloader.trouble('Invalid rendition field.')
2973                         return
2974
2975                 info = {
2976                         'id': video_id,
2977                         'url': video_url,
2978                         'uploader': performer,
2979                         'title': video_title,
2980                         'ext': ext,
2981                         'format': format,
2982                 }
2983
2984                 return [info]
2985
2986
class YoukuIE(InfoExtractor):
	"""Information extractor for v.youku.com.

	Youku serves a video as a sequence of flv/mp4 segments.  The segment
	file ids are obfuscated: the playlist JSON carries a numeric seed and a
	'*'-separated list of indices into a seed-shuffled alphabet, which
	_get_file_id() decodes.  One info dictionary is returned per segment.
	"""

	_VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
	IE_NAME = u'Youku'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

	def _gen_sid(self):
		"""Return a pseudo-random session id: millisecond timestamp plus two random numbers."""
		nowTime = int(time.time() * 1000)
		random1 = random.randint(1000,1998)
		random2 = random.randint(1000,9999)

		return "%d%d%d" %(nowTime,random1,random2)

	def _get_file_ID_mix_string(self, seed):
		"""Return the seed-shuffled alphabet (as a list) used to decode file ids.

		Reproduces Youku's linear-congruential shuffle: each iteration
		advances the seed and moves one character from the remaining pool
		into the result, so the output is a permutation of the source
		alphabet fully determined by the numeric seed.
		"""
		mixed = []
		source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
		seed = float(seed)
		for i in range(len(source)):
			seed  =  (seed * 211 + 30031 ) % 65536
			index  =  math.floor(seed / 65536 * len(source) )
			mixed.append(source[int(index)])
			source.remove(source[int(index)])
		return mixed

	def _get_file_id(self, fileId, seed):
		"""Decode an obfuscated file id.

		fileId is a '*'-separated string of integer indices; each index
		selects one character from the seed-shuffled alphabet.  Empty
		fields (e.g. a trailing '*') are skipped.
		"""
		mixed = self._get_file_ID_mix_string(seed)
		ids = fileId.split('*')
		realId = []
		for ch in ids:
			if ch:
				realId.append(mixed[int(ch)])
		return ''.join(realId)

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

		request = urllib2.Request(info_url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			jsondata = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		self.report_extraction(video_id)
		try:
			# NOTE(review): 'json' is not among this file's visible
			# top-level imports -- confirm it is provided (e.g. via
			# 'from utils import *') or add 'import json' at file top.
			config = json.loads(jsondata)

			video_title =  config['data'][0]['title']
			seed = config['data'][0]['seed']

			format = self._downloader.params.get('format', None)
			supported_format = config['data'][0]['streamfileids'].keys()

			# NOTE(review): any explicit format other than 'best'/'worst'
			# is coerced to 'flv' here -- confirm this is intentional.
			if format is None or format == 'best':
				if 'hd2' in supported_format:
					format = 'hd2'
				else:
					format = 'flv'
				ext = u'flv'
			elif format == 'worst':
				format = 'mp4'
				ext = u'mp4'
			else:
				format = 'flv'
				ext = u'flv'


			fileid = config['data'][0]['streamfileids'][format]
			seg_number = len(config['data'][0]['segs'][format])

			# One key per segment; each is needed to build that segment's URL.
			keys=[]
			for i in xrange(seg_number):
				keys.append(config['data'][0]['segs'][format][i]['k'])

			#TODO check error
			#youku only could be viewed from mainland china
		except Exception:
			# Narrowed from a bare 'except:' so KeyboardInterrupt and
			# SystemExit still abort instead of being swallowed here.
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		files_info=[]
		sid = self._gen_sid()
		fileid = self._get_file_id(fileid, seed)

		#column 8,9 of fileid represent the segment number
		#fileid[7:9] should be changed
		for index, key in enumerate(keys):

			temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
			download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

			info = {
				'id': '%s_part%02d' % (video_id, index),
				'url': download_url,
				'uploader': None,
				'title': video_title,
				'ext': ext,
				'format': u'NA'
			}
			files_info.append(info)

		return files_info
3107
3108
class XNXXIE(InfoExtractor):
	"""Information extractor for xnxx.com.

	Downloads the video page and scrapes the flv URL, title and thumbnail
	out of it with the three class-level regexes below.
	"""

	_VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
	IE_NAME = u'xnxx'
	VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
	VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
	VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

	def report_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(1).decode('utf-8')

		self.report_webpage(video_id)

		# Get webpage content
		try:
			webpage = urllib2.urlopen(url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			# 'as err' form for consistency with the other handlers in
			# this file (valid since Python 2.6).
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
			return

		# The flv URL is percent-encoded inside the page's flashvars.
		result = re.search(self.VIDEO_URL_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		video_url = urllib.unquote(result.group(1).decode('utf-8'))

		result = re.search(self.VIDEO_TITLE_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = result.group(1).decode('utf-8')

		result = re.search(self.VIDEO_THUMB_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			return
		video_thumbnail = result.group(1).decode('utf-8')

		info = {'id': video_id,
				'url': video_url,
				'uploader': None,
				'upload_date': None,
				'title': video_title,
				'ext': 'flv',
				'format': 'flv',
				'thumbnail': video_thumbnail,
				'description': None,
				'player_url': None}

		return [info]