_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import HTMLParser
   6 import httplib
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import time
  12 import urllib
  13 import urllib2
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 from urlparse import parse_qs
  19
  20 try:
  21         import cStringIO as StringIO
  22 except ImportError:
  23         import StringIO
  24
  25 from utils import *
  26
  27
  28 class InfoExtractor(object):
  29         """Information Extractor class.
  30
  31         Information extractors are the classes that, given a URL, extract
  32         information from the video (or videos) the URL refers to. This
  33         information includes the real video URL, the video title and simplified
  34         title, author and others. The information is stored in a dictionary
  35         which is then passed to the FileDownloader. The FileDownloader
  36         processes this information possibly downloading the video to the file
  37         system, among other possible outcomes. The dictionaries must include
  38         the following fields:
  39
  40         id:             Video identifier.
  41         url:            Final video URL.
  42         uploader:       Nickname of the video uploader.
  43         title:          Literal title.
  44         ext:            Video filename extension.
  45         format:         Video format.
  46         player_url:     SWF Player URL (may be None).
  47
  48         The following fields are optional. Their primary purpose is to allow
  49         youtube-dl to serve as the backend for a video search function, such
  50         as the one in youtube2mp3.  They are only used when their respective
  51         forced printing functions are called:
  52
  53         thumbnail:      Full URL to a video thumbnail image.
  54         description:    One-line video description.
  55
  56         Subclasses of this one should re-define the _real_initialize() and
  57         _real_extract() methods and define a _VALID_URL regexp.
  58         Probably, they should also be added to the list of extractors.
  59         """
  60
  61         _ready = False
  62         _downloader = None
  63
  64         def __init__(self, downloader=None):
  65                 """Constructor. Receives an optional downloader."""
  66                 self._ready = False
  67                 self.set_downloader(downloader)
  68
  69         def suitable(self, url):
  70                 """Receives a URL and returns True if suitable for this IE."""
  71                 return re.match(self._VALID_URL, url) is not None
  72
  73         def initialize(self):
  74                 """Initializes an instance (authentication, etc)."""
  75                 if not self._ready:
  76                         self._real_initialize()
  77                         self._ready = True
  78
  79         def extract(self, url):
  80                 """Extracts URL information and returns it in list of dicts."""
  81                 self.initialize()
  82                 return self._real_extract(url)
  83
  84         def set_downloader(self, downloader):
  85                 """Sets the downloader for this IE."""
  86                 self._downloader = downloader
  87
  88         def _real_initialize(self):
  89                 """Real initialization process. Redefine in subclasses."""
  90                 pass
  91
  92         def _real_extract(self, url):
  93                 """Real extraction process. Redefine in subclasses."""
  94                 pass
  95
  96
  97 class YoutubeIE(InfoExtractor):
  98         """Information extractor for youtube.com."""
  99
 100         _VALID_URL = r"""^
 101                          (
 102                              (?:https?://)?                                       # http(s):// (optional)
 103                              (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 104                                 tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 105                              (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 106                              (?:                                                  # the various things that can precede the ID:
 107                                  (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 108                                  |(?:                                             # or the v= param in all its forms
 109                                      (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 110                                      (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 111                                      (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
 112                                      v=
 113                                  )
 114                              )?                                                   # optional -> youtube.com/xxxx is OK
 115                          )?                                                       # all until now is optional -> you can pass the naked ID
 116                          ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 117                          (?(1).+)?                                                # if we found the ID, everything can follow
 118                          $"""
 119         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 120         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 121         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 122         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 123         _NETRC_MACHINE = 'youtube'
 124         # Listed in order of quality
 125         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 126         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 127         _video_extensions = {
 128                 '13': '3gp',
 129                 '17': 'mp4',
 130                 '18': 'mp4',
 131                 '22': 'mp4',
 132                 '37': 'mp4',
 133                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 134                 '43': 'webm',
 135                 '44': 'webm',
 136                 '45': 'webm',
 137                 '46': 'webm',
 138         }
 139         _video_dimensions = {
 140                 '5': '240x400',
 141                 '6': '???',
 142                 '13': '???',
 143                 '17': '144x176',
 144                 '18': '360x640',
 145                 '22': '720x1280',
 146                 '34': '360x640',
 147                 '35': '480x854',
 148                 '37': '1080x1920',
 149                 '38': '3072x4096',
 150                 '43': '360x640',
 151                 '44': '480x854',
 152                 '45': '720x1280',
 153                 '46': '1080x1920',
 154         }
 155         IE_NAME = u'youtube'
 156
 157         def suitable(self, url):
 158                 """Receives a URL and returns True if suitable for this IE."""
 159                 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 160
 161         def report_lang(self):
 162                 """Report attempt to set language."""
 163                 self._downloader.to_screen(u'[youtube] Setting language')
 164
 165         def report_login(self):
 166                 """Report attempt to log in."""
 167                 self._downloader.to_screen(u'[youtube] Logging in')
 168
 169         def report_age_confirmation(self):
 170                 """Report attempt to confirm age."""
 171                 self._downloader.to_screen(u'[youtube] Confirming age')
 172
 173         def report_video_webpage_download(self, video_id):
 174                 """Report attempt to download video webpage."""
 175                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 176
 177         def report_video_info_webpage_download(self, video_id):
 178                 """Report attempt to download video info webpage."""
 179                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 180
 181         def report_video_subtitles_download(self, video_id):
 182                 """Report attempt to download video info webpage."""
 183                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 184
 185         def report_information_extraction(self, video_id):
 186                 """Report attempt to extract video information."""
 187                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 188
 189         def report_unavailable_format(self, video_id, format):
 190                 """Report extracted video URL."""
 191                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 192
 193         def report_rtmp_download(self):
 194                 """Indicate the download will use the RTMP protocol."""
 195                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 196
 197         def _closed_captions_xml_to_srt(self, xml_string):
 198                 srt = ''
 199                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 200                 # TODO parse xml instead of regex
 201                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
 202                         if not dur: dur = '4'
 203                         start = float(start)
 204                         end = start + float(dur)
 205                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 206                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 207                         caption = unescapeHTML(caption)
 208                         caption = unescapeHTML(caption) # double cycle, intentional
 209                         srt += str(n+1) + '\n'
 210                         srt += start + ' --> ' + end + '\n'
 211                         srt += caption + '\n\n'
 212                 return srt
 213
 214         def _print_formats(self, formats):
 215                 print 'Available formats:'
 216                 for x in formats:
 217                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
 218
 219         def _real_initialize(self):
 220                 if self._downloader is None:
 221                         return
 222
 223                 username = None
 224                 password = None
 225                 downloader_params = self._downloader.params
 226
 227                 # Attempt to use provided username and password or .netrc data
 228                 if downloader_params.get('username', None) is not None:
 229                         username = downloader_params['username']
 230                         password = downloader_params['password']
 231                 elif downloader_params.get('usenetrc', False):
 232                         try:
 233                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 234                                 if info is not None:
 235                                         username = info[0]
 236                                         password = info[2]
 237                                 else:
 238                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 239                         except (IOError, netrc.NetrcParseError), err:
 240                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 241                                 return
 242
 243                 # Set language
 244                 request = urllib2.Request(self._LANG_URL)
 245                 try:
 246                         self.report_lang()
 247                         urllib2.urlopen(request).read()
 248                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 249                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 250                         return
 251
 252                 # No authentication to be performed
 253                 if username is None:
 254                         return
 255
 256                 # Log in
 257                 login_form = {
 258                                 'current_form': 'loginForm',
 259                                 'next':         '/',
 260                                 'action_login': 'Log In',
 261                                 'username':     username,
 262                                 'password':     password,
 263                                 }
 264                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 265                 try:
 266                         self.report_login()
 267                         login_results = urllib2.urlopen(request).read()
 268                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 269                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 270                                 return
 271                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 272                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 273                         return
 274
 275                 # Confirm age
 276                 age_form = {
 277                                 'next_url':             '/',
 278                                 'action_confirm':       'Confirm',
 279                                 }
 280                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
 281                 try:
 282                         self.report_age_confirmation()
 283                         age_results = urllib2.urlopen(request).read()
 284                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 285                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 286                         return
 287
 288         def _real_extract(self, url):
 289                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 290                 mobj = re.search(self._NEXT_URL_RE, url)
 291                 if mobj:
 292                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
 293
 294                 # Extract video id from URL
 295                 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 296                 if mobj is None:
 297                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 298                         return
 299                 video_id = mobj.group(2)
 300
 301                 # Get video webpage
 302                 self.report_video_webpage_download(video_id)
 303                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
 304                 try:
 305                         video_webpage = urllib2.urlopen(request).read()
 306                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 307                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 308                         return
 309
 310                 # Attempt to extract SWF player URL
 311                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 312                 if mobj is not None:
 313                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 314                 else:
 315                         player_url = None
 316
 317                 # Get video info
 318                 self.report_video_info_webpage_download(video_id)
 319                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 320                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 321                                         % (video_id, el_type))
 322                         request = urllib2.Request(video_info_url)
 323                         try:
 324                                 video_info_webpage = urllib2.urlopen(request).read()
 325                                 video_info = parse_qs(video_info_webpage)
 326                                 if 'token' in video_info:
 327                                         break
 328                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 329                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 330                                 return
 331                 if 'token' not in video_info:
 332                         if 'reason' in video_info:
 333                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 334                         else:
 335                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 336                         return
 337
 338                 # Check for "rental" videos
 339                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 340                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
 341                         return
 342
 343                 # Start extracting information
 344                 self.report_information_extraction(video_id)
 345
 346                 # uploader
 347                 if 'author' not in video_info:
 348                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 349                         return
 350                 video_uploader = urllib.unquote_plus(video_info['author'][0])
 351
 352                 # title
 353                 if 'title' not in video_info:
 354                         self._downloader.trouble(u'ERROR: unable to extract video title')
 355                         return
 356                 video_title = urllib.unquote_plus(video_info['title'][0])
 357                 video_title = video_title.decode('utf-8')
 358
 359                 # thumbnail image
 360                 if 'thumbnail_url' not in video_info:
 361                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 362                         video_thumbnail = ''
 363                 else:   # don't panic if we can't find it
 364                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 365
 366                 # upload date
 367                 upload_date = u'NA'
 368                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 369                 if mobj is not None:
 370                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 371                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 372                         for expression in format_expressions:
 373                                 try:
 374                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 375                                 except:
 376                                         pass
 377
 378                 # description
 379                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
 380                 if video_description: video_description = clean_html(video_description)
 381                 else: video_description = ''
 382
 383                 # closed captions
 384                 video_subtitles = None
 385                 if self._downloader.params.get('writesubtitles', False):
 386                         try:
 387                                 self.report_video_subtitles_download(video_id)
 388                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 389                                 try:
 390                                         srt_list = urllib2.urlopen(request).read()
 391                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 392                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 393                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 394                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 395                                 if not srt_lang_list:
 396                                         raise Trouble(u'WARNING: video has no closed captions')
 397                                 if self._downloader.params.get('subtitleslang', False):
 398                                         srt_lang = self._downloader.params.get('subtitleslang')
 399                                 elif 'en' in srt_lang_list:
 400                                         srt_lang = 'en'
 401                                 else:
 402                                         srt_lang = srt_lang_list.keys()[0]
 403                                 if not srt_lang in srt_lang_list:
 404                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
 405                                 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 406                                 try:
 407                                         srt_xml = urllib2.urlopen(request).read()
 408                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 409                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 410                                 if not srt_xml:
 411                                         raise Trouble(u'WARNING: unable to download video subtitles')
 412                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
 413                         except Trouble as trouble:
 414                                 self._downloader.trouble(trouble[0])
 415
 416                 # token
 417                 video_token = urllib.unquote_plus(video_info['token'][0])
 418
 419                 # Decide which formats to download
 420                 req_format = self._downloader.params.get('format', None)
 421
 422                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 423                         self.report_rtmp_download()
 424                         video_url_list = [(None, video_info['conn'][0])]
 425                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 426                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 427                         url_data = [parse_qs(uds) for uds in url_data_strs]
 428                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
 429                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 430
 431                         format_limit = self._downloader.params.get('format_limit', None)
 432                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 433                         if format_limit is not None and format_limit in available_formats:
 434                                 format_list = available_formats[available_formats.index(format_limit):]
 435                         else:
 436                                 format_list = available_formats
 437                         existing_formats = [x for x in format_list if x in url_map]
 438                         if len(existing_formats) == 0:
 439                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 440                                 return
 441                         if self._downloader.params.get('listformats', None):
 442                                 self._print_formats(existing_formats)
 443                                 return
 444                         if req_format is None or req_format == 'best':
 445                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 446                         elif req_format == 'worst':
 447                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 448                         elif req_format in ('-1', 'all'):
 449                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 450                         else:
 451                                 # Specific formats. We pick the first in a slash-delimeted sequence.
 452                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 453                                 req_formats = req_format.split('/')
 454                                 video_url_list = None
 455                                 for rf in req_formats:
 456                                         if rf in url_map:
 457                                                 video_url_list = [(rf, url_map[rf])]
 458                                                 break
 459                                 if video_url_list is None:
 460                                         self._downloader.trouble(u'ERROR: requested format not available')
 461                                         return
 462                 else:
 463                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 464                         return
 465
 466                 results = []
 467                 for format_param, video_real_url in video_url_list:
 468                         # Extension
 469                         video_extension = self._video_extensions.get(format_param, 'flv')
 470
 471                         results.append({
 472                                 'id':           video_id.decode('utf-8'),
 473                                 'url':          video_real_url.decode('utf-8'),
 474                                 'uploader':     video_uploader.decode('utf-8'),
 475                                 'upload_date':  upload_date,
 476                                 'title':        video_title,
 477                                 'ext':          video_extension.decode('utf-8'),
 478                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 479                                 'thumbnail':    video_thumbnail.decode('utf-8'),
 480                                 'description':  video_description,
 481                                 'player_url':   player_url,
 482                                 'subtitles':    video_subtitles
 483                         })
 484                 return results
 485
 486
 487 class MetacafeIE(InfoExtractor):
 488         """Information Extractor for metacafe.com."""
 489
 490         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 491         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 492         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 493         IE_NAME = u'metacafe'
 494
 495         def __init__(self, downloader=None):
 496                 InfoExtractor.__init__(self, downloader)
 497
 498         def report_disclaimer(self):
 499                 """Report disclaimer retrieval."""
 500                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 501
 502         def report_age_confirmation(self):
 503                 """Report attempt to confirm age."""
 504                 self._downloader.to_screen(u'[metacafe] Confirming age')
 505
 506         def report_download_webpage(self, video_id):
 507                 """Report webpage download."""
 508                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 509
 510         def report_extraction(self, video_id):
 511                 """Report information extraction."""
 512                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 513
 514         def _real_initialize(self):
 515                 # Retrieve disclaimer
 516                 request = urllib2.Request(self._DISCLAIMER)
 517                 try:
 518                         self.report_disclaimer()
 519                         disclaimer = urllib2.urlopen(request).read()
 520                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 521                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 522                         return
 523
 524                 # Confirm age
 525                 disclaimer_form = {
 526                         'filters': '0',
 527                         'submit': "Continue - I'm over 18",
 528                         }
 529                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
 530                 try:
 531                         self.report_age_confirmation()
 532                         disclaimer = urllib2.urlopen(request).read()
 533                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 534                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 535                         return
 536
 537         def _real_extract(self, url):
 538                 # Extract id and simplified title from URL
 539                 mobj = re.match(self._VALID_URL, url)
 540                 if mobj is None:
 541                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 542                         return
 543
 544                 video_id = mobj.group(1)
 545
 546                 # Check if video comes from YouTube
 547                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 548                 if mobj2 is not None:
 549                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 550                         return
 551
 552                 # Retrieve video webpage to extract further information
 553                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 554                 try:
 555                         self.report_download_webpage(video_id)
 556                         webpage = urllib2.urlopen(request).read()
 557                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 558                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 559                         return
 560
 561                 # Extract URL, uploader and title from webpage
 562                 self.report_extraction(video_id)
 563                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 564                 if mobj is not None:
 565                         mediaURL = urllib.unquote(mobj.group(1))
 566                         video_extension = mediaURL[-3:]
 567
 568                         # Extract gdaKey if available
 569                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 570                         if mobj is None:
 571                                 video_url = mediaURL
 572                         else:
 573                                 gdaKey = mobj.group(1)
 574                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 575                 else:
 576                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 577                         if mobj is None:
 578                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 579                                 return
 580                         vardict = parse_qs(mobj.group(1))
 581                         if 'mediaData' not in vardict:
 582                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 583                                 return
 584                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 585                         if mobj is None:
 586                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 587                                 return
 588                         mediaURL = mobj.group(1).replace('\\/', '/')
 589                         video_extension = mediaURL[-3:]
 590                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 591
 592                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 593                 if mobj is None:
 594                         self._downloader.trouble(u'ERROR: unable to extract title')
 595                         return
 596                 video_title = mobj.group(1).decode('utf-8')
 597
 598                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
 599                 if mobj is None:
 600                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 601                         return
 602                 video_uploader = mobj.group(1)
 603
 604                 return [{
 605                         'id':           video_id.decode('utf-8'),
 606                         'url':          video_url.decode('utf-8'),
 607                         'uploader':     video_uploader.decode('utf-8'),
 608                         'upload_date':  u'NA',
 609                         'title':        video_title,
 610                         'ext':          video_extension.decode('utf-8'),
 611                         'format':       u'NA',
 612                         'player_url':   None,
 613                 }]
 614
 615
 616 class DailymotionIE(InfoExtractor):
 617         """Information Extractor for Dailymotion"""
 618
 619         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 620         IE_NAME = u'dailymotion'
 621
 622         def __init__(self, downloader=None):
 623                 InfoExtractor.__init__(self, downloader)
 624
 625         def report_download_webpage(self, video_id):
 626                 """Report webpage download."""
 627                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 628
 629         def report_extraction(self, video_id):
 630                 """Report information extraction."""
 631                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 632
 633         def _real_extract(self, url):
 634                 # Extract id and simplified title from URL
 635                 mobj = re.match(self._VALID_URL, url)
 636                 if mobj is None:
 637                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 638                         return
 639
 640                 video_id = mobj.group(1).split('_')[0].split('?')[0]
 641
 642                 video_extension = 'mp4'
 643
 644                 # Retrieve video webpage to extract further information
 645                 request = urllib2.Request(url)
 646                 request.add_header('Cookie', 'family_filter=off')
 647                 try:
 648                         self.report_download_webpage(video_id)
 649                         webpage = urllib2.urlopen(request).read()
 650                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 651                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 652                         return
 653
 654                 # Extract URL, uploader and title from webpage
 655                 self.report_extraction(video_id)
 656                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 657                 if mobj is None:
 658                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 659                         return
 660                 flashvars = urllib.unquote(mobj.group(1))
 661                 if 'hqURL' in flashvars: max_quality = 'hqURL'
 662                 elif 'sdURL' in flashvars: max_quality = 'sdURL'
 663                 else: max_quality = 'ldURL'
 664                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 665                 if mobj is None:
 666                         mobj = re.search(r'"video_url":"(.*?)",', flashvars)
 667                 if mobj is None:
 668                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 669                         return
 670                 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
 671
 672                 # TODO: support choosing qualities
 673
 674                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 675                 if mobj is None:
 676                         self._downloader.trouble(u'ERROR: unable to extract title')
 677                         return
 678                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
 679
 680                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
 681                 if mobj is None:
 682                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 683                         return
 684                 video_uploader = mobj.group(1)
 685
 686                 video_upload_date = u'NA'
 687                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 688                 if mobj is not None:
 689                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 690
 691                 return [{
 692                         'id':           video_id.decode('utf-8'),
 693                         'url':          video_url.decode('utf-8'),
 694                         'uploader':     video_uploader.decode('utf-8'),
 695                         'upload_date':  video_upload_date,
 696                         'title':        video_title,
 697                         'ext':          video_extension.decode('utf-8'),
 698                         'format':       u'NA',
 699                         'player_url':   None,
 700                 }]
 701
 702
 703 class GoogleIE(InfoExtractor):
 704         """Information extractor for video.google.com."""
 705
 706         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 707         IE_NAME = u'video.google'
 708
 709         def __init__(self, downloader=None):
 710                 InfoExtractor.__init__(self, downloader)
 711
 712         def report_download_webpage(self, video_id):
 713                 """Report webpage download."""
 714                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 715
 716         def report_extraction(self, video_id):
 717                 """Report information extraction."""
 718                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 719
 720         def _real_extract(self, url):
 721                 # Extract id from URL
 722                 mobj = re.match(self._VALID_URL, url)
 723                 if mobj is None:
 724                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 725                         return
 726
 727                 video_id = mobj.group(1)
 728
 729                 video_extension = 'mp4'
 730
 731                 # Retrieve video webpage to extract further information
 732                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 733                 try:
 734                         self.report_download_webpage(video_id)
 735                         webpage = urllib2.urlopen(request).read()
 736                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 737                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 738                         return
 739
 740                 # Extract URL, uploader, and title from webpage
 741                 self.report_extraction(video_id)
 742                 mobj = re.search(r"download_url:'([^']+)'", webpage)
 743                 if mobj is None:
 744                         video_extension = 'flv'
 745                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 746                 if mobj is None:
 747                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 748                         return
 749                 mediaURL = urllib.unquote(mobj.group(1))
 750                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
 751                 mediaURL = mediaURL.replace('\\x26', '\x26')
 752
 753                 video_url = mediaURL
 754
 755                 mobj = re.search(r'<title>(.*)</title>', webpage)
 756                 if mobj is None:
 757                         self._downloader.trouble(u'ERROR: unable to extract title')
 758                         return
 759                 video_title = mobj.group(1).decode('utf-8')
 760
 761                 # Extract video description
 762                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 763                 if mobj is None:
 764                         self._downloader.trouble(u'ERROR: unable to extract video description')
 765                         return
 766                 video_description = mobj.group(1).decode('utf-8')
 767                 if not video_description:
 768                         video_description = 'No description available.'
 769
 770                 # Extract video thumbnail
 771                 if self._downloader.params.get('forcethumbnail', False):
 772                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 773                         try:
 774                                 webpage = urllib2.urlopen(request).read()
 775                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 776                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 777                                 return
 778                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 779                         if mobj is None:
 780                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 781                                 return
 782                         video_thumbnail = mobj.group(1)
 783                 else:   # we need something to pass to process_info
 784                         video_thumbnail = ''
 785
 786                 return [{
 787                         'id':           video_id.decode('utf-8'),
 788                         'url':          video_url.decode('utf-8'),
 789                         'uploader':     u'NA',
 790                         'upload_date':  u'NA',
 791                         'title':        video_title,
 792                         'ext':          video_extension.decode('utf-8'),
 793                         'format':       u'NA',
 794                         'player_url':   None,
 795                 }]
 796
 797
 798 class PhotobucketIE(InfoExtractor):
 799         """Information extractor for photobucket.com."""
 800
 801         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 802         IE_NAME = u'photobucket'
 803
 804         def __init__(self, downloader=None):
 805                 InfoExtractor.__init__(self, downloader)
 806
 807         def report_download_webpage(self, video_id):
 808                 """Report webpage download."""
 809                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 810
 811         def report_extraction(self, video_id):
 812                 """Report information extraction."""
 813                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 814
 815         def _real_extract(self, url):
 816                 # Extract id from URL
 817                 mobj = re.match(self._VALID_URL, url)
 818                 if mobj is None:
 819                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 820                         return
 821
 822                 video_id = mobj.group(1)
 823
 824                 video_extension = 'flv'
 825
 826                 # Retrieve video webpage to extract further information
 827                 request = urllib2.Request(url)
 828                 try:
 829                         self.report_download_webpage(video_id)
 830                         webpage = urllib2.urlopen(request).read()
 831                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 832                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 833                         return
 834
 835                 # Extract URL, uploader, and title from webpage
 836                 self.report_extraction(video_id)
 837                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 838                 if mobj is None:
 839                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 840                         return
 841                 mediaURL = urllib.unquote(mobj.group(1))
 842
 843                 video_url = mediaURL
 844
 845                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 846                 if mobj is None:
 847                         self._downloader.trouble(u'ERROR: unable to extract title')
 848                         return
 849                 video_title = mobj.group(1).decode('utf-8')
 850
 851                 video_uploader = mobj.group(2).decode('utf-8')
 852
 853                 return [{
 854                         'id':           video_id.decode('utf-8'),
 855                         'url':          video_url.decode('utf-8'),
 856                         'uploader':     video_uploader,
 857                         'upload_date':  u'NA',
 858                         'title':        video_title,
 859                         'ext':          video_extension.decode('utf-8'),
 860                         'format':       u'NA',
 861                         'player_url':   None,
 862                 }]
 863
 864
 865 class YahooIE(InfoExtractor):
 866         """Information extractor for video.yahoo.com."""
 867
 868         # _VALID_URL matches all Yahoo! Video URLs
 869         # _VPAGE_URL matches only the extractable '/watch/' URLs
 870         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 871         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 872         IE_NAME = u'video.yahoo'
 873
 874         def __init__(self, downloader=None):
 875                 InfoExtractor.__init__(self, downloader)
 876
 877         def report_download_webpage(self, video_id):
 878                 """Report webpage download."""
 879                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 880
 881         def report_extraction(self, video_id):
 882                 """Report information extraction."""
 883                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 884
 885         def _real_extract(self, url, new_video=True):
 886                 # Extract ID from URL
 887                 mobj = re.match(self._VALID_URL, url)
 888                 if mobj is None:
 889                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 890                         return
 891
 892                 video_id = mobj.group(2)
 893                 video_extension = 'flv'
 894
 895                 # Rewrite valid but non-extractable URLs as
 896                 # extractable English language /watch/ URLs
 897                 if re.match(self._VPAGE_URL, url) is None:
 898                         request = urllib2.Request(url)
 899                         try:
 900                                 webpage = urllib2.urlopen(request).read()
 901                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 902                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 903                                 return
 904
 905                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 906                         if mobj is None:
 907                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 908                                 return
 909                         yahoo_id = mobj.group(1)
 910
 911                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 912                         if mobj is None:
 913                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 914                                 return
 915                         yahoo_vid = mobj.group(1)
 916
 917                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 918                         return self._real_extract(url, new_video=False)
 919
 920                 # Retrieve video webpage to extract further information
 921                 request = urllib2.Request(url)
 922                 try:
 923                         self.report_download_webpage(video_id)
 924                         webpage = urllib2.urlopen(request).read()
 925                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 926                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 927                         return
 928
 929                 # Extract uploader and title from webpage
 930                 self.report_extraction(video_id)
 931                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 932                 if mobj is None:
 933                         self._downloader.trouble(u'ERROR: unable to extract video title')
 934                         return
 935                 video_title = mobj.group(1).decode('utf-8')
 936
 937                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 938                 if mobj is None:
 939                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
 940                         return
 941                 video_uploader = mobj.group(1).decode('utf-8')
 942
 943                 # Extract video thumbnail
 944                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 945                 if mobj is None:
 946                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 947                         return
 948                 video_thumbnail = mobj.group(1).decode('utf-8')
 949
 950                 # Extract video description
 951                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 952                 if mobj is None:
 953                         self._downloader.trouble(u'ERROR: unable to extract video description')
 954                         return
 955                 video_description = mobj.group(1).decode('utf-8')
 956                 if not video_description:
 957                         video_description = 'No description available.'
 958
 959                 # Extract video height and width
 960                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 961                 if mobj is None:
 962                         self._downloader.trouble(u'ERROR: unable to extract video height')
 963                         return
 964                 yv_video_height = mobj.group(1)
 965
 966                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 967                 if mobj is None:
 968                         self._downloader.trouble(u'ERROR: unable to extract video width')
 969                         return
 970                 yv_video_width = mobj.group(1)
 971
 972                 # Retrieve video playlist to extract media URL
 973                 # I'm not completely sure what all these options are, but we
 974                 # seem to need most of them, otherwise the server sends a 401.
 975                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 976                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 977                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 978                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 979                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 980                 try:
 981                         self.report_download_webpage(video_id)
 982                         webpage = urllib2.urlopen(request).read()
 983                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 984                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 985                         return
 986
 987                 # Extract media URL from playlist XML
 988                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 989                 if mobj is None:
 990                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
 991                         return
 992                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 993                 video_url = unescapeHTML(video_url)
 994
 995                 return [{
 996                         'id':           video_id.decode('utf-8'),
 997                         'url':          video_url,
 998                         'uploader':     video_uploader,
 999                         'upload_date':  u'NA',
1000                         'title':        video_title,
1001                         'ext':          video_extension.decode('utf-8'),
1002                         'thumbnail':    video_thumbnail.decode('utf-8'),
1003                         'description':  video_description,
1004                         'thumbnail':    video_thumbnail,
1005                         'player_url':   None,
1006                 }]
1007
1008
1009 class VimeoIE(InfoExtractor):
1010         """Information extractor for vimeo.com."""
1011
1012         # _VALID_URL matches Vimeo URLs
1013         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1014         IE_NAME = u'vimeo'
1015
1016         def __init__(self, downloader=None):
1017                 InfoExtractor.__init__(self, downloader)
1018
1019         def report_download_webpage(self, video_id):
1020                 """Report webpage download."""
1021                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1022
1023         def report_extraction(self, video_id):
1024                 """Report information extraction."""
1025                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1026
1027         def _real_extract(self, url, new_video=True):
1028                 # Extract ID from URL
1029                 mobj = re.match(self._VALID_URL, url)
1030                 if mobj is None:
1031                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1032                         return
1033
1034                 video_id = mobj.group(1)
1035
1036                 # Retrieve video webpage to extract further information
1037                 request = urllib2.Request(url, None, std_headers)
1038                 try:
1039                         self.report_download_webpage(video_id)
1040                         webpage = urllib2.urlopen(request).read()
1041                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1042                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1043                         return
1044
1045                 # Now we begin extracting as much information as we can from what we
1046                 # retrieved. First we extract the information common to all extractors,
1047                 # and latter we extract those that are Vimeo specific.
1048                 self.report_extraction(video_id)
1049
1050                 # Extract the config JSON
1051                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1052                 try:
1053                         config = json.loads(config)
1054                 except:
1055                         self._downloader.trouble(u'ERROR: unable to extract info section')
1056                         return
1057
1058                 # Extract title
1059                 video_title = config["video"]["title"]
1060
1061                 # Extract uploader
1062                 video_uploader = config["video"]["owner"]["name"]
1063
1064                 # Extract video thumbnail
1065                 video_thumbnail = config["video"]["thumbnail"]
1066
1067                 # Extract video description
1068                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1069                 if video_description: video_description = clean_html(video_description)
1070                 else: video_description = ''
1071
1072                 # Extract upload date
1073                 video_upload_date = u'NA'
1074                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1075                 if mobj is not None:
1076                         video_upload_date = mobj.group(1)
1077
1078                 # Vimeo specific: extract request signature and timestamp
1079                 sig = config['request']['signature']
1080                 timestamp = config['request']['timestamp']
1081
1082                 # Vimeo specific: extract video codec and quality information
1083                 # TODO bind to format param
1084                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1085                 for codec in codecs:
1086                         if codec[0] in config["video"]["files"]:
1087                                 video_codec = codec[0]
1088                                 video_extension = codec[1]
1089                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1090                                 else: quality = 'sd'
1091                                 break
1092                 else:
1093                         self._downloader.trouble(u'ERROR: no known codec found')
1094                         return
1095
1096                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1097                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1098
1099                 return [{
1100                         'id':           video_id,
1101                         'url':          video_url,
1102                         'uploader':     video_uploader,
1103                         'upload_date':  video_upload_date,
1104                         'title':        video_title,
1105                         'ext':          video_extension,
1106                         'thumbnail':    video_thumbnail,
1107                         'description':  video_description,
1108                         'player_url':   None,
1109                 }]
1110
1111
1112 class GenericIE(InfoExtractor):
1113         """Generic last-resort information extractor."""
1114
1115         _VALID_URL = r'.*'
1116         IE_NAME = u'generic'
1117
1118         def __init__(self, downloader=None):
1119                 InfoExtractor.__init__(self, downloader)
1120
1121         def report_download_webpage(self, video_id):
1122                 """Report webpage download."""
1123                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1124                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1125
1126         def report_extraction(self, video_id):
1127                 """Report information extraction."""
1128                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1129
1130         def report_following_redirect(self, new_url):
1131                 """Report information extraction."""
1132                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1133
1134         def _test_redirect(self, url):
1135                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1136                 class HeadRequest(urllib2.Request):
1137                         def get_method(self):
1138                                 return "HEAD"
1139
1140                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1141                         """
1142                         Subclass the HTTPRedirectHandler to make it use our
1143                         HeadRequest also on the redirected URL
1144                         """
1145                         def redirect_request(self, req, fp, code, msg, headers, newurl):
1146                                 if code in (301, 302, 303, 307):
1147                                         newurl = newurl.replace(' ', '%20')
1148                                         newheaders = dict((k,v) for k,v in req.headers.items()
1149                                                                           if k.lower() not in ("content-length", "content-type"))
1150                                         return HeadRequest(newurl,
1151                                                                            headers=newheaders,
1152                                                                            origin_req_host=req.get_origin_req_host(),
1153                                                                            unverifiable=True)
1154                                 else:
1155                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1156
1157                 class HTTPMethodFallback(urllib2.BaseHandler):
1158                         """
1159                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1160                         """
1161                         def http_error_405(self, req, fp, code, msg, headers):
1162                                 fp.read()
1163                                 fp.close()
1164
1165                                 newheaders = dict((k,v) for k,v in req.headers.items()
1166                                                                   if k.lower() not in ("content-length", "content-type"))
1167                                 return self.parent.open(urllib2.Request(req.get_full_url(),
1168                                                                                                  headers=newheaders,
1169                                                                                                  origin_req_host=req.get_origin_req_host(),
1170                                                                                                  unverifiable=True))
1171
1172                 # Build our opener
1173                 opener = urllib2.OpenerDirector()
1174                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1175                                                 HTTPMethodFallback, HEADRedirectHandler,
1176                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1177                         opener.add_handler(handler())
1178
1179                 response = opener.open(HeadRequest(url))
1180                 new_url = response.geturl()
1181
1182                 if url == new_url: return False
1183
1184                 self.report_following_redirect(new_url)
1185                 self._downloader.download([new_url])
1186                 return True
1187
1188         def _real_extract(self, url):
1189                 if self._test_redirect(url): return
1190
1191                 video_id = url.split('/')[-1]
1192                 request = urllib2.Request(url)
1193                 try:
1194                         self.report_download_webpage(video_id)
1195                         webpage = urllib2.urlopen(request).read()
1196                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1197                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1198                         return
1199                 except ValueError, err:
1200                         # since this is the last-resort InfoExtractor, if
1201                         # this error is thrown, it'll be thrown here
1202                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1203                         return
1204
1205                 self.report_extraction(video_id)
1206                 # Start with something easy: JW Player in SWFObject
1207                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1208                 if mobj is None:
1209                         # Broaden the search a little bit
1210                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1211                 if mobj is None:
1212                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1213                         return
1214
1215                 # It's possible that one of the regexes
1216                 # matched, but returned an empty group:
1217                 if mobj.group(1) is None:
1218                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1219                         return
1220
1221                 video_url = urllib.unquote(mobj.group(1))
1222                 video_id = os.path.basename(video_url)
1223
1224                 # here's a fun little line of code for you:
1225                 video_extension = os.path.splitext(video_id)[1][1:]
1226                 video_id = os.path.splitext(video_id)[0]
1227
1228                 # it's tempting to parse this further, but you would
1229                 # have to take into account all the variations like
1230                 #   Video Title - Site Name
1231                 #   Site Name | Video Title
1232                 #   Video Title - Tagline | Site Name
1233                 # and so on and so forth; it's just not practical
1234                 mobj = re.search(r'<title>(.*)</title>', webpage)
1235                 if mobj is None:
1236                         self._downloader.trouble(u'ERROR: unable to extract title')
1237                         return
1238                 video_title = mobj.group(1).decode('utf-8')
1239
1240                 # video uploader is domain name
1241                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1242                 if mobj is None:
1243                         self._downloader.trouble(u'ERROR: unable to extract title')
1244                         return
1245                 video_uploader = mobj.group(1).decode('utf-8')
1246
1247                 return [{
1248                         'id':           video_id.decode('utf-8'),
1249                         'url':          video_url.decode('utf-8'),
1250                         'uploader':     video_uploader,
1251                         'upload_date':  u'NA',
1252                         'title':        video_title,
1253                         'ext':          video_extension.decode('utf-8'),
1254                         'format':       u'NA',
1255                         'player_url':   None,
1256                 }]
1257
1258
1259 class YoutubeSearchIE(InfoExtractor):
1260         """Information Extractor for YouTube search queries."""
1261         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1262         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1263         _max_youtube_results = 1000
1264         IE_NAME = u'youtube:search'
1265
1266         def __init__(self, downloader=None):
1267                 InfoExtractor.__init__(self, downloader)
1268
1269         def report_download_page(self, query, pagenum):
1270                 """Report attempt to download search page with given number."""
1271                 query = query.decode(preferredencoding())
1272                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1273
1274         def _real_extract(self, query):
1275                 mobj = re.match(self._VALID_URL, query)
1276                 if mobj is None:
1277                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1278                         return
1279
1280                 prefix, query = query.split(':')
1281                 prefix = prefix[8:]
1282                 query = query.encode('utf-8')
1283                 if prefix == '':
1284                         self._download_n_results(query, 1)
1285                         return
1286                 elif prefix == 'all':
1287                         self._download_n_results(query, self._max_youtube_results)
1288                         return
1289                 else:
1290                         try:
1291                                 n = long(prefix)
1292                                 if n <= 0:
1293                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1294                                         return
1295                                 elif n > self._max_youtube_results:
1296                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1297                                         n = self._max_youtube_results
1298                                 self._download_n_results(query, n)
1299                                 return
1300                         except ValueError: # parsing prefix as integer fails
1301                                 self._download_n_results(query, 1)
1302                                 return
1303
1304         def _download_n_results(self, query, n):
1305                 """Downloads a specified number of results for a query"""
1306
1307                 video_ids = []
1308                 pagenum = 0
1309                 limit = n
1310
1311                 while (50 * pagenum) < limit:
1312                         self.report_download_page(query, pagenum+1)
1313                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1314                         request = urllib2.Request(result_url)
1315                         try:
1316                                 data = urllib2.urlopen(request).read()
1317                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1318                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1319                                 return
1320                         api_response = json.loads(data)['data']
1321
1322                         new_ids = list(video['id'] for video in api_response['items'])
1323                         video_ids += new_ids
1324
1325                         limit = min(n, api_response['totalItems'])
1326                         pagenum += 1
1327
1328                 if len(video_ids) > n:
1329                         video_ids = video_ids[:n]
1330                 for id in video_ids:
1331                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1332                 return
1333
1334
1335 class GoogleSearchIE(InfoExtractor):
1336         """Information Extractor for Google Video search queries."""
1337         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1338         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1339         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1340         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1341         _max_google_results = 1000
1342         IE_NAME = u'video.google:search'
1343
1344         def __init__(self, downloader=None):
1345                 InfoExtractor.__init__(self, downloader)
1346
1347         def report_download_page(self, query, pagenum):
1348                 """Report attempt to download playlist page with given number."""
1349                 query = query.decode(preferredencoding())
1350                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1351
1352         def _real_extract(self, query):
1353                 mobj = re.match(self._VALID_URL, query)
1354                 if mobj is None:
1355                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1356                         return
1357
1358                 prefix, query = query.split(':')
1359                 prefix = prefix[8:]
1360                 query = query.encode('utf-8')
1361                 if prefix == '':
1362                         self._download_n_results(query, 1)
1363                         return
1364                 elif prefix == 'all':
1365                         self._download_n_results(query, self._max_google_results)
1366                         return
1367                 else:
1368                         try:
1369                                 n = long(prefix)
1370                                 if n <= 0:
1371                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1372                                         return
1373                                 elif n > self._max_google_results:
1374                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1375                                         n = self._max_google_results
1376                                 self._download_n_results(query, n)
1377                                 return
1378                         except ValueError: # parsing prefix as integer fails
1379                                 self._download_n_results(query, 1)
1380                                 return
1381
1382         def _download_n_results(self, query, n):
1383                 """Downloads a specified number of results for a query"""
1384
1385                 video_ids = []
1386                 pagenum = 0
1387
1388                 while True:
1389                         self.report_download_page(query, pagenum)
1390                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1391                         request = urllib2.Request(result_url)
1392                         try:
1393                                 page = urllib2.urlopen(request).read()
1394                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1395                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1396                                 return
1397
1398                         # Extract video identifiers
1399                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1400                                 video_id = mobj.group(1)
1401                                 if video_id not in video_ids:
1402                                         video_ids.append(video_id)
1403                                         if len(video_ids) == n:
1404                                                 # Specified n videos reached
1405                                                 for id in video_ids:
1406                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1407                                                 return
1408
1409                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1410                                 for id in video_ids:
1411                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1412                                 return
1413
1414                         pagenum = pagenum + 1
1415
1416
1417 class YahooSearchIE(InfoExtractor):
1418         """Information Extractor for Yahoo! Video search queries."""
1419         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1420         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1421         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1422         _MORE_PAGES_INDICATOR = r'\s*Next'
1423         _max_yahoo_results = 1000
1424         IE_NAME = u'video.yahoo:search'
1425
1426         def __init__(self, downloader=None):
1427                 InfoExtractor.__init__(self, downloader)
1428
1429         def report_download_page(self, query, pagenum):
1430                 """Report attempt to download playlist page with given number."""
1431                 query = query.decode(preferredencoding())
1432                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1433
1434         def _real_extract(self, query):
1435                 mobj = re.match(self._VALID_URL, query)
1436                 if mobj is None:
1437                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1438                         return
1439
1440                 prefix, query = query.split(':')
1441                 prefix = prefix[8:]
1442                 query = query.encode('utf-8')
1443                 if prefix == '':
1444                         self._download_n_results(query, 1)
1445                         return
1446                 elif prefix == 'all':
1447                         self._download_n_results(query, self._max_yahoo_results)
1448                         return
1449                 else:
1450                         try:
1451                                 n = long(prefix)
1452                                 if n <= 0:
1453                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1454                                         return
1455                                 elif n > self._max_yahoo_results:
1456                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1457                                         n = self._max_yahoo_results
1458                                 self._download_n_results(query, n)
1459                                 return
1460                         except ValueError: # parsing prefix as integer fails
1461                                 self._download_n_results(query, 1)
1462                                 return
1463
1464         def _download_n_results(self, query, n):
1465                 """Downloads a specified number of results for a query"""
1466
1467                 video_ids = []
1468                 already_seen = set()
1469                 pagenum = 1
1470
1471                 while True:
1472                         self.report_download_page(query, pagenum)
1473                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1474                         request = urllib2.Request(result_url)
1475                         try:
1476                                 page = urllib2.urlopen(request).read()
1477                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1478                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1479                                 return
1480
1481                         # Extract video identifiers
1482                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1483                                 video_id = mobj.group(1)
1484                                 if video_id not in already_seen:
1485                                         video_ids.append(video_id)
1486                                         already_seen.add(video_id)
1487                                         if len(video_ids) == n:
1488                                                 # Specified n videos reached
1489                                                 for id in video_ids:
1490                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1491                                                 return
1492
1493                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1494                                 for id in video_ids:
1495                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1496                                 return
1497
1498                         pagenum = pagenum + 1
1499
1500
1501 class YoutubePlaylistIE(InfoExtractor):
1502         """Information Extractor for YouTube playlists."""
1503
1504         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1505         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1506         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=.*?%s'
1507         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1508         IE_NAME = u'youtube:playlist'
1509
1510         def __init__(self, downloader=None):
1511                 InfoExtractor.__init__(self, downloader)
1512
1513         def report_download_page(self, playlist_id, pagenum):
1514                 """Report attempt to download playlist page with given number."""
1515                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1516
1517         def _real_extract(self, url):
1518                 # Extract playlist id
1519                 mobj = re.match(self._VALID_URL, url)
1520                 if mobj is None:
1521                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1522                         return
1523
1524                 # Single video case
1525                 if mobj.group(3) is not None:
1526                         self._downloader.download([mobj.group(3)])
1527                         return
1528
1529                 # Download playlist pages
1530                 # prefix is 'p' as default for playlists but there are other types that need extra care
1531                 playlist_prefix = mobj.group(1)
1532                 if playlist_prefix == 'a':
1533                         playlist_access = 'artist'
1534                 else:
1535                         playlist_prefix = 'p'
1536                         playlist_access = 'view_play_list'
1537                 playlist_id = mobj.group(2)
1538                 video_ids = []
1539                 pagenum = 1
1540
1541                 while True:
1542                         self.report_download_page(playlist_id, pagenum)
1543                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1544                         request = urllib2.Request(url)
1545                         try:
1546                                 page = urllib2.urlopen(request).read()
1547                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1548                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1549                                 return
1550
1551                         # Extract video identifiers
1552                         ids_in_page = []
1553                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1554                                 if mobj.group(1) not in ids_in_page:
1555                                         ids_in_page.append(mobj.group(1))
1556                         video_ids.extend(ids_in_page)
1557
1558                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1559                                 break
1560                         pagenum = pagenum + 1
1561
1562                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1563                 playlistend = self._downloader.params.get('playlistend', -1)
1564                 if playlistend == -1:
1565                         video_ids = video_ids[playliststart:]
1566                 else:
1567                         video_ids = video_ids[playliststart:playlistend]
1568
1569                 for id in video_ids:
1570                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1571                 return
1572
1573
1574 class YoutubeUserIE(InfoExtractor):
1575         """Information Extractor for YouTube users."""
1576
1577         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1578         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1579         _GDATA_PAGE_SIZE = 50
1580         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1581         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1582         IE_NAME = u'youtube:user'
1583
1584         def __init__(self, downloader=None):
1585                 InfoExtractor.__init__(self, downloader)
1586
1587         def report_download_page(self, username, start_index):
1588                 """Report attempt to download user page."""
1589                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1590                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1591
1592         def _real_extract(self, url):
1593                 # Extract username
1594                 mobj = re.match(self._VALID_URL, url)
1595                 if mobj is None:
1596                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1597                         return
1598
1599                 username = mobj.group(1)
1600
1601                 # Download video ids using YouTube Data API. Result size per
1602                 # query is limited (currently to 50 videos) so we need to query
1603                 # page by page until there are no video ids - it means we got
1604                 # all of them.
1605
1606                 video_ids = []
1607                 pagenum = 0
1608
1609                 while True:
1610                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1611                         self.report_download_page(username, start_index)
1612
1613                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1614
1615                         try:
1616                                 page = urllib2.urlopen(request).read()
1617                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1618                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1619                                 return
1620
1621                         # Extract video identifiers
1622                         ids_in_page = []
1623
1624                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1625                                 if mobj.group(1) not in ids_in_page:
1626                                         ids_in_page.append(mobj.group(1))
1627
1628                         video_ids.extend(ids_in_page)
1629
1630                         # A little optimization - if current page is not
1631                         # "full", ie. does not contain PAGE_SIZE video ids then
1632                         # we can assume that this page is the last one - there
1633                         # are no more ids on further pages - no need to query
1634                         # again.
1635
1636                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1637                                 break
1638
1639                         pagenum += 1
1640
1641                 all_ids_count = len(video_ids)
1642                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1643                 playlistend = self._downloader.params.get('playlistend', -1)
1644
1645                 if playlistend == -1:
1646                         video_ids = video_ids[playliststart:]
1647                 else:
1648                         video_ids = video_ids[playliststart:playlistend]
1649
1650                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1651                                 (username, all_ids_count, len(video_ids)))
1652
1653                 for video_id in video_ids:
1654                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1655
1656
1657 class BlipTVUserIE(InfoExtractor):
1658         """Information Extractor for blip.tv users."""
1659
1660         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1661         _PAGE_SIZE = 12
1662         IE_NAME = u'blip.tv:user'
1663
1664         def __init__(self, downloader=None):
1665                 InfoExtractor.__init__(self, downloader)
1666
1667         def report_download_page(self, username, pagenum):
1668                 """Report attempt to download user page."""
1669                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1670                                 (self.IE_NAME, username, pagenum))
1671
1672         def _real_extract(self, url):
1673                 # Extract username
1674                 mobj = re.match(self._VALID_URL, url)
1675                 if mobj is None:
1676                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1677                         return
1678
1679                 username = mobj.group(1)
1680
1681                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1682
1683                 request = urllib2.Request(url)
1684
1685                 try:
1686                         page = urllib2.urlopen(request).read().decode('utf-8')
1687                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1688                         page_base = page_base % mobj.group(1)
1689                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1690                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1691                         return
1692
1693
1694                 # Download video ids using BlipTV Ajax calls. Result size per
1695                 # query is limited (currently to 12 videos) so we need to query
1696                 # page by page until there are no video ids - it means we got
1697                 # all of them.
1698
1699                 video_ids = []
1700                 pagenum = 1
1701
1702                 while True:
1703                         self.report_download_page(username, pagenum)
1704
1705                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1706
1707                         try:
1708                                 page = urllib2.urlopen(request).read().decode('utf-8')
1709                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1710                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1711                                 return
1712
1713                         # Extract video identifiers
1714                         ids_in_page = []
1715
1716                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1717                                 if mobj.group(1) not in ids_in_page:
1718                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1719
1720                         video_ids.extend(ids_in_page)
1721
1722                         # A little optimization - if current page is not
1723                         # "full", ie. does not contain PAGE_SIZE video ids then
1724                         # we can assume that this page is the last one - there
1725                         # are no more ids on further pages - no need to query
1726                         # again.
1727
1728                         if len(ids_in_page) < self._PAGE_SIZE:
1729                                 break
1730
1731                         pagenum += 1
1732
1733                 all_ids_count = len(video_ids)
1734                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1735                 playlistend = self._downloader.params.get('playlistend', -1)
1736
1737                 if playlistend == -1:
1738                         video_ids = video_ids[playliststart:]
1739                 else:
1740                         video_ids = video_ids[playliststart:playlistend]
1741
1742                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1743                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1744
1745                 for video_id in video_ids:
1746                         self._downloader.download([u'http://blip.tv/'+video_id])
1747
1748
1749 class DepositFilesIE(InfoExtractor):
1750         """Information extractor for depositfiles.com"""
1751
1752         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1753         IE_NAME = u'DepositFiles'
1754
1755         def __init__(self, downloader=None):
1756                 InfoExtractor.__init__(self, downloader)
1757
1758         def report_download_webpage(self, file_id):
1759                 """Report webpage download."""
1760                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1761
1762         def report_extraction(self, file_id):
1763                 """Report information extraction."""
1764                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1765
1766         def _real_extract(self, url):
1767                 file_id = url.split('/')[-1]
1768                 # Rebuild url in english locale
1769                 url = 'http://depositfiles.com/en/files/' + file_id
1770
1771                 # Retrieve file webpage with 'Free download' button pressed
1772                 free_download_indication = { 'gateway_result' : '1' }
1773                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1774                 try:
1775                         self.report_download_webpage(file_id)
1776                         webpage = urllib2.urlopen(request).read()
1777                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1778                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1779                         return
1780
1781                 # Search for the real file URL
1782                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1783                 if (mobj is None) or (mobj.group(1) is None):
1784                         # Try to figure out reason of the error.
1785                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1786                         if (mobj is not None) and (mobj.group(1) is not None):
1787                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1788                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1789                         else:
1790                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1791                         return
1792
1793                 file_url = mobj.group(1)
1794                 file_extension = os.path.splitext(file_url)[1][1:]
1795
1796                 # Search for file title
1797                 mobj = re.search(r'<b title="(.*?)">', webpage)
1798                 if mobj is None:
1799                         self._downloader.trouble(u'ERROR: unable to extract title')
1800                         return
1801                 file_title = mobj.group(1).decode('utf-8')
1802
1803                 return [{
1804                         'id':           file_id.decode('utf-8'),
1805                         'url':          file_url.decode('utf-8'),
1806                         'uploader':     u'NA',
1807                         'upload_date':  u'NA',
1808                         'title':        file_title,
1809                         'ext':          file_extension.decode('utf-8'),
1810                         'format':       u'NA',
1811                         'player_url':   None,
1812                 }]
1813
1814
1815 class FacebookIE(InfoExtractor):
1816         """Information Extractor for Facebook"""
1817
1818         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1819         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1820         _NETRC_MACHINE = 'facebook'
1821         _available_formats = ['video', 'highqual', 'lowqual']
1822         _video_extensions = {
1823                 'video': 'mp4',
1824                 'highqual': 'mp4',
1825                 'lowqual': 'mp4',
1826         }
1827         IE_NAME = u'facebook'
1828
1829         def __init__(self, downloader=None):
1830                 InfoExtractor.__init__(self, downloader)
1831
1832         def _reporter(self, message):
1833                 """Add header and report message."""
1834                 self._downloader.to_screen(u'[facebook] %s' % message)
1835
1836         def report_login(self):
1837                 """Report attempt to log in."""
1838                 self._reporter(u'Logging in')
1839
1840         def report_video_webpage_download(self, video_id):
1841                 """Report attempt to download video webpage."""
1842                 self._reporter(u'%s: Downloading video webpage' % video_id)
1843
1844         def report_information_extraction(self, video_id):
1845                 """Report attempt to extract video information."""
1846                 self._reporter(u'%s: Extracting video information' % video_id)
1847
1848         def _parse_page(self, video_webpage):
1849                 """Extract video information from page"""
1850                 # General data
1851                 data = {'title': r'\("video_title", "(.*?)"\)',
1852                         'description': r'<div class="datawrap">(.*?)</div>',
1853                         'owner': r'\("video_owner_name", "(.*?)"\)',
1854                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1855                         }
1856                 video_info = {}
1857                 for piece in data.keys():
1858                         mobj = re.search(data[piece], video_webpage)
1859                         if mobj is not None:
1860                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1861
1862                 # Video urls
1863                 video_urls = {}
1864                 for fmt in self._available_formats:
1865                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1866                         if mobj is not None:
1867                                 # URL is in a Javascript segment inside an escaped Unicode format within
1868                                 # the generally utf-8 page
1869                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1870                 video_info['video_urls'] = video_urls
1871
1872                 return video_info
1873
1874         def _real_initialize(self):
1875                 if self._downloader is None:
1876                         return
1877
1878                 useremail = None
1879                 password = None
1880                 downloader_params = self._downloader.params
1881
1882                 # Attempt to use provided username and password or .netrc data
1883                 if downloader_params.get('username', None) is not None:
1884                         useremail = downloader_params['username']
1885                         password = downloader_params['password']
1886                 elif downloader_params.get('usenetrc', False):
1887                         try:
1888                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1889                                 if info is not None:
1890                                         useremail = info[0]
1891                                         password = info[2]
1892                                 else:
1893                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1894                         except (IOError, netrc.NetrcParseError), err:
1895                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1896                                 return
1897
1898                 if useremail is None:
1899                         return
1900
1901                 # Log in
1902                 login_form = {
1903                         'email': useremail,
1904                         'pass': password,
1905                         'login': 'Log+In'
1906                         }
1907                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1908                 try:
1909                         self.report_login()
1910                         login_results = urllib2.urlopen(request).read()
1911                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1912                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1913                                 return
1914                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1915                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1916                         return
1917
1918         def _real_extract(self, url):
1919                 mobj = re.match(self._VALID_URL, url)
1920                 if mobj is None:
1921                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1922                         return
1923                 video_id = mobj.group('ID')
1924
1925                 # Get video webpage
1926                 self.report_video_webpage_download(video_id)
1927                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1928                 try:
1929                         page = urllib2.urlopen(request)
1930                         video_webpage = page.read()
1931                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1932                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1933                         return
1934
1935                 # Start extracting information
1936                 self.report_information_extraction(video_id)
1937
1938                 # Extract information
1939                 video_info = self._parse_page(video_webpage)
1940
1941                 # uploader
1942                 if 'owner' not in video_info:
1943                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1944                         return
1945                 video_uploader = video_info['owner']
1946
1947                 # title
1948                 if 'title' not in video_info:
1949                         self._downloader.trouble(u'ERROR: unable to extract video title')
1950                         return
1951                 video_title = video_info['title']
1952                 video_title = video_title.decode('utf-8')
1953
1954                 # thumbnail image
1955                 if 'thumbnail' not in video_info:
1956                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1957                         video_thumbnail = ''
1958                 else:
1959                         video_thumbnail = video_info['thumbnail']
1960
1961                 # upload date
1962                 upload_date = u'NA'
1963                 if 'upload_date' in video_info:
1964                         upload_time = video_info['upload_date']
1965                         timetuple = email.utils.parsedate_tz(upload_time)
1966                         if timetuple is not None:
1967                                 try:
1968                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1969                                 except:
1970                                         pass
1971
1972                 # description
1973                 video_description = video_info.get('description', 'No description available.')
1974
1975                 url_map = video_info['video_urls']
1976                 if len(url_map.keys()) > 0:
1977                         # Decide which formats to download
1978                         req_format = self._downloader.params.get('format', None)
1979                         format_limit = self._downloader.params.get('format_limit', None)
1980
1981                         if format_limit is not None and format_limit in self._available_formats:
1982                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1983                         else:
1984                                 format_list = self._available_formats
1985                         existing_formats = [x for x in format_list if x in url_map]
1986                         if len(existing_formats) == 0:
1987                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1988                                 return
1989                         if req_format is None:
1990                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1991                         elif req_format == 'worst':
1992                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1993                         elif req_format == '-1':
1994                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1995                         else:
1996                                 # Specific format
1997                                 if req_format not in url_map:
1998                                         self._downloader.trouble(u'ERROR: requested format not available')
1999                                         return
2000                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2001
2002                 results = []
2003                 for format_param, video_real_url in video_url_list:
2004                         # Extension
2005                         video_extension = self._video_extensions.get(format_param, 'mp4')
2006
2007                         results.append({
2008                                 'id':           video_id.decode('utf-8'),
2009                                 'url':          video_real_url.decode('utf-8'),
2010                                 'uploader':     video_uploader.decode('utf-8'),
2011                                 'upload_date':  upload_date,
2012                                 'title':        video_title,
2013                                 'ext':          video_extension.decode('utf-8'),
2014                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2015                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2016                                 'description':  video_description.decode('utf-8'),
2017                                 'player_url':   None,
2018                         })
2019                 return results
2020
2021 class BlipTVIE(InfoExtractor):
2022         """Information extractor for blip.tv"""
2023
2024         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2025         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2026         IE_NAME = u'blip.tv'
2027
2028         def report_extraction(self, file_id):
2029                 """Report information extraction."""
2030                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2031
2032         def report_direct_download(self, title):
2033                 """Report information extraction."""
2034                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2035
2036         def _real_extract(self, url):
2037                 mobj = re.match(self._VALID_URL, url)
2038                 if mobj is None:
2039                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2040                         return
2041
2042                 if '?' in url:
2043                         cchar = '&'
2044                 else:
2045                         cchar = '?'
2046                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2047                 request = urllib2.Request(json_url.encode('utf-8'))
2048                 self.report_extraction(mobj.group(1))
2049                 info = None
2050                 try:
2051                         urlh = urllib2.urlopen(request)
2052                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2053                                 basename = url.split('/')[-1]
2054                                 title,ext = os.path.splitext(basename)
2055                                 title = title.decode('UTF-8')
2056                                 ext = ext.replace('.', '')
2057                                 self.report_direct_download(title)
2058                                 info = {
2059                                         'id': title,
2060                                         'url': url,
2061                                         'title': title,
2062                                         'ext': ext,
2063                                         'urlhandle': urlh
2064                                 }
2065                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2066                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2067                         return
2068                 if info is None: # Regular URL
2069                         try:
2070                                 json_code = urlh.read()
2071                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2072                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2073                                 return
2074
2075                         try:
2076                                 json_data = json.loads(json_code)
2077                                 if 'Post' in json_data:
2078                                         data = json_data['Post']
2079                                 else:
2080                                         data = json_data
2081
2082                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2083                                 video_url = data['media']['url']
2084                                 umobj = re.match(self._URL_EXT, video_url)
2085                                 if umobj is None:
2086                                         raise ValueError('Can not determine filename extension')
2087                                 ext = umobj.group(1)
2088
2089                                 info = {
2090                                         'id': data['item_id'],
2091                                         'url': video_url,
2092                                         'uploader': data['display_name'],
2093                                         'upload_date': upload_date,
2094                                         'title': data['title'],
2095                                         'ext': ext,
2096                                         'format': data['media']['mimeType'],
2097                                         'thumbnail': data['thumbnailUrl'],
2098                                         'description': data['description'],
2099                                         'player_url': data['embedUrl']
2100                                 }
2101                         except (ValueError,KeyError), err:
2102                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2103                                 return
2104
2105                 std_headers['User-Agent'] = 'iTunes/10.6.1'
2106                 return [info]
2107
2108
2109 class MyVideoIE(InfoExtractor):
2110         """Information Extractor for myvideo.de."""
2111
2112         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2113         IE_NAME = u'myvideo'
2114
2115         def __init__(self, downloader=None):
2116                 InfoExtractor.__init__(self, downloader)
2117
2118         def report_download_webpage(self, video_id):
2119                 """Report webpage download."""
2120                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2121
2122         def report_extraction(self, video_id):
2123                 """Report information extraction."""
2124                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2125
2126         def _real_extract(self,url):
2127                 mobj = re.match(self._VALID_URL, url)
2128                 if mobj is None:
2129                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2130                         return
2131
2132                 video_id = mobj.group(1)
2133
2134                 # Get video webpage
2135                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2136                 try:
2137                         self.report_download_webpage(video_id)
2138                         webpage = urllib2.urlopen(request).read()
2139                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2140                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2141                         return
2142
2143                 self.report_extraction(video_id)
2144                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2145                                  webpage)
2146                 if mobj is None:
2147                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2148                         return
2149                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2150
2151                 mobj = re.search('<title>([^<]+)</title>', webpage)
2152                 if mobj is None:
2153                         self._downloader.trouble(u'ERROR: unable to extract title')
2154                         return
2155
2156                 video_title = mobj.group(1)
2157
2158                 return [{
2159                         'id':           video_id,
2160                         'url':          video_url,
2161                         'uploader':     u'NA',
2162                         'upload_date':  u'NA',
2163                         'title':        video_title,
2164                         'ext':          u'flv',
2165                         'format':       u'NA',
2166                         'player_url':   None,
2167                 }]
2168
2169 class ComedyCentralIE(InfoExtractor):
2170         """Information extractor for The Daily Show and Colbert Report """
2171
2172         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2173         IE_NAME = u'comedycentral'
2174
2175         def report_extraction(self, episode_id):
2176                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2177
2178         def report_config_download(self, episode_id):
2179                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2180
2181         def report_index_download(self, episode_id):
2182                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2183
2184         def report_player_url(self, episode_id):
2185                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2186
2187         def _real_extract(self, url):
2188                 mobj = re.match(self._VALID_URL, url)
2189                 if mobj is None:
2190                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2191                         return
2192
2193                 if mobj.group('shortname'):
2194                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2195                                 url = u'http://www.thedailyshow.com/full-episodes/'
2196                         else:
2197                                 url = u'http://www.colbertnation.com/full-episodes/'
2198                         mobj = re.match(self._VALID_URL, url)
2199                         assert mobj is not None
2200
2201                 dlNewest = not mobj.group('episode')
2202                 if dlNewest:
2203                         epTitle = mobj.group('showname')
2204                 else:
2205                         epTitle = mobj.group('episode')
2206
2207                 req = urllib2.Request(url)
2208                 self.report_extraction(epTitle)
2209                 try:
2210                         htmlHandle = urllib2.urlopen(req)
2211                         html = htmlHandle.read()
2212                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2213                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2214                         return
2215                 if dlNewest:
2216                         url = htmlHandle.geturl()
2217                         mobj = re.match(self._VALID_URL, url)
2218                         if mobj is None:
2219                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2220                                 return
2221                         if mobj.group('episode') == '':
2222                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2223                                 return
2224                         epTitle = mobj.group('episode')
2225
2226                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2227                 if len(mMovieParams) == 0:
2228                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2229                         return
2230
2231                 playerUrl_raw = mMovieParams[0][0]
2232                 self.report_player_url(epTitle)
2233                 try:
2234                         urlHandle = urllib2.urlopen(playerUrl_raw)
2235                         playerUrl = urlHandle.geturl()
2236                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2237                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2238                         return
2239
2240                 uri = mMovieParams[0][1]
2241                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2242                 self.report_index_download(epTitle)
2243                 try:
2244                         indexXml = urllib2.urlopen(indexUrl).read()
2245                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2246                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2247                         return
2248
2249                 results = []
2250
2251                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2252                 itemEls = idoc.findall('.//item')
2253                 for itemEl in itemEls:
2254                         mediaId = itemEl.findall('./guid')[0].text
2255                         shortMediaId = mediaId.split(':')[-1]
2256                         showId = mediaId.split(':')[-2].replace('.com', '')
2257                         officialTitle = itemEl.findall('./title')[0].text
2258                         officialDate = itemEl.findall('./pubDate')[0].text
2259
2260                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2261                                                 urllib.urlencode({'uri': mediaId}))
2262                         configReq = urllib2.Request(configUrl)
2263                         self.report_config_download(epTitle)
2264                         try:
2265                                 configXml = urllib2.urlopen(configReq).read()
2266                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2267                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2268                                 return
2269
2270                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2271                         turls = []
2272                         for rendition in cdoc.findall('.//rendition'):
2273                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2274                                 turls.append(finfo)
2275
2276                         if len(turls) == 0:
2277                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2278                                 continue
2279
2280                         # For now, just pick the highest bitrate
2281                         format,video_url = turls[-1]
2282
2283                         effTitle = showId + u'-' + epTitle
2284                         info = {
2285                                 'id': shortMediaId,
2286                                 'url': video_url,
2287                                 'uploader': showId,
2288                                 'upload_date': officialDate,
2289                                 'title': effTitle,
2290                                 'ext': 'mp4',
2291                                 'format': format,
2292                                 'thumbnail': None,
2293                                 'description': officialTitle,
2294                                 'player_url': playerUrl
2295                         }
2296
2297                         results.append(info)
2298
2299                 return results
2300
2301
2302 class EscapistIE(InfoExtractor):
2303         """Information extractor for The Escapist """
2304
2305         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2306         IE_NAME = u'escapist'
2307
2308         def report_extraction(self, showName):
2309                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2310
2311         def report_config_download(self, showName):
2312                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2313
2314         def _real_extract(self, url):
2315                 mobj = re.match(self._VALID_URL, url)
2316                 if mobj is None:
2317                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2318                         return
2319                 showName = mobj.group('showname')
2320                 videoId = mobj.group('episode')
2321
2322                 self.report_extraction(showName)
2323                 try:
2324                         webPage = urllib2.urlopen(url)
2325                         webPageBytes = webPage.read()
2326                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2327                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2328                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2329                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2330                         return
2331
2332                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2333                 description = unescapeHTML(descMatch.group(1))
2334                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2335                 imgUrl = unescapeHTML(imgMatch.group(1))
2336                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2337                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2338                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2339                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2340
2341                 self.report_config_download(showName)
2342                 try:
2343                         configJSON = urllib2.urlopen(configUrl).read()
2344                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2345                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2346                         return
2347
2348                 # Technically, it's JavaScript, not JSON
2349                 configJSON = configJSON.replace("'", '"')
2350
2351                 try:
2352                         config = json.loads(configJSON)
2353                 except (ValueError,), err:
2354                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2355                         return
2356
2357                 playlist = config['playlist']
2358                 videoUrl = playlist[1]['url']
2359
2360                 info = {
2361                         'id': videoId,
2362                         'url': videoUrl,
2363                         'uploader': showName,
2364                         'upload_date': None,
2365                         'title': showName,
2366                         'ext': 'flv',
2367                         'format': 'flv',
2368                         'thumbnail': imgUrl,
2369                         'description': description,
2370                         'player_url': playerUrl,
2371                 }
2372
2373                 return [info]
2374
2375
2376 class CollegeHumorIE(InfoExtractor):
2377         """Information extractor for collegehumor.com"""
2378
2379         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2380         IE_NAME = u'collegehumor'
2381
2382         def report_webpage(self, video_id):
2383                 """Report information extraction."""
2384                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2385
2386         def report_extraction(self, video_id):
2387                 """Report information extraction."""
2388                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2389
2390         def _real_extract(self, url):
2391                 mobj = re.match(self._VALID_URL, url)
2392                 if mobj is None:
2393                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2394                         return
2395                 video_id = mobj.group('videoid')
2396
2397                 self.report_webpage(video_id)
2398                 request = urllib2.Request(url)
2399                 try:
2400                         webpage = urllib2.urlopen(request).read()
2401                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2402                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2403                         return
2404
2405                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2406                 if m is None:
2407                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2408                         return
2409                 internal_video_id = m.group('internalvideoid')
2410
2411                 info = {
2412                         'id': video_id,
2413                         'internal_id': internal_video_id,
2414                 }
2415
2416                 self.report_extraction(video_id)
2417                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2418                 try:
2419                         metaXml = urllib2.urlopen(xmlUrl).read()
2420                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2421                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2422                         return
2423
2424                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2425                 try:
2426                         videoNode = mdoc.findall('./video')[0]
2427                         info['description'] = videoNode.findall('./description')[0].text
2428                         info['title'] = videoNode.findall('./caption')[0].text
2429                         info['url'] = videoNode.findall('./file')[0].text
2430                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2431                         info['ext'] = info['url'].rpartition('.')[2]
2432                         info['format'] = info['ext']
2433                 except IndexError:
2434                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2435                         return
2436
2437                 return [info]
2438
2439
2440 class XVideosIE(InfoExtractor):
2441         """Information extractor for xvideos.com"""
2442
2443         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2444         IE_NAME = u'xvideos'
2445
2446         def report_webpage(self, video_id):
2447                 """Report information extraction."""
2448                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2449
2450         def report_extraction(self, video_id):
2451                 """Report information extraction."""
2452                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2453
2454         def _real_extract(self, url):
2455                 mobj = re.match(self._VALID_URL, url)
2456                 if mobj is None:
2457                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2458                         return
2459                 video_id = mobj.group(1).decode('utf-8')
2460
2461                 self.report_webpage(video_id)
2462
2463                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2464                 try:
2465                         webpage = urllib2.urlopen(request).read()
2466                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2467                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2468                         return
2469
2470                 self.report_extraction(video_id)
2471
2472
2473                 # Extract video URL
2474                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2475                 if mobj is None:
2476                         self._downloader.trouble(u'ERROR: unable to extract video url')
2477                         return
2478                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2479
2480
2481                 # Extract title
2482                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2483                 if mobj is None:
2484                         self._downloader.trouble(u'ERROR: unable to extract video title')
2485                         return
2486                 video_title = mobj.group(1).decode('utf-8')
2487
2488
2489                 # Extract video thumbnail
2490                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2491                 if mobj is None:
2492                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2493                         return
2494                 video_thumbnail = mobj.group(0).decode('utf-8')
2495
2496                 info = {
2497                         'id': video_id,
2498                         'url': video_url,
2499                         'uploader': None,
2500                         'upload_date': None,
2501                         'title': video_title,
2502                         'ext': 'flv',
2503                         'format': 'flv',
2504                         'thumbnail': video_thumbnail,
2505                         'description': None,
2506                         'player_url': None,
2507                 }
2508
2509                 return [info]
2510
2511
2512 class SoundcloudIE(InfoExtractor):
2513         """Information extractor for soundcloud.com
2514            To access the media, the uid of the song and a stream token
2515            must be extracted from the page source and the script must make
2516            a request to media.soundcloud.com/crossdomain.xml. Then
2517            the media can be grabbed by requesting from an url composed
2518            of the stream token and uid
2519          """
2520
2521         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2522         IE_NAME = u'soundcloud'
2523
2524         def __init__(self, downloader=None):
2525                 InfoExtractor.__init__(self, downloader)
2526
2527         def report_webpage(self, video_id):
2528                 """Report information extraction."""
2529                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2530
2531         def report_extraction(self, video_id):
2532                 """Report information extraction."""
2533                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2534
2535         def _real_extract(self, url):
2536                 mobj = re.match(self._VALID_URL, url)
2537                 if mobj is None:
2538                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2539                         return
2540
2541                 # extract uploader (which is in the url)
2542                 uploader = mobj.group(1).decode('utf-8')
2543                 # extract simple title (uploader + slug of song title)
2544                 slug_title =  mobj.group(2).decode('utf-8')
2545                 simple_title = uploader + u'-' + slug_title
2546
2547                 self.report_webpage('%s/%s' % (uploader, slug_title))
2548
2549                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2550                 try:
2551                         webpage = urllib2.urlopen(request).read()
2552                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2553                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2554                         return
2555
2556                 self.report_extraction('%s/%s' % (uploader, slug_title))
2557
2558                 # extract uid and stream token that soundcloud hands out for access
2559                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2560                 if mobj:
2561                         video_id = mobj.group(1)
2562                         stream_token = mobj.group(2)
2563
2564                 # extract unsimplified title
2565                 mobj = re.search('"title":"(.*?)",', webpage)
2566                 if mobj:
2567                         title = mobj.group(1).decode('utf-8')
2568                 else:
2569                         title = simple_title
2570
2571                 # construct media url (with uid/token)
2572                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2573                 mediaURL = mediaURL % (video_id, stream_token)
2574
2575                 # description
2576                 description = u'No description available'
2577                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2578                 if mobj:
2579                         description = mobj.group(1)
2580
2581                 # upload date
2582                 upload_date = None
2583                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2584                 if mobj:
2585                         try:
2586                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2587                         except Exception, e:
2588                                 self._downloader.to_stderr(str(e))
2589
2590                 # for soundcloud, a request to a cross domain is required for cookies
2591                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2592
2593                 return [{
2594                         'id':           video_id.decode('utf-8'),
2595                         'url':          mediaURL,
2596                         'uploader':     uploader.decode('utf-8'),
2597                         'upload_date':  upload_date,
2598                         'title':        title,
2599                         'ext':          u'mp3',
2600                         'format':       u'NA',
2601                         'player_url':   None,
2602                         'description': description.decode('utf-8')
2603                 }]
2604
2605
2606 class InfoQIE(InfoExtractor):
2607         """Information extractor for infoq.com"""
2608
2609         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2610         IE_NAME = u'infoq'
2611
2612         def report_webpage(self, video_id):
2613                 """Report information extraction."""
2614                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2615
2616         def report_extraction(self, video_id):
2617                 """Report information extraction."""
2618                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2619
2620         def _real_extract(self, url):
2621                 mobj = re.match(self._VALID_URL, url)
2622                 if mobj is None:
2623                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2624                         return
2625
2626                 self.report_webpage(url)
2627
2628                 request = urllib2.Request(url)
2629                 try:
2630                         webpage = urllib2.urlopen(request).read()
2631                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2632                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2633                         return
2634
2635                 self.report_extraction(url)
2636
2637
2638                 # Extract video URL
2639                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2640                 if mobj is None:
2641                         self._downloader.trouble(u'ERROR: unable to extract video url')
2642                         return
2643                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2644
2645
2646                 # Extract title
2647                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2648                 if mobj is None:
2649                         self._downloader.trouble(u'ERROR: unable to extract video title')
2650                         return
2651                 video_title = mobj.group(1).decode('utf-8')
2652
2653                 # Extract description
2654                 video_description = u'No description available.'
2655                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2656                 if mobj is not None:
2657                         video_description = mobj.group(1).decode('utf-8')
2658
2659                 video_filename = video_url.split('/')[-1]
2660                 video_id, extension = video_filename.split('.')
2661
2662                 info = {
2663                         'id': video_id,
2664                         'url': video_url,
2665                         'uploader': None,
2666                         'upload_date': None,
2667                         'title': video_title,
2668                         'ext': extension,
2669                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2670                         'thumbnail': None,
2671                         'description': video_description,
2672                         'player_url': None,
2673                 }
2674
2675                 return [info]
2676
2677 class MixcloudIE(InfoExtractor):
2678         """Information extractor for www.mixcloud.com"""
2679         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2680         IE_NAME = u'mixcloud'
2681
2682         def __init__(self, downloader=None):
2683                 InfoExtractor.__init__(self, downloader)
2684
2685         def report_download_json(self, file_id):
2686                 """Report JSON download."""
2687                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2688
2689         def report_extraction(self, file_id):
2690                 """Report information extraction."""
2691                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2692
2693         def get_urls(self, jsonData, fmt, bitrate='best'):
2694                 """Get urls from 'audio_formats' section in json"""
2695                 file_url = None
2696                 try:
2697                         bitrate_list = jsonData[fmt]
2698                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2699                                 bitrate = max(bitrate_list) # select highest
2700
2701                         url_list = jsonData[fmt][bitrate]
2702                 except TypeError: # we have no bitrate info.
2703                         url_list = jsonData[fmt]
2704                 return url_list
2705
2706         def check_urls(self, url_list):
2707                 """Returns 1st active url from list"""
2708                 for url in url_list:
2709                         try:
2710                                 urllib2.urlopen(url)
2711                                 return url
2712                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2713                                 url = None
2714
2715                 return None
2716
2717         def _print_formats(self, formats):
2718                 print 'Available formats:'
2719                 for fmt in formats.keys():
2720                         for b in formats[fmt]:
2721                                 try:
2722                                         ext = formats[fmt][b][0]
2723                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2724                                 except TypeError: # we have no bitrate info
2725                                         ext = formats[fmt][0]
2726                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2727                                         break
2728
2729         def _real_extract(self, url):
2730                 mobj = re.match(self._VALID_URL, url)
2731                 if mobj is None:
2732                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2733                         return
2734                 # extract uploader & filename from url
2735                 uploader = mobj.group(1).decode('utf-8')
2736                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2737
2738                 # construct API request
2739                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2740                 # retrieve .json file with links to files
2741                 request = urllib2.Request(file_url)
2742                 try:
2743                         self.report_download_json(file_url)
2744                         jsonData = urllib2.urlopen(request).read()
2745                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2746                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2747                         return
2748
2749                 # parse JSON
2750                 json_data = json.loads(jsonData)
2751                 player_url = json_data['player_swf_url']
2752                 formats = dict(json_data['audio_formats'])
2753
2754                 req_format = self._downloader.params.get('format', None)
2755                 bitrate = None
2756
2757                 if self._downloader.params.get('listformats', None):
2758                         self._print_formats(formats)
2759                         return
2760
2761                 if req_format is None or req_format == 'best':
2762                         for format_param in formats.keys():
2763                                 url_list = self.get_urls(formats, format_param)
2764                                 # check urls
2765                                 file_url = self.check_urls(url_list)
2766                                 if file_url is not None:
2767                                         break # got it!
2768                 else:
2769                         if req_format not in formats.keys():
2770                                 self._downloader.trouble(u'ERROR: format is not available')
2771                                 return
2772
2773                         url_list = self.get_urls(formats, req_format)
2774                         file_url = self.check_urls(url_list)
2775                         format_param = req_format
2776
2777                 return [{
2778                         'id': file_id.decode('utf-8'),
2779                         'url': file_url.decode('utf-8'),
2780                         'uploader':     uploader.decode('utf-8'),
2781                         'upload_date': u'NA',
2782                         'title': json_data['name'],
2783                         'ext': file_url.split('.')[-1].decode('utf-8'),
2784                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2785                         'thumbnail': json_data['thumbnail_url'],
2786                         'description': json_data['description'],
2787                         'player_url': player_url.decode('utf-8'),
2788                 }]
2789
2790 class StanfordOpenClassroomIE(InfoExtractor):
2791         """Information extractor for Stanford's Open ClassRoom"""
2792
2793         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2794         IE_NAME = u'stanfordoc'
2795
2796         def report_download_webpage(self, objid):
2797                 """Report information extraction."""
2798                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2799
2800         def report_extraction(self, video_id):
2801                 """Report information extraction."""
2802                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2803
2804         def _real_extract(self, url):
2805                 mobj = re.match(self._VALID_URL, url)
2806                 if mobj is None:
2807                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2808                         return
2809
2810                 if mobj.group('course') and mobj.group('video'): # A specific video
2811                         course = mobj.group('course')
2812                         video = mobj.group('video')
2813                         info = {
2814                                 'id': course + '_' + video,
2815                         }
2816
2817                         self.report_extraction(info['id'])
2818                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2819                         xmlUrl = baseUrl + video + '.xml'
2820                         try:
2821                                 metaXml = urllib2.urlopen(xmlUrl).read()
2822                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2823                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2824                                 return
2825                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2826                         try:
2827                                 info['title'] = mdoc.findall('./title')[0].text
2828                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2829                         except IndexError:
2830                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2831                                 return
2832                         info['ext'] = info['url'].rpartition('.')[2]
2833                         info['format'] = info['ext']
2834                         return [info]
2835                 elif mobj.group('course'): # A course page
2836                         course = mobj.group('course')
2837                         info = {
2838                                 'id': course,
2839                                 'type': 'playlist',
2840                         }
2841
2842                         self.report_download_webpage(info['id'])
2843                         try:
2844                                 coursepage = urllib2.urlopen(url).read()
2845                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2846                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2847                                 return
2848
2849                         m = re.search('<h1>([^<]+)</h1>', coursepage)
2850                         if m:
2851                                 info['title'] = unescapeHTML(m.group(1))
2852                         else:
2853                                 info['title'] = info['id']
2854
2855                         m = re.search('<description>([^<]+)</description>', coursepage)
2856                         if m:
2857                                 info['description'] = unescapeHTML(m.group(1))
2858
2859                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2860                         info['list'] = [
2861                                 {
2862                                         'type': 'reference',
2863                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2864                                 }
2865                                         for vpage in links]
2866                         results = []
2867                         for entry in info['list']:
2868                                 assert entry['type'] == 'reference'
2869                                 results += self.extract(entry['url'])
2870                         return results
2871
2872                 else: # Root page
2873                         info = {
2874                                 'id': 'Stanford OpenClassroom',
2875                                 'type': 'playlist',
2876                         }
2877
2878                         self.report_download_webpage(info['id'])
2879                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2880                         try:
2881                                 rootpage = urllib2.urlopen(rootURL).read()
2882                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2883                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2884                                 return
2885
2886                         info['title'] = info['id']
2887
2888                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2889                         info['list'] = [
2890                                 {
2891                                         'type': 'reference',
2892                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2893                                 }
2894                                         for cpage in links]
2895
2896                         results = []
2897                         for entry in info['list']:
2898                                 assert entry['type'] == 'reference'
2899                                 results += self.extract(entry['url'])
2900                         return results
2901
2902 class MTVIE(InfoExtractor):
2903         """Information extractor for MTV.com"""
2904
2905         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2906         IE_NAME = u'mtv'
2907
2908         def report_webpage(self, video_id):
2909                 """Report information extraction."""
2910                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2911
2912         def report_extraction(self, video_id):
2913                 """Report information extraction."""
2914                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2915
2916         def _real_extract(self, url):
2917                 mobj = re.match(self._VALID_URL, url)
2918                 if mobj is None:
2919                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2920                         return
2921                 if not mobj.group('proto'):
2922                         url = 'http://' + url
2923                 video_id = mobj.group('videoid')
2924                 self.report_webpage(video_id)
2925
2926                 request = urllib2.Request(url)
2927                 try:
2928                         webpage = urllib2.urlopen(request).read()
2929                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2930                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2931                         return
2932
2933                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2934                 if mobj is None:
2935                         self._downloader.trouble(u'ERROR: unable to extract song name')
2936                         return
2937                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2938                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2939                 if mobj is None:
2940                         self._downloader.trouble(u'ERROR: unable to extract performer')
2941                         return
2942                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2943                 video_title = performer + ' - ' + song_name
2944
2945                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2946                 if mobj is None:
2947                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2948                         return
2949                 mtvn_uri = mobj.group(1)
2950
2951                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2952                 if mobj is None:
2953                         self._downloader.trouble(u'ERROR: unable to extract content id')
2954                         return
2955                 content_id = mobj.group(1)
2956
2957                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2958                 self.report_extraction(video_id)
2959                 request = urllib2.Request(videogen_url)
2960                 try:
2961                         metadataXml = urllib2.urlopen(request).read()
2962                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2963                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2964                         return
2965
2966                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2967                 renditions = mdoc.findall('.//rendition')
2968
2969                 # For now, always pick the highest quality.
2970                 rendition = renditions[-1]
2971
2972                 try:
2973                         _,_,ext = rendition.attrib['type'].partition('/')
2974                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2975                         video_url = rendition.find('./src').text
2976                 except KeyError:
2977                         self._downloader.trouble('Invalid rendition field.')
2978                         return
2979
2980                 info = {
2981                         'id': video_id,
2982                         'url': video_url,
2983                         'uploader': performer,
2984                         'title': video_title,
2985                         'ext': ext,
2986                         'format': format,
2987                 }
2988
2989                 return [info]
2990
2991
2992 class YoukuIE(InfoExtractor):
2993
2994         _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2995         IE_NAME = u'Youku'
2996
2997         def __init__(self, downloader=None):
2998                 InfoExtractor.__init__(self, downloader)
2999
3000         def report_download_webpage(self, file_id):
3001                 """Report webpage download."""
3002                 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3003
3004         def report_extraction(self, file_id):
3005                 """Report information extraction."""
3006                 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
3007
3008         def _gen_sid(self):
3009                 nowTime = int(time.time() * 1000)
3010                 random1 = random.randint(1000,1998)
3011                 random2 = random.randint(1000,9999)
3012
3013                 return "%d%d%d" %(nowTime,random1,random2)
3014
3015         def _get_file_ID_mix_string(self, seed):
3016                 mixed = []
3017                 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3018                 seed = float(seed)
3019                 for i in range(len(source)):
3020                         seed  =  (seed * 211 + 30031 ) % 65536
3021                         index  =  math.floor(seed / 65536 * len(source) )
3022                         mixed.append(source[int(index)])
3023                         source.remove(source[int(index)])
3024                 #return ''.join(mixed)
3025                 return mixed
3026
3027         def _get_file_id(self, fileId, seed):
3028                 mixed = self._get_file_ID_mix_string(seed)
3029                 ids = fileId.split('*')
3030                 realId = []
3031                 for ch in ids:
3032                         if ch:
3033                                 realId.append(mixed[int(ch)])
3034                 return ''.join(realId)
3035
3036         def _real_extract(self, url):
3037                 mobj = re.match(self._VALID_URL, url)
3038                 if mobj is None:
3039                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3040                         return
3041                 video_id = mobj.group('ID')
3042
3043                 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3044
3045                 request = urllib2.Request(info_url, None, std_headers)
3046                 try:
3047                         self.report_download_webpage(video_id)
3048                         jsondata = urllib2.urlopen(request).read()
3049                 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3050                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3051                         return
3052
3053                 self.report_extraction(video_id)
3054                 try:
3055                         config = json.loads(jsondata)
3056
3057                         video_title =  config['data'][0]['title']
3058                         seed = config['data'][0]['seed']
3059
3060                         format = self._downloader.params.get('format', None)
3061                         supported_format = config['data'][0]['streamfileids'].keys()
3062
3063                         if format is None or format == 'best':
3064                                 if 'hd2' in supported_format:
3065                                         format = 'hd2'
3066                                 else:
3067                                         format = 'flv'
3068                                 ext = u'flv'
3069                         elif format == 'worst':
3070                                 format = 'mp4'
3071                                 ext = u'mp4'
3072                         else:
3073                                 format = 'flv'
3074                                 ext = u'flv'
3075
3076
3077                         fileid = config['data'][0]['streamfileids'][format]
3078                         seg_number = len(config['data'][0]['segs'][format])
3079
3080                         keys=[]
3081                         for i in xrange(seg_number):
3082                                 keys.append(config['data'][0]['segs'][format][i]['k'])
3083
3084                         #TODO check error
3085                         #youku only could be viewed from mainland china
3086                 except:
3087                         self._downloader.trouble(u'ERROR: unable to extract info section')
3088                         return
3089
3090                 files_info=[]
3091                 sid = self._gen_sid()
3092                 fileid = self._get_file_id(fileid, seed)
3093
3094                 #column 8,9 of fileid represent the segment number
3095                 #fileid[7:9] should be changed
3096                 for index, key in enumerate(keys):
3097
3098                         temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3099                         download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3100
3101                         info = {
3102                                 'id': '%s_part%02d' % (video_id, index),
3103                                 'url': download_url,
3104                                 'uploader': None,
3105                                 'title': video_title,
3106                                 'ext': ext,
3107                                 'format': u'NA'
3108                         }
3109                         files_info.append(info)
3110
3111                 return files_info
3112
3113
3114 class XNXXIE(InfoExtractor):
3115         """Information extractor for xnxx.com"""
3116
3117         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3118         IE_NAME = u'xnxx'
3119         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3120         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3121         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3122
3123         def report_webpage(self, video_id):
3124                 """Report information extraction"""
3125                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3126
3127         def report_extraction(self, video_id):
3128                 """Report information extraction"""
3129                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3130
3131         def _real_extract(self, url):
3132                 mobj = re.match(self._VALID_URL, url)
3133                 if mobj is None:
3134                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3135                         return
3136                 video_id = mobj.group(1).decode('utf-8')
3137
3138                 self.report_webpage(video_id)
3139
3140                 # Get webpage content
3141                 try:
3142                         webpage = urllib2.urlopen(url).read()
3143                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3144                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3145                         return
3146
3147                 result = re.search(self.VIDEO_URL_RE, webpage)
3148                 if result is None:
3149                         self._downloader.trouble(u'ERROR: unable to extract video url')
3150                         return
3151                 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3152
3153                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3154                 if result is None:
3155                         self._downloader.trouble(u'ERROR: unable to extract video title')
3156                         return
3157                 video_title = result.group(1).decode('utf-8')
3158
3159                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3160                 if result is None:
3161                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3162                         return
3163                 video_thumbnail = result.group(1).decode('utf-8')
3164
3165                 info = {'id': video_id,
3166                                 'url': video_url,
3167                                 'uploader': None,
3168                                 'upload_date': None,
3169                                 'title': video_title,
3170                                 'ext': 'flv',
3171                                 'format': 'flv',
3172                                 'thumbnail': video_thumbnail,
3173                                 'description': None,
3174                                 'player_url': None}
3175
3176                 return [info]