Merge branch 'next-url'
[youtube-dl] / youtube_dl / __init__.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# People who have contributed code to youtube-dl.
__authors__  = (
	'Ricardo Garcia Gonzalez',
	'Danny Colligan',
	'Benjamin Johnson',
	'Vasyl\' Vavrychuk',
	'Witold Baryluk',
	'Paweł Paprota',
	'Gergely Imreh',
	'Rogério Brito',
	'Philipp Hagemeister',
	'Sören Schulze',
	'Kevin Ngo',
	'Ori Avtalion',
	'shizeeg',
	'Filippo Valsorda',
	)

__license__ = 'Public Domain'
# Release identifier in YYYY.MM.DD form.
__version__ = '2012.02.27'

# Location of the latest released script; presumably fetched by the
# self-update code (outside this chunk) — TODO confirm against that routine.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
26
27 import cookielib
28 import datetime
29 import getpass
30 import gzip
31 import htmlentitydefs
32 import HTMLParser
33 import httplib
34 import locale
35 import math
36 import netrc
37 import optparse
38 import os
39 import os.path
40 import re
41 import shlex
42 import socket
43 import string
44 import subprocess
45 import sys
46 import time
47 import urllib
48 import urllib2
49 import warnings
50 import zlib
51
52 if os.name == 'nt':
53         import ctypes
54
55 try:
56         import email.utils
57 except ImportError: # Python 2.4
58         import email.Utils
59 try:
60         import cStringIO as StringIO
61 except ImportError:
62         import StringIO
63
64 # parse_qs was moved from the cgi module to the urlparse module recently.
65 try:
66         from urlparse import parse_qs
67 except ImportError:
68         from cgi import parse_qs
69
70 try:
71         import lxml.etree
72 except ImportError:
73         pass # Handled below
74
75 try:
76         import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
79
# Default HTTP headers added to every outgoing request by
# YoutubeDLHandler.http_request (below), imitating a desktop Firefox browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}
87
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	# Minimal pure-Python JSON decoder, used only when the stdlib json
	# module is missing (Python < 2.6).  Only loads() is provided; it is
	# exposed as a class so call sites can keep writing json.loads(...).
	class json(object):
		@staticmethod
		def loads(s):
			"""Decode the UTF-8 encoded JSON document *s* and return the
			corresponding Python object.  Raises ValueError on malformed
			input, mirroring the stdlib json module."""
			s = s.decode('UTF-8')
			# All helpers below share the decoded text *s* via closure and
			# pass around an index *i*; each parse function returns a tuple
			# (index just past the parsed value, parsed value).
			def raiseError(msg, i):
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past JSON whitespace; optionally fail if the
				# document ends where more input is required.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# re.sub callback translating one backslash escape
				# (including \uXXXX and surrogate pairs) to its character.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					# Surrogate pair: \uD800-\uDBFF followed by \uDC00-\uDFFF,
					# combined into a single astral code point.
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				# Find the closing quote: a '"' preceded by an even number
				# of backslashes (an odd count means it is escaped).
				while True:
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# The three keyword literals: true, false, null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# A fraction or exponent makes it a float; otherwise an int.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first character of a value; anything not in
			# the map is assumed to start a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
200
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks: if the
	reported encoding cannot actually encode text, fall back to UTF-8.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Sanity-check that the reported name is a usable codec.
		u'TEST'.encode(pref)
	except Exception:
		# Narrowed from a bare "except:" so that KeyboardInterrupt and
		# SystemExit are no longer swallowed.  The needless generator
		# wrapper around this logic was also removed; behavior for
		# callers is unchanged (same string returned).
		pref = 'UTF-8'
	return pref
216
217
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.  Group 1 of the match is the entity body
	without the surrounding '&' and ';'.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Unicode character
	# NOTE(review): '\d' matches decimal digits only, so hexadecimal
	# references containing letters (e.g. '&#xA9;') do not match and fall
	# through to the literal branch below; only all-digit hex references
	# like '&#x26;' are decoded.  Confirm whether that is intended.
	mobj = re.match(ur'(?u)#(x?\d+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			# Prefix with '0' so long() accepts the '0x...' form.
			numstr = u'0%s' % numstr
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
243
244
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Replace HTML entities (e.g. '&amp;') with the characters they stand for.
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	# The path separator cannot appear inside a file name component.
	return utitle.replace(unicode(os.sep), u'%')
249
250
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		# The special name '-' means "write to standard output".
		if filename == u'-':
			if sys.platform == 'win32':
				# Switch stdout to binary mode so video data is not
				# corrupted by newline translation on Windows.
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(_encodeFilename(filename), open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(_encodeFilename(filename), open_mode)
		return (stream, filename)
276
277
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns None when *timestr* cannot be parsed.
	"""
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
285
def _simplify_title(title):
	"""Reduce a title to a conservative, filesystem-safe form.

	Every run of characters other than Unicode word characters, digits,
	'_' and '-' is collapsed to a single '_', and leading/trailing
	underscores are stripped.
	"""
	expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
	return expr.sub(u'_', title).strip(u'_')
289
290 def _orderedSet(iterable):
291         """ Remove all duplicates from the input iterable """
292         res = []
293         for el in iterable:
294                 if el not in res:
295                         res.append(el)
296         return res
297
def _unescapeHTML(s):
	"""Replace HTML entities in *s* with the characters they represent.

	@param s a string (of type unicode)
	"""
	assert type(s) == type(u'')
	return HTMLParser.HTMLParser().unescape(s)
306
307 def _encodeFilename(s):
308         """
309         @param s The name of the file (of type unicode)
310         """
311
312         assert type(s) == type(u'')
313
314         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
315                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
316                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
317                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
318                 return s
319         else:
320                 return s.encode(sys.getfilesystemencoding(), 'ignore')
321
class DownloadError(Exception):
	"""Download Error exception.

	Thrown by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
330
331
class SameFileError(Exception):
	"""Same File exception.

	Thrown by FileDownloader objects when they detect that several
	requested downloads would end up in the same file on disk.
	"""
339
340
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to signal a failure in the
	postprocessing task.
	"""
348
class MaxDownloadsReached(Exception):
	"""Raised once the --max-downloads limit has been reached."""
352
353
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Thrown when a video is requested in a format that is not available
	for that video.
	"""
361
362
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file turns out to
	be smaller than what the server announced, indicating the connection
	was probably interrupted.
	"""
	# Both counts are in bytes.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded, self.expected = downloaded, expected
377
378
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		"""Decompress a deflate body, accepting both the raw-deflate and
		the zlib-wrapped variant that some servers send."""
		try:
			# Raw deflate stream (no zlib header).
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			# Fall back to a standard zlib-wrapped stream.
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		"""Build an addinfourl that also carries the HTTP status code,
		working around old Python versions whose addinfourl constructor
		does not take a code argument."""
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Install the standard headers, replacing any same-named header
		# the request already carries.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# The marker header requests an uncompressed transfer; strip the
		# Accept-encoding header and remove the marker before sending.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		# Transparently decompress the body, preserving the original
		# headers, URL, status code and msg on the replacement response.
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
436
437
438 class FileDownloader(object):
439         """File Downloader class.
440
441         File downloader objects are the ones responsible of downloading the
442         actual video file and writing it to disk if the user has requested
443         it, among some other tasks. In most cases there should be one per
444         program. As, given a video URL, the downloader doesn't know how to
445         extract all the needed information, task that InfoExtractors do, it
446         has to pass the URL to one of them.
447
448         For this, file downloader objects have a method that allows
449         InfoExtractors to be registered in a given order. When it is passed
450         a URL, the file downloader handles it to the first InfoExtractor it
451         finds that reports being able to handle it. The InfoExtractor extracts
452         all the information about the video or videos the URL refers to, and
453         asks the FileDownloader to process the video information, possibly
454         downloading the video.
455
456         File downloaders accept a lot of parameters. In order not to saturate
457         the object constructor with arguments, it receives a dictionary of
458         options instead. These options are available through the params
459         attribute for the InfoExtractors to use. The FileDownloader also
460         registers itself as the downloader in charge for the InfoExtractors
461         that are added to it, so this is a "mutual registration".
462
463         Available options:
464
465         username:         Username for authentication purposes.
466         password:         Password for authentication purposes.
467         usenetrc:         Use netrc for authentication instead.
468         quiet:            Do not print messages to stdout.
469         forceurl:         Force printing final URL.
470         forcetitle:       Force printing title.
471         forcethumbnail:   Force printing thumbnail URL.
472         forcedescription: Force printing description.
473         forcefilename:    Force printing final filename.
474         simulate:         Do not download the video files.
475         format:           Video format code.
476         format_limit:     Highest quality format to try.
477         outtmpl:          Template for output names.
478         ignoreerrors:     Do not stop on download errors.
479         ratelimit:        Download speed limit, in bytes/sec.
480         nooverwrites:     Prevent overwriting files.
481         retries:          Number of times to retry for HTTP error 5xx
482         continuedl:       Try to continue downloads if possible.
483         noprogress:       Do not print the progress bar.
484         playliststart:    Playlist item to start at.
485         playlistend:      Playlist item to end at.
486         matchtitle:       Download only matching titles.
487         rejecttitle:      Reject downloads for matching titles.
488         logtostderr:      Log messages to stderr instead of stdout.
489         consoletitle:     Display progress in console window's titlebar.
490         nopart:           Do not use temporary .part files.
491         updatetime:       Use the Last-modified header to set output file timestamps.
492         writedescription: Write the video description to a .description file
493         writeinfojson:    Write the video description to a .info.json file
494         writesubtitles:   Write the video subtitles to a .srt file
495         subtitleslang:    Language of the subtitles to download
496         """
497
498         params = None
499         _ies = []
500         _pps = []
501         _download_retcode = None
502         _num_downloads = None
503         _screen_file = None
504
505         def __init__(self, params):
506                 """Create a FileDownloader object with the given options."""
507                 self._ies = []
508                 self._pps = []
509                 self._download_retcode = 0
510                 self._num_downloads = 0
511                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
512                 self.params = params
513
514         @staticmethod
515         def format_bytes(bytes):
516                 if bytes is None:
517                         return 'N/A'
518                 if type(bytes) is str:
519                         bytes = float(bytes)
520                 if bytes == 0.0:
521                         exponent = 0
522                 else:
523                         exponent = long(math.log(bytes, 1024.0))
524                 suffix = 'bkMGTPEZY'[exponent]
525                 converted = float(bytes) / float(1024 ** exponent)
526                 return '%.2f%s' % (converted, suffix)
527
528         @staticmethod
529         def calc_percent(byte_counter, data_len):
530                 if data_len is None:
531                         return '---.-%'
532                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
533
534         @staticmethod
535         def calc_eta(start, now, total, current):
536                 if total is None:
537                         return '--:--'
538                 dif = now - start
539                 if current == 0 or dif < 0.001: # One millisecond
540                         return '--:--'
541                 rate = float(current) / dif
542                 eta = long((float(total) - float(current)) / rate)
543                 (eta_mins, eta_secs) = divmod(eta, 60)
544                 if eta_mins > 99:
545                         return '--:--'
546                 return '%02d:%02d' % (eta_mins, eta_secs)
547
548         @staticmethod
549         def calc_speed(start, now, bytes):
550                 dif = now - start
551                 if bytes == 0 or dif < 0.001: # One millisecond
552                         return '%10s' % '---b/s'
553                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
554
555         @staticmethod
556         def best_block_size(elapsed_time, bytes):
557                 new_min = max(bytes / 2.0, 1.0)
558                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
559                 if elapsed_time < 0.001:
560                         return long(new_max)
561                 rate = bytes / elapsed_time
562                 if rate > new_max:
563                         return long(new_max)
564                 if rate < new_min:
565                         return long(new_min)
566                 return long(rate)
567
568         @staticmethod
569         def parse_bytes(bytestr):
570                 """Parse a string indicating a byte quantity into a long integer."""
571                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
572                 if matchobj is None:
573                         return None
574                 number = float(matchobj.group(1))
575                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
576                 return long(round(number * multiplier))
577
578         def add_info_extractor(self, ie):
579                 """Add an InfoExtractor object to the end of the list."""
580                 self._ies.append(ie)
581                 ie.set_downloader(self)
582
583         def add_post_processor(self, pp):
584                 """Add a PostProcessor object to the end of the chain."""
585                 self._pps.append(pp)
586                 pp.set_downloader(self)
587
588         def to_screen(self, message, skip_eol=False):
589                 """Print message to stdout if not in quiet mode."""
590                 assert type(message) == type(u'')
591                 if not self.params.get('quiet', False):
592                         terminator = [u'\n', u''][skip_eol]
593                         output = message + terminator
594
595                         if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
596                                 output = output.encode(preferredencoding(), 'ignore')
597                         self._screen_file.write(output)
598                         self._screen_file.flush()
599
	def to_stderr(self, message):
		"""Print message to stderr.

		The unicode message is encoded with the system's preferred
		encoding before being written.
		"""
		print >>sys.stderr, message.encode(preferredencoding())
603
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		# Only act when the user asked for it via --console-title.
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-style title escape sequence: ESC ] 0 ; <title> BEL
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
614
	def fixed_template(self):
		"""Checks if the output template is fixed.

		Returns True when the 'outtmpl' option contains no '%(...)s'
		placeholders, i.e. every download would get the same file name.
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
618
619         def trouble(self, message=None):
620                 """Determine action to take when a download problem appears.
621
622                 Depending on if the downloader has been configured to ignore
623                 download errors or not, this method may throw an exception or
624                 not when errors are found, after printing the message.
625                 """
626                 if message is not None:
627                         self.to_stderr(message)
628                 if not self.params.get('ignoreerrors', False):
629                         raise DownloadError(message)
630                 self._download_retcode = 1
631
632         def slow_down(self, start_time, byte_counter):
633                 """Sleep if the download speed is over the rate limit."""
634                 rate_limit = self.params.get('ratelimit', None)
635                 if rate_limit is None or byte_counter == 0:
636                         return
637                 now = time.time()
638                 elapsed = now - start_time
639                 if elapsed <= 0.0:
640                         return
641                 speed = float(byte_counter) / elapsed
642                 if speed > rate_limit:
643                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
644
645         def temp_name(self, filename):
646                 """Returns a temporary filename for the given filename."""
647                 if self.params.get('nopart', False) or filename == u'-' or \
648                                 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
649                         return filename
650                 return filename + u'.part'
651
652         def undo_temp_name(self, filename):
653                 if filename.endswith(u'.part'):
654                         return filename[:-len(u'.part')]
655                 return filename
656
	def try_rename(self, old_filename, new_filename):
		"""Rename old_filename to new_filename, routing failures through
		self.trouble() (which may in turn raise DownloadError unless
		errors are being ignored)."""
		try:
			if old_filename == new_filename:
				return
			os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
664
665         def try_utime(self, filename, last_modified_hdr):
666                 """Try to set the last-modified time of the given file."""
667                 if last_modified_hdr is None:
668                         return
669                 if not os.path.isfile(_encodeFilename(filename)):
670                         return
671                 timestr = last_modified_hdr
672                 if timestr is None:
673                         return
674                 filetime = timeconvert(timestr)
675                 if filetime is None:
676                         return filetime
677                 try:
678                         os.utime(filename, (time.time(), filetime))
679                 except:
680                         pass
681                 return filetime
682
683         def report_writedescription(self, descfn):
684                 """ Report that the description file is being written """
685                 self.to_screen(u'[info] Writing video description to: ' + descfn)
686
687         def report_writesubtitles(self, srtfn):
688                 """ Report that the subtitles file is being written """
689                 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
690
691         def report_writeinfojson(self, infofn):
692                 """ Report that the metadata file has been written """
693                 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
694
695         def report_destination(self, filename):
696                 """Report destination filename."""
697                 self.to_screen(u'[download] Destination: ' + filename)
698
699         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
700                 """Report download progress."""
701                 if self.params.get('noprogress', False):
702                         return
703                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
704                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
705                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
706                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
707
708         def report_resuming_byte(self, resume_len):
709                 """Report attempt to resume at given byte."""
710                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
711
712         def report_retry(self, count, retries):
713                 """Report retry in case of HTTP error 5xx"""
714                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
715
716         def report_file_already_downloaded(self, file_name):
717                 """Report file has already been fully downloaded."""
718                 try:
719                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
720                 except (UnicodeEncodeError), err:
721                         self.to_screen(u'[download] The file has already been downloaded')
722
723         def report_unable_to_resume(self):
724                 """Report it was impossible to resume download."""
725                 self.to_screen(u'[download] Unable to resume')
726
727         def report_finish(self):
728                 """Report download finished."""
729                 if self.params.get('noprogress', False):
730                         self.to_screen(u'[download] Download completed')
731                 else:
732                         self.to_screen(u'')
733
734         def increment_downloads(self):
735                 """Increment the ordinal that assigns a number to each file."""
736                 self._num_downloads += 1
737
738         def prepare_filename(self, info_dict):
739                 """Generate the output filename."""
740                 try:
741                         template_dict = dict(info_dict)
742                         template_dict['epoch'] = unicode(long(time.time()))
743                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
744                         filename = self.params['outtmpl'] % template_dict
745                         return filename
746                 except (ValueError, KeyError), err:
747                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
748                         return None
749
750         def _match_entry(self, info_dict):
751                 """ Returns None iff the file should be downloaded """
752
753                 title = info_dict['title']
754                 matchtitle = self.params.get('matchtitle', False)
755                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
756                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
757                 rejecttitle = self.params.get('rejecttitle', False)
758                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
759                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
760                 return None
761
762         def process_info(self, info_dict):
763                 """Process a single dictionary returned by an InfoExtractor."""
764
765                 reason = self._match_entry(info_dict)
766                 if reason is not None:
767                         self.to_screen(u'[download] ' + reason)
768                         return
769
770                 max_downloads = self.params.get('max_downloads')
771                 if max_downloads is not None:
772                         if self._num_downloads > int(max_downloads):
773                                 raise MaxDownloadsReached()
774
775                 filename = self.prepare_filename(info_dict)
776                 
777                 # Forced printings
778                 if self.params.get('forcetitle', False):
779                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
780                 if self.params.get('forceurl', False):
781                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
782                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
783                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
784                 if self.params.get('forcedescription', False) and 'description' in info_dict:
785                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
786                 if self.params.get('forcefilename', False) and filename is not None:
787                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
788                 if self.params.get('forceformat', False):
789                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
790
791                 # Do nothing else if in simulate mode
792                 if self.params.get('simulate', False):
793                         return
794
795                 if filename is None:
796                         return
797
798                 try:
799                         dn = os.path.dirname(_encodeFilename(filename))
800                         if dn != '' and not os.path.exists(dn): # dn is already encoded
801                                 os.makedirs(dn)
802                 except (OSError, IOError), err:
803                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
804                         return
805
806                 if self.params.get('writedescription', False):
807                         try:
808                                 descfn = filename + u'.description'
809                                 self.report_writedescription(descfn)
810                                 descfile = open(_encodeFilename(descfn), 'wb')
811                                 try:
812                                         descfile.write(info_dict['description'].encode('utf-8'))
813                                 finally:
814                                         descfile.close()
815                         except (OSError, IOError):
816                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
817                                 return
818                                 
819                 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
820                         # subtitles download errors are already managed as troubles in relevant IE
821                         # that way it will silently go on when used with unsupporting IE 
822                         try:
823                                 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
824                                 self.report_writesubtitles(srtfn)
825                                 srtfile = open(_encodeFilename(srtfn), 'wb')
826                                 try:
827                                         srtfile.write(info_dict['subtitles'].encode('utf-8'))
828                                 finally:
829                                         srtfile.close()
830                         except (OSError, IOError):
831                                 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
832                                 return
833
834                 if self.params.get('writeinfojson', False):
835                         infofn = filename + u'.info.json'
836                         self.report_writeinfojson(infofn)
837                         try:
838                                 json.dump
839                         except (NameError,AttributeError):
840                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
841                                 return
842                         try:
843                                 infof = open(_encodeFilename(infofn), 'wb')
844                                 try:
845                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
846                                         json.dump(json_info_dict, infof)
847                                 finally:
848                                         infof.close()
849                         except (OSError, IOError):
850                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
851                                 return
852
853                 if not self.params.get('skip_download', False):
854                         if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
855                                 success = True
856                         else:
857                                 try:
858                                         success = self._do_download(filename, info_dict)
859                                 except (OSError, IOError), err:
860                                         raise UnavailableVideoError
861                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
862                                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
863                                         return
864                                 except (ContentTooShortError, ), err:
865                                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
866                                         return
867         
868                         if success:
869                                 try:
870                                         self.post_process(filename, info_dict)
871                                 except (PostProcessingError), err:
872                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
873                                         return
874
875         def download(self, url_list):
876                 """Download a given list of URLs."""
877                 if len(url_list) > 1 and self.fixed_template():
878                         raise SameFileError(self.params['outtmpl'])
879
880                 for url in url_list:
881                         suitable_found = False
882                         for ie in self._ies:
883                                 # Go to next InfoExtractor if not suitable
884                                 if not ie.suitable(url):
885                                         continue
886
887                                 # Suitable InfoExtractor found
888                                 suitable_found = True
889
890                                 # Extract information from URL and process it
891                                 ie.extract(url)
892
893                                 # Suitable InfoExtractor had been found; go to next URL
894                                 break
895
896                         if not suitable_found:
897                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
898
899                 return self._download_retcode
900
901         def post_process(self, filename, ie_info):
902                 """Run the postprocessing chain on the given file."""
903                 info = dict(ie_info)
904                 info['filepath'] = filename
905                 for pp in self._pps:
906                         info = pp.run(info)
907                         if info is None:
908                                 break
909
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by shelling out to the rtmpdump binary.

		Writes to a temporary name first and renames on success.
		Returns True on success, False on failure.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first (any failure to spawn it means it is
		# not installed / not on PATH)
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], [...]][flag] idiom selects the extra arguments only
		# when the flag is true.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
		if self.params.get('verbose', False):
			try:
				# pipes.quote may be unavailable; fall back to repr()
				import pipes
				shell_quote = lambda args: ' '.join(map(pipes.quote, args))
			except ImportError:
				shell_quote = repr
			self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
		retval = subprocess.call(args)
		# Keep re-invoking rtmpdump with -e (resume) while it reports a
		# resumable interruption (2) or incomplete transfer (1).
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(_encodeFilename(tmpfilename))
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(_encodeFilename(tmpfilename))
			# No progress and exit code 1: give up (handled as error below)
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
954
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] to *filename*.

		Dispatches rtmp:// URLs to rtmpdump; otherwise performs an HTTP
		download with resume support (Range header), retry on HTTP 5xx,
		adaptive block size, rate limiting and progress reporting.
		Returns True on success (including "already fully downloaded").
		"""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept without the Range header so it can be used
		# to re-probe the full content length on a 416 response below.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(_encodeFilename(tmpfilename)):
			resume_len = os.path.getsize(_encodeFilename(tmpfilename))
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				# NOTE(review): 'data' is immediately overwritten by the
				# urlopen() call below, so a pre-opened 'urlhandle' is never
				# actually reused — looks unintentional; confirm.
				if count == 0 and 'urlhandle' in info_dict:
					data = info_dict['urlhandle']
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry (only HTTP 5xx falls through to here)
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# Content-length covers only the remaining range; add the part
			# already on disk to get the full expected size.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed throughput
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1100
1101
class InfoExtractor(object):
	"""Information Extractor base class.

	Given a URL, an information extractor produces a dictionary of
	information about the video (or videos) it refers to, which the
	FileDownloader then processes (possibly downloading the video).
	Each dictionary must provide:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only by the respective forced-printing
	functions (e.g. when youtube-dl serves as a backend for a video
	search function such as the one in youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should re-define _real_initialize() and _real_extract()
	and define a _VALID_URL regexp; they should usually also be added
	to the list of extractors.
	"""

	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		match = re.match(self._VALID_URL, url)
		return match is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc); runs once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1170
1171
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches youtu.be short links, watch pages, /v/, /embed/ and /e/
	# URLs (also youtube-nocookie.com); the second capture group holds
	# the video id. Playlist-style URLs are explicitly excluded.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Forces the site interface to English so the scraping regexps match
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Extracts the next_url query parameter from verify_age-style URLs
	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# Maps format code ("itag") to container extension; unknown codes
	# default to 'flv' elsewhere in this class
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# Maps format code to frame size, given as HEIGHTxWIDTH
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1211
1212         def report_lang(self):
1213                 """Report attempt to set language."""
1214                 self._downloader.to_screen(u'[youtube] Setting language')
1215
1216         def report_login(self):
1217                 """Report attempt to log in."""
1218                 self._downloader.to_screen(u'[youtube] Logging in')
1219
1220         def report_age_confirmation(self):
1221                 """Report attempt to confirm age."""
1222                 self._downloader.to_screen(u'[youtube] Confirming age')
1223
1224         def report_video_webpage_download(self, video_id):
1225                 """Report attempt to download video webpage."""
1226                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1227
1228         def report_video_info_webpage_download(self, video_id):
1229                 """Report attempt to download video info webpage."""
1230                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1231
1232         def report_video_subtitles_download(self, video_id):
1233                 """Report attempt to download video info webpage."""
1234                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1235
1236         def report_information_extraction(self, video_id):
1237                 """Report attempt to extract video information."""
1238                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1239
1240         def report_unavailable_format(self, video_id, format):
1241                 """Report extracted video URL."""
1242                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1243
1244         def report_rtmp_download(self):
1245                 """Indicate the download will use the RTMP protocol."""
1246                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1247
1248         def _closed_captions_xml_to_srt(self, xml_string):
1249                 srt = ''
1250                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1251                 # TODO parse xml instead of regex
1252                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1253                         if not dur: dur = '4'
1254                         start = float(start)
1255                         end = start + float(dur)
1256                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1257                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1258                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1259                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1260                         srt += str(n) + '\n'
1261                         srt += start + ' --> ' + end + '\n'
1262                         srt += caption + '\n\n'
1263                 return srt
1264
1265         def _print_formats(self, formats):
1266                 print 'Available formats:'
1267                 for x in formats:
1268                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1269
	def _real_initialize(self):
		"""Prepare the YouTube session before any extraction.

		Sets the interface language, then (if credentials were supplied on
		the command line or in .netrc) logs in and confirms the age
		restriction so that age-gated videos can be retrieved later.
		All failures are reported as warnings/errors and abort quietly.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# A missing or malformed .netrc is not fatal; warn and skip login.
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1338
	def _real_extract(self, url):
		"""Extract video URL(s) and metadata for a YouTube video.

		Downloads the watch page and the get_video_info data, selects the
		format(s) requested by the user, and hands one info dictionary per
		selected format to the downloader via process_info().
		"""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
		if mobj:
			url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Unescape the backslash-escaped URL found in the page's JavaScript.
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' parameter values in turn, since
		# some of them may be rejected for a particular video.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = _simplify_title(video_title)

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scraped from the page and normalized to YYYYMMDD;
		# stays u'NA' if no expression matches.
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					pass

		# description
		try:
			lxml.etree
		except NameError:
			# lxml is not installed: fall back to a crude regex on the meta tag.
			video_description = u'No description available.'
			mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# closed captions
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			self.report_video_subtitles_download(video_id)
			request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
			try:
				srt_list = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
			else:
				srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
				if srt_lang_list:
					# Pick the user-requested language, falling back to
					# English and then to the first listed language.
					if self._downloader.params.get('subtitleslang', False):
						srt_lang = self._downloader.params.get('subtitleslang')
					elif 'en' in srt_lang_list:
						srt_lang = 'en'
					else:
						srt_lang = srt_lang_list[0]
					if not srt_lang in srt_lang_list:
						self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
					else:
						request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
						try:
							srt_xml = urllib2.urlopen(request).read()
						except (urllib2.URLError, httplib.HTTPException, socket.error), err:
							self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
						else:
							video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
				else:
					self._downloader.trouble(u'WARNING: video has no closed captions')

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			# Map itag (format code) -> direct video URL.
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
			else:
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimited sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description,
					'player_url':	player_url,
					'subtitles':	video_subtitles
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1548
1549
1550 class MetacafeIE(InfoExtractor):
1551         """Information Extractor for metacafe.com."""
1552
1553         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1554         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1555         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1556         _youtube_ie = None
1557         IE_NAME = u'metacafe'
1558
1559         def __init__(self, youtube_ie, downloader=None):
1560                 InfoExtractor.__init__(self, downloader)
1561                 self._youtube_ie = youtube_ie
1562
1563         def report_disclaimer(self):
1564                 """Report disclaimer retrieval."""
1565                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1566
1567         def report_age_confirmation(self):
1568                 """Report attempt to confirm age."""
1569                 self._downloader.to_screen(u'[metacafe] Confirming age')
1570
1571         def report_download_webpage(self, video_id):
1572                 """Report webpage download."""
1573                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1574
1575         def report_extraction(self, video_id):
1576                 """Report information extraction."""
1577                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1578
1579         def _real_initialize(self):
1580                 # Retrieve disclaimer
1581                 request = urllib2.Request(self._DISCLAIMER)
1582                 try:
1583                         self.report_disclaimer()
1584                         disclaimer = urllib2.urlopen(request).read()
1585                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1586                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1587                         return
1588
1589                 # Confirm age
1590                 disclaimer_form = {
1591                         'filters': '0',
1592                         'submit': "Continue - I'm over 18",
1593                         }
1594                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1595                 try:
1596                         self.report_age_confirmation()
1597                         disclaimer = urllib2.urlopen(request).read()
1598                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1599                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1600                         return
1601
1602         def _real_extract(self, url):
1603                 # Extract id and simplified title from URL
1604                 mobj = re.match(self._VALID_URL, url)
1605                 if mobj is None:
1606                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1607                         return
1608
1609                 video_id = mobj.group(1)
1610
1611                 # Check if video comes from YouTube
1612                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1613                 if mobj2 is not None:
1614                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1615                         return
1616
1617                 # At this point we have a new video
1618                 self._downloader.increment_downloads()
1619
1620                 simple_title = mobj.group(2).decode('utf-8')
1621
1622                 # Retrieve video webpage to extract further information
1623                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1624                 try:
1625                         self.report_download_webpage(video_id)
1626                         webpage = urllib2.urlopen(request).read()
1627                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1628                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1629                         return
1630
1631                 # Extract URL, uploader and title from webpage
1632                 self.report_extraction(video_id)
1633                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1634                 if mobj is not None:
1635                         mediaURL = urllib.unquote(mobj.group(1))
1636                         video_extension = mediaURL[-3:]
1637
1638                         # Extract gdaKey if available
1639                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1640                         if mobj is None:
1641                                 video_url = mediaURL
1642                         else:
1643                                 gdaKey = mobj.group(1)
1644                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1645                 else:
1646                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1647                         if mobj is None:
1648                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1649                                 return
1650                         vardict = parse_qs(mobj.group(1))
1651                         if 'mediaData' not in vardict:
1652                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1653                                 return
1654                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1655                         if mobj is None:
1656                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1657                                 return
1658                         mediaURL = mobj.group(1).replace('\\/', '/')
1659                         video_extension = mediaURL[-3:]
1660                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1661
1662                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1663                 if mobj is None:
1664                         self._downloader.trouble(u'ERROR: unable to extract title')
1665                         return
1666                 video_title = mobj.group(1).decode('utf-8')
1667                 video_title = sanitize_title(video_title)
1668
1669                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1670                 if mobj is None:
1671                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1672                         return
1673                 video_uploader = mobj.group(1)
1674
1675                 try:
1676                         # Process video information
1677                         self._downloader.process_info({
1678                                 'id':           video_id.decode('utf-8'),
1679                                 'url':          video_url.decode('utf-8'),
1680                                 'uploader':     video_uploader.decode('utf-8'),
1681                                 'upload_date':  u'NA',
1682                                 'title':        video_title,
1683                                 'stitle':       simple_title,
1684                                 'ext':          video_extension.decode('utf-8'),
1685                                 'format':       u'NA',
1686                                 'player_url':   None,
1687                         })
1688                 except UnavailableVideoError:
1689                         self._downloader.trouble(u'\nERROR: unable to download video')
1690
1691
1692 class DailymotionIE(InfoExtractor):
1693         """Information Extractor for Dailymotion"""
1694
1695         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1696         IE_NAME = u'dailymotion'
1697
1698         def __init__(self, downloader=None):
1699                 InfoExtractor.__init__(self, downloader)
1700
1701         def report_download_webpage(self, video_id):
1702                 """Report webpage download."""
1703                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1704
1705         def report_extraction(self, video_id):
1706                 """Report information extraction."""
1707                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1708
1709         def _real_extract(self, url):
1710                 # Extract id and simplified title from URL
1711                 mobj = re.match(self._VALID_URL, url)
1712                 if mobj is None:
1713                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1714                         return
1715
1716                 # At this point we have a new video
1717                 self._downloader.increment_downloads()
1718                 video_id = mobj.group(1)
1719
1720                 video_extension = 'flv'
1721
1722                 # Retrieve video webpage to extract further information
1723                 request = urllib2.Request(url)
1724                 request.add_header('Cookie', 'family_filter=off')
1725                 try:
1726                         self.report_download_webpage(video_id)
1727                         webpage = urllib2.urlopen(request).read()
1728                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1729                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1730                         return
1731
1732                 # Extract URL, uploader and title from webpage
1733                 self.report_extraction(video_id)
1734                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1735                 if mobj is None:
1736                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1737                         return
1738                 sequence = urllib.unquote(mobj.group(1))
1739                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1740                 if mobj is None:
1741                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1742                         return
1743                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1744
1745                 # if needed add http://www.dailymotion.com/ if relative URL
1746
1747                 video_url = mediaURL
1748
1749                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1750                 if mobj is None:
1751                         self._downloader.trouble(u'ERROR: unable to extract title')
1752                         return
1753                 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1754                 video_title = sanitize_title(video_title)
1755                 simple_title = _simplify_title(video_title)
1756
1757                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1758                 if mobj is None:
1759                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1760                         return
1761                 video_uploader = mobj.group(1)
1762
1763                 try:
1764                         # Process video information
1765                         self._downloader.process_info({
1766                                 'id':           video_id.decode('utf-8'),
1767                                 'url':          video_url.decode('utf-8'),
1768                                 'uploader':     video_uploader.decode('utf-8'),
1769                                 'upload_date':  u'NA',
1770                                 'title':        video_title,
1771                                 'stitle':       simple_title,
1772                                 'ext':          video_extension.decode('utf-8'),
1773                                 'format':       u'NA',
1774                                 'player_url':   None,
1775                         })
1776                 except UnavailableVideoError:
1777                         self._downloader.trouble(u'\nERROR: unable to download video')
1778
1779
class GoogleIE(InfoExtractor):
        """Information extractor for video.google.com."""

        _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
        IE_NAME = u'video.google'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

        def _real_extract(self, url):
                """Download the Google Video page for *url*, scrape the media URL,
                title and (optionally) thumbnail, and hand the result to the
                downloader.  All failures are reported via
                self._downloader.trouble() followed by an early return.
                """
                # Extract id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                # At this point we have a new video
                self._downloader.increment_downloads()
                video_id = mobj.group(1)

                video_extension = 'mp4'

                # Retrieve video webpage to extract further information
                # (hl/oe parameters force an English, UTF-8 page so the regexes
                # below match regardless of the user's locale).
                request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                        return

                # Extract URL, uploader, and title from webpage
                self.report_extraction(video_id)
                mobj = re.search(r"download_url:'([^']+)'", webpage)
                if mobj is None:
                        # No direct MP4 download link on the page; fall back to
                        # the JavaScript-escaped Flash URL.
                        video_extension = 'flv'
                        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
                        return
                mediaURL = urllib.unquote(mobj.group(1))
                # Undo the page's hex escaping of '=' and '&' in the URL.
                mediaURL = mediaURL.replace('\\x3d', '\x3d')
                mediaURL = mediaURL.replace('\\x26', '\x26')

                video_url = mediaURL

                mobj = re.search(r'<title>(.*)</title>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract title')
                        return
                video_title = mobj.group(1).decode('utf-8')
                video_title = sanitize_title(video_title)
                simple_title = _simplify_title(video_title)

                # Extract video description
                mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract video description')
                        return
                video_description = mobj.group(1).decode('utf-8')
                if not video_description:
                        video_description = 'No description available.'

                # Extract video thumbnail.  Only done on request since it needs a
                # second page fetch (a site-restricted search for this docid;
                # abs() presumably normalises negative docids — TODO confirm).
                if self._downloader.params.get('forcethumbnail', False):
                        request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                        try:
                                webpage = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                                return
                        mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                        if mobj is None:
                                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                                return
                        video_thumbnail = mobj.group(1)
                else:   # we need something to pass to process_info
                        video_thumbnail = ''

                try:
                        # Process video information
                        self._downloader.process_info({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_url.decode('utf-8'),
                                'uploader':     u'NA',
                                'upload_date':  u'NA',
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension.decode('utf-8'),
                                'format':       u'NA',
                                'player_url':   None,
                        })
                except UnavailableVideoError:
                        self._downloader.trouble(u'\nERROR: unable to download video')
1882
1883
1884 class PhotobucketIE(InfoExtractor):
1885         """Information extractor for photobucket.com."""
1886
1887         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1888         IE_NAME = u'photobucket'
1889
1890         def __init__(self, downloader=None):
1891                 InfoExtractor.__init__(self, downloader)
1892
1893         def report_download_webpage(self, video_id):
1894                 """Report webpage download."""
1895                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1896
1897         def report_extraction(self, video_id):
1898                 """Report information extraction."""
1899                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1900
1901         def _real_extract(self, url):
1902                 # Extract id from URL
1903                 mobj = re.match(self._VALID_URL, url)
1904                 if mobj is None:
1905                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1906                         return
1907
1908                 # At this point we have a new video
1909                 self._downloader.increment_downloads()
1910                 video_id = mobj.group(1)
1911
1912                 video_extension = 'flv'
1913
1914                 # Retrieve video webpage to extract further information
1915                 request = urllib2.Request(url)
1916                 try:
1917                         self.report_download_webpage(video_id)
1918                         webpage = urllib2.urlopen(request).read()
1919                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1920                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1921                         return
1922
1923                 # Extract URL, uploader, and title from webpage
1924                 self.report_extraction(video_id)
1925                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1926                 if mobj is None:
1927                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1928                         return
1929                 mediaURL = urllib.unquote(mobj.group(1))
1930
1931                 video_url = mediaURL
1932
1933                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1934                 if mobj is None:
1935                         self._downloader.trouble(u'ERROR: unable to extract title')
1936                         return
1937                 video_title = mobj.group(1).decode('utf-8')
1938                 video_title = sanitize_title(video_title)
1939                 simple_title = _simplify_title(vide_title)
1940
1941                 video_uploader = mobj.group(2).decode('utf-8')
1942
1943                 try:
1944                         # Process video information
1945                         self._downloader.process_info({
1946                                 'id':           video_id.decode('utf-8'),
1947                                 'url':          video_url.decode('utf-8'),
1948                                 'uploader':     video_uploader,
1949                                 'upload_date':  u'NA',
1950                                 'title':        video_title,
1951                                 'stitle':       simple_title,
1952                                 'ext':          video_extension.decode('utf-8'),
1953                                 'format':       u'NA',
1954                                 'player_url':   None,
1955                         })
1956                 except UnavailableVideoError:
1957                         self._downloader.trouble(u'\nERROR: unable to download video')
1958
1959
1960 class YahooIE(InfoExtractor):
1961         """Information extractor for video.yahoo.com."""
1962
1963         # _VALID_URL matches all Yahoo! Video URLs
1964         # _VPAGE_URL matches only the extractable '/watch/' URLs
1965         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1966         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1967         IE_NAME = u'video.yahoo'
1968
1969         def __init__(self, downloader=None):
1970                 InfoExtractor.__init__(self, downloader)
1971
1972         def report_download_webpage(self, video_id):
1973                 """Report webpage download."""
1974                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1975
1976         def report_extraction(self, video_id):
1977                 """Report information extraction."""
1978                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1979
1980         def _real_extract(self, url, new_video=True):
1981                 # Extract ID from URL
1982                 mobj = re.match(self._VALID_URL, url)
1983                 if mobj is None:
1984                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1985                         return
1986
1987                 # At this point we have a new video
1988                 self._downloader.increment_downloads()
1989                 video_id = mobj.group(2)
1990                 video_extension = 'flv'
1991
1992                 # Rewrite valid but non-extractable URLs as
1993                 # extractable English language /watch/ URLs
1994                 if re.match(self._VPAGE_URL, url) is None:
1995                         request = urllib2.Request(url)
1996                         try:
1997                                 webpage = urllib2.urlopen(request).read()
1998                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1999                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2000                                 return
2001
2002                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2003                         if mobj is None:
2004                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
2005                                 return
2006                         yahoo_id = mobj.group(1)
2007
2008                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2009                         if mobj is None:
2010                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2011                                 return
2012                         yahoo_vid = mobj.group(1)
2013
2014                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2015                         return self._real_extract(url, new_video=False)
2016
2017                 # Retrieve video webpage to extract further information
2018                 request = urllib2.Request(url)
2019                 try:
2020                         self.report_download_webpage(video_id)
2021                         webpage = urllib2.urlopen(request).read()
2022                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2023                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2024                         return
2025
2026                 # Extract uploader and title from webpage
2027                 self.report_extraction(video_id)
2028                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2029                 if mobj is None:
2030                         self._downloader.trouble(u'ERROR: unable to extract video title')
2031                         return
2032                 video_title = mobj.group(1).decode('utf-8')
2033                 simple_title = _simplify_title(video_title)
2034
2035                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2036                 if mobj is None:
2037                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2038                         return
2039                 video_uploader = mobj.group(1).decode('utf-8')
2040
2041                 # Extract video thumbnail
2042                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2043                 if mobj is None:
2044                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2045                         return
2046                 video_thumbnail = mobj.group(1).decode('utf-8')
2047
2048                 # Extract video description
2049                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2050                 if mobj is None:
2051                         self._downloader.trouble(u'ERROR: unable to extract video description')
2052                         return
2053                 video_description = mobj.group(1).decode('utf-8')
2054                 if not video_description:
2055                         video_description = 'No description available.'
2056
2057                 # Extract video height and width
2058                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2059                 if mobj is None:
2060                         self._downloader.trouble(u'ERROR: unable to extract video height')
2061                         return
2062                 yv_video_height = mobj.group(1)
2063
2064                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2065                 if mobj is None:
2066                         self._downloader.trouble(u'ERROR: unable to extract video width')
2067                         return
2068                 yv_video_width = mobj.group(1)
2069
2070                 # Retrieve video playlist to extract media URL
2071                 # I'm not completely sure what all these options are, but we
2072                 # seem to need most of them, otherwise the server sends a 401.
2073                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
2074                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
2075                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2076                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2077                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2078                 try:
2079                         self.report_download_webpage(video_id)
2080                         webpage = urllib2.urlopen(request).read()
2081                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2082                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2083                         return
2084
2085                 # Extract media URL from playlist XML
2086                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2087                 if mobj is None:
2088                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
2089                         return
2090                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2091                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2092
2093                 try:
2094                         # Process video information
2095                         self._downloader.process_info({
2096                                 'id':           video_id.decode('utf-8'),
2097                                 'url':          video_url,
2098                                 'uploader':     video_uploader,
2099                                 'upload_date':  u'NA',
2100                                 'title':        video_title,
2101                                 'stitle':       simple_title,
2102                                 'ext':          video_extension.decode('utf-8'),
2103                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2104                                 'description':  video_description,
2105                                 'thumbnail':    video_thumbnail,
2106                                 'player_url':   None,
2107                         })
2108                 except UnavailableVideoError:
2109                         self._downloader.trouble(u'\nERROR: unable to download video')
2110
2111
class VimeoIE(InfoExtractor):
        """Information extractor for vimeo.com."""

        # _VALID_URL matches Vimeo URLs
        _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
        IE_NAME = u'vimeo'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

        def _real_extract(self, url, new_video=True):
                """Extract a Vimeo clip's metadata from the embedded config JSON
                and hand it to the downloader.  Errors are reported via
                self._downloader.trouble() followed by an early return.
                NOTE(review): new_video is accepted but never consulted in this
                body.
                """
                # Extract ID from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                # At this point we have a new video
                self._downloader.increment_downloads()
                video_id = mobj.group(1)

                # Retrieve video webpage to extract further information
                request = urllib2.Request(url, None, std_headers)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                        return

                # Now we begin extracting as much information as we can from what we
                # retrieved. First we extract the information common to all extractors,
                # and latter we extract those that are Vimeo specific.
                self.report_extraction(video_id)

                # Extract the config JSON by slicing the raw page text between
                # the ' = {config:' and ',assets:' markers (fragile but avoids
                # a full HTML parse).
                config = webpage.split(' = {config:')[1].split(',assets:')[0]
                try:
                        config = json.loads(config)
                except:
                        # NOTE(review): bare except also hides KeyboardInterrupt etc.
                        self._downloader.trouble(u'ERROR: unable to extract info section')
                        return
                
                # Extract title
                video_title = config["video"]["title"]
                simple_title = _simplify_title(video_title)

                # Extract uploader
                video_uploader = config["video"]["owner"]["name"]

                # Extract video thumbnail
                video_thumbnail = config["video"]["thumbnail"]

                # Extract video description.  lxml was imported at module level
                # inside a try/except, so probe for its presence via NameError
                # and fall back to a regex on the meta tag when it is missing.
                try:
                        lxml.etree
                except NameError:
                        video_description = u'No description available.'
                        mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
                        if mobj is not None:
                                video_description = mobj.group(1)
                else:
                        html_parser = lxml.etree.HTMLParser()
                        vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
                        video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
                        # TODO use another parser

                # Extract upload date
                video_upload_date = u'NA'
                mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
                if mobj is not None:
                        video_upload_date = mobj.group(1)

                # Vimeo specific: extract request signature and timestamp
                sig = config['request']['signature']
                timestamp = config['request']['timestamp']

                # Vimeo specific: extract video codec and quality information
                # TODO bind to format param
                # Codecs are tried in preference order; the for/else reports an
                # error only when none of them is offered for this video.
                codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
                for codec in codecs:
                        if codec[0] in config["video"]["files"]:
                                video_codec = codec[0]
                                video_extension = codec[1]
                                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                                else: quality = 'sd'
                                break
                else:
                        self._downloader.trouble(u'ERROR: no known codec found')
                        return

                video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                                        %(video_id, sig, timestamp, quality, video_codec.upper())

                try:
                        # Process video information
                        self._downloader.process_info({
                                'id':           video_id,
                                'url':          video_url,
                                'uploader':     video_uploader,
                                'upload_date':  video_upload_date,
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension,
                                'thumbnail':    video_thumbnail,
                                'description':  video_description,
                                'player_url':   None,
                        })
                except UnavailableVideoError:
                        self._downloader.trouble(u'ERROR: unable to download video')
2230
2231
2232 class GenericIE(InfoExtractor):
2233         """Generic last-resort information extractor."""
2234
2235         _VALID_URL = r'.*'
2236         IE_NAME = u'generic'
2237
2238         def __init__(self, downloader=None):
2239                 InfoExtractor.__init__(self, downloader)
2240
2241         def report_download_webpage(self, video_id):
2242                 """Report webpage download."""
2243                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2244                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2245
2246         def report_extraction(self, video_id):
2247                 """Report information extraction."""
2248                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2249
2250         def report_following_redirect(self, new_url):
2251                 """Report information extraction."""
2252                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
2253                 
2254         def _test_redirect(self, url):
2255                 """Check if it is a redirect, like url shorteners, in case restart chain."""
2256                 class HeadRequest(urllib2.Request):
2257                         def get_method(self):
2258                                 return "HEAD"
2259
2260                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
2261                         """
2262                         Subclass the HTTPRedirectHandler to make it use our 
2263                         HeadRequest also on the redirected URL
2264                         """
2265                         def redirect_request(self, req, fp, code, msg, headers, newurl): 
2266                                 if code in (301, 302, 303, 307):
2267                                     newurl = newurl.replace(' ', '%20') 
2268                                     newheaders = dict((k,v) for k,v in req.headers.items()
2269                                                       if k.lower() not in ("content-length", "content-type"))
2270                                     return HeadRequest(newurl, 
2271                                                        headers=newheaders,
2272                                                        origin_req_host=req.get_origin_req_host(), 
2273                                                        unverifiable=True) 
2274                                 else: 
2275                                     raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
2276                                     
2277                 class HTTPMethodFallback(urllib2.BaseHandler):
2278                         """
2279                         Fallback to GET if HEAD is not allowed (405 HTTP error)
2280                         """
2281                         def http_error_405(self, req, fp, code, msg, headers): 
2282                                 fp.read()
2283                                 fp.close()
2284
2285                                 newheaders = dict((k,v) for k,v in req.headers.items()
2286                                                   if k.lower() not in ("content-length", "content-type"))
2287                                 return self.parent.open(urllib2.Request(req.get_full_url(), 
2288                                                                  headers=newheaders, 
2289                                                                  origin_req_host=req.get_origin_req_host(), 
2290                                                                  unverifiable=True))
2291
2292                 # Build our opener
2293                 opener = urllib2.OpenerDirector() 
2294                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
2295                                         HTTPMethodFallback, HEADRedirectHandler,
2296                                         urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
2297                         opener.add_handler(handler())
2298
2299                 response = opener.open(HeadRequest(url))
2300                 new_url = response.geturl()
2301                 
2302                 if url == new_url: return False
2303                 
2304                 self.report_following_redirect(new_url)
2305                 self._downloader.download([new_url])
2306                 return True
2307
2308         def _real_extract(self, url):
2309                 if self._test_redirect(url): return
2310                 
2311                 # At this point we have a new video
2312                 self._downloader.increment_downloads()
2313
2314                 video_id = url.split('/')[-1]
2315                 request = urllib2.Request(url)
2316                 try:
2317                         self.report_download_webpage(video_id)
2318                         webpage = urllib2.urlopen(request).read()
2319                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2320                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2321                         return
2322                 except ValueError, err:
2323                         # since this is the last-resort InfoExtractor, if
2324                         # this error is thrown, it'll be thrown here
2325                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2326                         return
2327
2328                 self.report_extraction(video_id)
2329                 # Start with something easy: JW Player in SWFObject
2330                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2331                 if mobj is None:
2332                         # Broaden the search a little bit
2333                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2334                 if mobj is None:
2335                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2336                         return
2337
2338                 # It's possible that one of the regexes
2339                 # matched, but returned an empty group:
2340                 if mobj.group(1) is None:
2341                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2342                         return
2343
2344                 video_url = urllib.unquote(mobj.group(1))
2345                 video_id = os.path.basename(video_url)
2346
2347                 # here's a fun little line of code for you:
2348                 video_extension = os.path.splitext(video_id)[1][1:]
2349                 video_id = os.path.splitext(video_id)[0]
2350
2351                 # it's tempting to parse this further, but you would
2352                 # have to take into account all the variations like
2353                 #   Video Title - Site Name
2354                 #   Site Name | Video Title
2355                 #   Video Title - Tagline | Site Name
2356                 # and so on and so forth; it's just not practical
2357                 mobj = re.search(r'<title>(.*)</title>', webpage)
2358                 if mobj is None:
2359                         self._downloader.trouble(u'ERROR: unable to extract title')
2360                         return
2361                 video_title = mobj.group(1).decode('utf-8')
2362                 video_title = sanitize_title(video_title)
2363                 simple_title = _simplify_title(video_title)
2364
2365                 # video uploader is domain name
2366                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2367                 if mobj is None:
2368                         self._downloader.trouble(u'ERROR: unable to extract title')
2369                         return
2370                 video_uploader = mobj.group(1).decode('utf-8')
2371
2372                 try:
2373                         # Process video information
2374                         self._downloader.process_info({
2375                                 'id':           video_id.decode('utf-8'),
2376                                 'url':          video_url.decode('utf-8'),
2377                                 'uploader':     video_uploader,
2378                                 'upload_date':  u'NA',
2379                                 'title':        video_title,
2380                                 'stitle':       simple_title,
2381                                 'ext':          video_extension.decode('utf-8'),
2382                                 'format':       u'NA',
2383                                 'player_url':   None,
2384                         })
2385                 except UnavailableVideoError, err:
2386                         self._downloader.trouble(u'\nERROR: unable to download video')
2387
2388
2389 class YoutubeSearchIE(InfoExtractor):
2390         """Information Extractor for YouTube search queries."""
2391         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2392         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2393         _youtube_ie = None
2394         _max_youtube_results = 1000
2395         IE_NAME = u'youtube:search'
2396
2397         def __init__(self, youtube_ie, downloader=None):
2398                 InfoExtractor.__init__(self, downloader)
2399                 self._youtube_ie = youtube_ie
2400
2401         def report_download_page(self, query, pagenum):
2402                 """Report attempt to download playlist page with given number."""
2403                 query = query.decode(preferredencoding())
2404                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2405
2406         def _real_initialize(self):
2407                 self._youtube_ie.initialize()
2408
2409         def _real_extract(self, query):
2410                 mobj = re.match(self._VALID_URL, query)
2411                 if mobj is None:
2412                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2413                         return
2414
2415                 prefix, query = query.split(':')
2416                 prefix = prefix[8:]
2417                 query = query.encode('utf-8')
2418                 if prefix == '':
2419                         self._download_n_results(query, 1)
2420                         return
2421                 elif prefix == 'all':
2422                         self._download_n_results(query, self._max_youtube_results)
2423                         return
2424                 else:
2425                         try:
2426                                 n = long(prefix)
2427                                 if n <= 0:
2428                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2429                                         return
2430                                 elif n > self._max_youtube_results:
2431                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2432                                         n = self._max_youtube_results
2433                                 self._download_n_results(query, n)
2434                                 return
2435                         except ValueError: # parsing prefix as integer fails
2436                                 self._download_n_results(query, 1)
2437                                 return
2438
2439         def _download_n_results(self, query, n):
2440                 """Downloads a specified number of results for a query"""
2441
2442                 video_ids = []
2443                 pagenum = 0
2444                 limit = n
2445
2446                 while (50 * pagenum) < limit:
2447                         self.report_download_page(query, pagenum+1)
2448                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2449                         request = urllib2.Request(result_url)
2450                         try:
2451                                 data = urllib2.urlopen(request).read()
2452                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2453                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2454                                 return
2455                         api_response = json.loads(data)['data']
2456
2457                         new_ids = list(video['id'] for video in api_response['items'])
2458                         video_ids += new_ids
2459
2460                         limit = min(n, api_response['totalItems'])
2461                         pagenum += 1
2462
2463                 if len(video_ids) > n:
2464                         video_ids = video_ids[:n]
2465                 for id in video_ids:
2466                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2467                 return
2468
2469
2470 class GoogleSearchIE(InfoExtractor):
2471         """Information Extractor for Google Video search queries."""
2472         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2473         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2474         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2475         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2476         _google_ie = None
2477         _max_google_results = 1000
2478         IE_NAME = u'video.google:search'
2479
2480         def __init__(self, google_ie, downloader=None):
2481                 InfoExtractor.__init__(self, downloader)
2482                 self._google_ie = google_ie
2483
2484         def report_download_page(self, query, pagenum):
2485                 """Report attempt to download playlist page with given number."""
2486                 query = query.decode(preferredencoding())
2487                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2488
2489         def _real_initialize(self):
2490                 self._google_ie.initialize()
2491
2492         def _real_extract(self, query):
2493                 mobj = re.match(self._VALID_URL, query)
2494                 if mobj is None:
2495                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2496                         return
2497
2498                 prefix, query = query.split(':')
2499                 prefix = prefix[8:]
2500                 query = query.encode('utf-8')
2501                 if prefix == '':
2502                         self._download_n_results(query, 1)
2503                         return
2504                 elif prefix == 'all':
2505                         self._download_n_results(query, self._max_google_results)
2506                         return
2507                 else:
2508                         try:
2509                                 n = long(prefix)
2510                                 if n <= 0:
2511                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2512                                         return
2513                                 elif n > self._max_google_results:
2514                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2515                                         n = self._max_google_results
2516                                 self._download_n_results(query, n)
2517                                 return
2518                         except ValueError: # parsing prefix as integer fails
2519                                 self._download_n_results(query, 1)
2520                                 return
2521
2522         def _download_n_results(self, query, n):
2523                 """Downloads a specified number of results for a query"""
2524
2525                 video_ids = []
2526                 pagenum = 0
2527
2528                 while True:
2529                         self.report_download_page(query, pagenum)
2530                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2531                         request = urllib2.Request(result_url)
2532                         try:
2533                                 page = urllib2.urlopen(request).read()
2534                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2535                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2536                                 return
2537
2538                         # Extract video identifiers
2539                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2540                                 video_id = mobj.group(1)
2541                                 if video_id not in video_ids:
2542                                         video_ids.append(video_id)
2543                                         if len(video_ids) == n:
2544                                                 # Specified n videos reached
2545                                                 for id in video_ids:
2546                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2547                                                 return
2548
2549                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2550                                 for id in video_ids:
2551                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2552                                 return
2553
2554                         pagenum = pagenum + 1
2555
2556
2557 class YahooSearchIE(InfoExtractor):
2558         """Information Extractor for Yahoo! Video search queries."""
2559         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2560         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2561         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2562         _MORE_PAGES_INDICATOR = r'\s*Next'
2563         _yahoo_ie = None
2564         _max_yahoo_results = 1000
2565         IE_NAME = u'video.yahoo:search'
2566
2567         def __init__(self, yahoo_ie, downloader=None):
2568                 InfoExtractor.__init__(self, downloader)
2569                 self._yahoo_ie = yahoo_ie
2570
2571         def report_download_page(self, query, pagenum):
2572                 """Report attempt to download playlist page with given number."""
2573                 query = query.decode(preferredencoding())
2574                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2575
2576         def _real_initialize(self):
2577                 self._yahoo_ie.initialize()
2578
2579         def _real_extract(self, query):
2580                 mobj = re.match(self._VALID_URL, query)
2581                 if mobj is None:
2582                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2583                         return
2584
2585                 prefix, query = query.split(':')
2586                 prefix = prefix[8:]
2587                 query = query.encode('utf-8')
2588                 if prefix == '':
2589                         self._download_n_results(query, 1)
2590                         return
2591                 elif prefix == 'all':
2592                         self._download_n_results(query, self._max_yahoo_results)
2593                         return
2594                 else:
2595                         try:
2596                                 n = long(prefix)
2597                                 if n <= 0:
2598                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2599                                         return
2600                                 elif n > self._max_yahoo_results:
2601                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2602                                         n = self._max_yahoo_results
2603                                 self._download_n_results(query, n)
2604                                 return
2605                         except ValueError: # parsing prefix as integer fails
2606                                 self._download_n_results(query, 1)
2607                                 return
2608
2609         def _download_n_results(self, query, n):
2610                 """Downloads a specified number of results for a query"""
2611
2612                 video_ids = []
2613                 already_seen = set()
2614                 pagenum = 1
2615
2616                 while True:
2617                         self.report_download_page(query, pagenum)
2618                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2619                         request = urllib2.Request(result_url)
2620                         try:
2621                                 page = urllib2.urlopen(request).read()
2622                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2623                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2624                                 return
2625
2626                         # Extract video identifiers
2627                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2628                                 video_id = mobj.group(1)
2629                                 if video_id not in already_seen:
2630                                         video_ids.append(video_id)
2631                                         already_seen.add(video_id)
2632                                         if len(video_ids) == n:
2633                                                 # Specified n videos reached
2634                                                 for id in video_ids:
2635                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2636                                                 return
2637
2638                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2639                                 for id in video_ids:
2640                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2641                                 return
2642
2643                         pagenum = pagenum + 1
2644
2645
2646 class YoutubePlaylistIE(InfoExtractor):
2647         """Information Extractor for YouTube playlists."""
2648
2649         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2650         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2651         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
2652         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2653         _youtube_ie = None
2654         IE_NAME = u'youtube:playlist'
2655
2656         def __init__(self, youtube_ie, downloader=None):
2657                 InfoExtractor.__init__(self, downloader)
2658                 self._youtube_ie = youtube_ie
2659
2660         def report_download_page(self, playlist_id, pagenum):
2661                 """Report attempt to download playlist page with given number."""
2662                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2663
2664         def _real_initialize(self):
2665                 self._youtube_ie.initialize()
2666
2667         def _real_extract(self, url):
2668                 # Extract playlist id
2669                 mobj = re.match(self._VALID_URL, url)
2670                 if mobj is None:
2671                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2672                         return
2673
2674                 # Single video case
2675                 if mobj.group(3) is not None:
2676                         self._youtube_ie.extract(mobj.group(3))
2677                         return
2678
2679                 # Download playlist pages
2680                 # prefix is 'p' as default for playlists but there are other types that need extra care
2681                 playlist_prefix = mobj.group(1)
2682                 if playlist_prefix == 'a':
2683                         playlist_access = 'artist'
2684                 else:
2685                         playlist_prefix = 'p'
2686                         playlist_access = 'view_play_list'
2687                 playlist_id = mobj.group(2)
2688                 video_ids = []
2689                 pagenum = 1
2690
2691                 while True:
2692                         self.report_download_page(playlist_id, pagenum)
2693                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2694                         request = urllib2.Request(url)
2695                         try:
2696                                 page = urllib2.urlopen(request).read()
2697                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2698                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2699                                 return
2700
2701                         # Extract video identifiers
2702                         ids_in_page = []
2703                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2704                                 if mobj.group(1) not in ids_in_page:
2705                                         ids_in_page.append(mobj.group(1))
2706                         video_ids.extend(ids_in_page)
2707
2708                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2709                                 break
2710                         pagenum = pagenum + 1
2711
2712                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2713                 playlistend = self._downloader.params.get('playlistend', -1)
2714                 if playlistend == -1:
2715                         video_ids = video_ids[playliststart:]
2716                 else:
2717                         video_ids = video_ids[playliststart:playlistend]
2718
2719                 for id in video_ids:
2720                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2721                 return
2722
2723
2724 class YoutubeUserIE(InfoExtractor):
2725         """Information Extractor for YouTube users."""
2726
2727         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2728         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2729         _GDATA_PAGE_SIZE = 50
2730         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2731         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2732         _youtube_ie = None
2733         IE_NAME = u'youtube:user'
2734
2735         def __init__(self, youtube_ie, downloader=None):
2736                 InfoExtractor.__init__(self, downloader)
2737                 self._youtube_ie = youtube_ie
2738
2739         def report_download_page(self, username, start_index):
2740                 """Report attempt to download user page."""
2741                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2742                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2743
2744         def _real_initialize(self):
2745                 self._youtube_ie.initialize()
2746
2747         def _real_extract(self, url):
2748                 # Extract username
2749                 mobj = re.match(self._VALID_URL, url)
2750                 if mobj is None:
2751                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2752                         return
2753
2754                 username = mobj.group(1)
2755
2756                 # Download video ids using YouTube Data API. Result size per
2757                 # query is limited (currently to 50 videos) so we need to query
2758                 # page by page until there are no video ids - it means we got
2759                 # all of them.
2760
2761                 video_ids = []
2762                 pagenum = 0
2763
2764                 while True:
2765                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2766                         self.report_download_page(username, start_index)
2767
2768                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2769
2770                         try:
2771                                 page = urllib2.urlopen(request).read()
2772                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2773                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2774                                 return
2775
2776                         # Extract video identifiers
2777                         ids_in_page = []
2778
2779                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2780                                 if mobj.group(1) not in ids_in_page:
2781                                         ids_in_page.append(mobj.group(1))
2782
2783                         video_ids.extend(ids_in_page)
2784
2785                         # A little optimization - if current page is not
2786                         # "full", ie. does not contain PAGE_SIZE video ids then
2787                         # we can assume that this page is the last one - there
2788                         # are no more ids on further pages - no need to query
2789                         # again.
2790
2791                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2792                                 break
2793
2794                         pagenum += 1
2795
2796                 all_ids_count = len(video_ids)
2797                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2798                 playlistend = self._downloader.params.get('playlistend', -1)
2799
2800                 if playlistend == -1:
2801                         video_ids = video_ids[playliststart:]
2802                 else:
2803                         video_ids = video_ids[playliststart:playlistend]
2804
2805                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2806                                 (username, all_ids_count, len(video_ids)))
2807
2808                 for video_id in video_ids:
2809                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2810
2811
2812 class DepositFilesIE(InfoExtractor):
2813         """Information extractor for depositfiles.com"""
2814
2815         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2816         IE_NAME = u'DepositFiles'
2817
2818         def __init__(self, downloader=None):
2819                 InfoExtractor.__init__(self, downloader)
2820
2821         def report_download_webpage(self, file_id):
2822                 """Report webpage download."""
2823                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2824
2825         def report_extraction(self, file_id):
2826                 """Report information extraction."""
2827                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2828
2829         def _real_extract(self, url):
2830                 # At this point we have a new file
2831                 self._downloader.increment_downloads()
2832
2833                 file_id = url.split('/')[-1]
2834                 # Rebuild url in english locale
2835                 url = 'http://depositfiles.com/en/files/' + file_id
2836
2837                 # Retrieve file webpage with 'Free download' button pressed
2838                 free_download_indication = { 'gateway_result' : '1' }
2839                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2840                 try:
2841                         self.report_download_webpage(file_id)
2842                         webpage = urllib2.urlopen(request).read()
2843                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2844                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2845                         return
2846
2847                 # Search for the real file URL
2848                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2849                 if (mobj is None) or (mobj.group(1) is None):
2850                         # Try to figure out reason of the error.
2851                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2852                         if (mobj is not None) and (mobj.group(1) is not None):
2853                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2854                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2855                         else:
2856                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2857                         return
2858
2859                 file_url = mobj.group(1)
2860                 file_extension = os.path.splitext(file_url)[1][1:]
2861
2862                 # Search for file title
2863                 mobj = re.search(r'<b title="(.*?)">', webpage)
2864                 if mobj is None:
2865                         self._downloader.trouble(u'ERROR: unable to extract title')
2866                         return
2867                 file_title = mobj.group(1).decode('utf-8')
2868
2869                 try:
2870                         # Process file information
2871                         self._downloader.process_info({
2872                                 'id':           file_id.decode('utf-8'),
2873                                 'url':          file_url.decode('utf-8'),
2874                                 'uploader':     u'NA',
2875                                 'upload_date':  u'NA',
2876                                 'title':        file_title,
2877                                 'stitle':       file_title,
2878                                 'ext':          file_extension.decode('utf-8'),
2879                                 'format':       u'NA',
2880                                 'player_url':   None,
2881                         })
2882                 except UnavailableVideoError, err:
2883                         self._downloader.trouble(u'ERROR: unable to download file')
2884
2885
2886 class FacebookIE(InfoExtractor):
2887         """Information Extractor for Facebook"""
2888
2889         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2890         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2891         _NETRC_MACHINE = 'facebook'
2892         _available_formats = ['video', 'highqual', 'lowqual']
2893         _video_extensions = {
2894                 'video': 'mp4',
2895                 'highqual': 'mp4',
2896                 'lowqual': 'mp4',
2897         }
2898         IE_NAME = u'facebook'
2899
	def __init__(self, downloader=None):
		# All common setup is handled by the InfoExtractor base class.
		InfoExtractor.__init__(self, downloader)
2902
	def _reporter(self, message):
		"""Prefix message with the [facebook] tag and print it via the downloader."""
		self._downloader.to_screen(u'[facebook] %s' % message)
2906
	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')
2910
	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)
2914
	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)
2918
2919         def _parse_page(self, video_webpage):
2920                 """Extract video information from page"""
2921                 # General data
2922                 data = {'title': r'\("video_title", "(.*?)"\)',
2923                         'description': r'<div class="datawrap">(.*?)</div>',
2924                         'owner': r'\("video_owner_name", "(.*?)"\)',
2925                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2926                         }
2927                 video_info = {}
2928                 for piece in data.keys():
2929                         mobj = re.search(data[piece], video_webpage)
2930                         if mobj is not None:
2931                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2932
2933                 # Video urls
2934                 video_urls = {}
2935                 for fmt in self._available_formats:
2936                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2937                         if mobj is not None:
2938                                 # URL is in a Javascript segment inside an escaped Unicode format within
2939                                 # the generally utf-8 page
2940                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2941                 video_info['video_urls'] = video_urls
2942
2943                 return video_info
2944
	def _real_initialize(self):
		"""Log in to Facebook using --username/--password or .netrc credentials.

		Does nothing when no downloader is attached or no credentials are
		available. Login failures only emit warnings; extraction proceeds
		unauthenticated.
		"""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# No credentials found anywhere: continue without logging in
		if useremail is None:
			return

		# Log in by POSTing the mobile login form
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# The login form reappearing in the response means the login failed
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return
2988
2989         def _real_extract(self, url):
2990                 mobj = re.match(self._VALID_URL, url)
2991                 if mobj is None:
2992                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2993                         return
2994                 video_id = mobj.group('ID')
2995
2996                 # Get video webpage
2997                 self.report_video_webpage_download(video_id)
2998                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2999                 try:
3000                         page = urllib2.urlopen(request)
3001                         video_webpage = page.read()
3002                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3003                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3004                         return
3005
3006                 # Start extracting information
3007                 self.report_information_extraction(video_id)
3008
3009                 # Extract information
3010                 video_info = self._parse_page(video_webpage)
3011
3012                 # uploader
3013                 if 'owner' not in video_info:
3014                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
3015                         return
3016                 video_uploader = video_info['owner']
3017
3018                 # title
3019                 if 'title' not in video_info:
3020                         self._downloader.trouble(u'ERROR: unable to extract video title')
3021                         return
3022                 video_title = video_info['title']
3023                 video_title = video_title.decode('utf-8')
3024                 video_title = sanitize_title(video_title)
3025
3026                 simple_title = _simplify_title(video_title)
3027
3028                 # thumbnail image
3029                 if 'thumbnail' not in video_info:
3030                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
3031                         video_thumbnail = ''
3032                 else:
3033                         video_thumbnail = video_info['thumbnail']
3034
3035                 # upload date
3036                 upload_date = u'NA'
3037                 if 'upload_date' in video_info:
3038                         upload_time = video_info['upload_date']
3039                         timetuple = email.utils.parsedate_tz(upload_time)
3040                         if timetuple is not None:
3041                                 try:
3042                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
3043                                 except:
3044                                         pass
3045
3046                 # description
3047                 video_description = video_info.get('description', 'No description available.')
3048
3049                 url_map = video_info['video_urls']
3050                 if len(url_map.keys()) > 0:
3051                         # Decide which formats to download
3052                         req_format = self._downloader.params.get('format', None)
3053                         format_limit = self._downloader.params.get('format_limit', None)
3054
3055                         if format_limit is not None and format_limit in self._available_formats:
3056                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
3057                         else:
3058                                 format_list = self._available_formats
3059                         existing_formats = [x for x in format_list if x in url_map]
3060                         if len(existing_formats) == 0:
3061                                 self._downloader.trouble(u'ERROR: no known formats available for video')
3062                                 return
3063                         if req_format is None:
3064                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3065                         elif req_format == 'worst':
3066                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3067                         elif req_format == '-1':
3068                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3069                         else:
3070                                 # Specific format
3071                                 if req_format not in url_map:
3072                                         self._downloader.trouble(u'ERROR: requested format not available')
3073                                         return
3074                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
3075
3076                 for format_param, video_real_url in video_url_list:
3077
3078                         # At this point we have a new video
3079                         self._downloader.increment_downloads()
3080
3081                         # Extension
3082                         video_extension = self._video_extensions.get(format_param, 'mp4')
3083
3084                         try:
3085                                 # Process video information
3086                                 self._downloader.process_info({
3087                                         'id':           video_id.decode('utf-8'),
3088                                         'url':          video_real_url.decode('utf-8'),
3089                                         'uploader':     video_uploader.decode('utf-8'),
3090                                         'upload_date':  upload_date,
3091                                         'title':        video_title,
3092                                         'stitle':       simple_title,
3093                                         'ext':          video_extension.decode('utf-8'),
3094                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3095                                         'thumbnail':    video_thumbnail.decode('utf-8'),
3096                                         'description':  video_description.decode('utf-8'),
3097                                         'player_url':   None,
3098                                 })
3099                         except UnavailableVideoError, err:
3100                                 self._downloader.trouble(u'\nERROR: unable to download video')
3101
class BlipTVIE(InfoExtractor):
        """Information extractor for blip.tv"""

        # Matches any blip.tv path; group(1) is the path, used as an identifier.
        _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
        # Extracts a lowercase alphanumeric file extension from a media URL.
        _URL_EXT = r'^.*\.([a-z0-9]+)$'
        IE_NAME = u'blip.tv'

        def report_extraction(self, file_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

        def report_direct_download(self, title):
                """Report that the JSON request was answered with the video itself."""
                self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

        def _real_extract(self, url):
                """Extract video information from a blip.tv URL and queue the download.

                Two server behaviours are handled:
                1. The JSON request is answered directly with the video file
                   (Content-Type video/*): title/extension are derived from the
                   URL basename and the already-open handle is passed to the
                   downloader via the 'urlhandle' key.
                2. A regular JSON reply: parsed for the media URL and metadata.
                """
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                # Ask blip.tv for a JSON description of the video by appending
                # skin=json, preserving any query string already in the URL.
                if '?' in url:
                        cchar = '&'
                else:
                        cchar = '?'
                json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
                request = urllib2.Request(json_url)
                self.report_extraction(mobj.group(1))
                info = None
                try:
                        urlh = urllib2.urlopen(request)
                        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                                basename = url.split('/')[-1]
                                title,ext = os.path.splitext(basename)
                                title = title.decode('UTF-8')
                                ext = ext.replace('.', '')
                                self.report_direct_download(title)
                                info = {
                                        'id': title,
                                        'url': url,
                                        'title': title,
                                        'stitle': _simplify_title(title),
                                        'ext': ext,
                                        'urlhandle': urlh
                                }
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                        return
                if info is None: # Regular URL
                        try:
                                # Reuse the response opened above; its body is JSON.
                                json_code = urlh.read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                                return

                        try:
                                json_data = json.loads(json_code)
                                # Some replies wrap the payload in a 'Post' object.
                                if 'Post' in json_data:
                                        data = json_data['Post']
                                else:
                                        data = json_data

                                # NOTE(review): assumes datestamp looks like
                                # '11-21-10 08:13PM' — confirm '%m-%d-%y %H:%M%p'.
                                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                                video_url = data['media']['url']
                                umobj = re.match(self._URL_EXT, video_url)
                                if umobj is None:
                                        raise ValueError('Can not determine filename extension')
                                ext = umobj.group(1)

                                info = {
                                        'id': data['item_id'],
                                        'url': video_url,
                                        'uploader': data['display_name'],
                                        'upload_date': upload_date,
                                        'title': data['title'],
                                        'stitle': _simplify_title(data['title']),
                                        'ext': ext,
                                        'format': data['media']['mimeType'],
                                        'thumbnail': data['thumbnailUrl'],
                                        'description': data['description'],
                                        'player_url': data['embedUrl']
                                }
                        except (ValueError,KeyError), err:
                                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                                return

                self._downloader.increment_downloads()

                try:
                        self._downloader.process_info(info)
                except UnavailableVideoError, err:
                        self._downloader.trouble(u'\nERROR: unable to download video')
3194
3195
3196 class MyVideoIE(InfoExtractor):
3197         """Information Extractor for myvideo.de."""
3198
3199         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3200         IE_NAME = u'myvideo'
3201
3202         def __init__(self, downloader=None):
3203                 InfoExtractor.__init__(self, downloader)
3204         
3205         def report_download_webpage(self, video_id):
3206                 """Report webpage download."""
3207                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3208
3209         def report_extraction(self, video_id):
3210                 """Report information extraction."""
3211                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3212
3213         def _real_extract(self,url):
3214                 mobj = re.match(self._VALID_URL, url)
3215                 if mobj is None:
3216                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3217                         return
3218
3219                 video_id = mobj.group(1)
3220
3221                 # Get video webpage
3222                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3223                 try:
3224                         self.report_download_webpage(video_id)
3225                         webpage = urllib2.urlopen(request).read()
3226                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3227                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3228                         return
3229
3230                 self.report_extraction(video_id)
3231                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3232                                  webpage)
3233                 if mobj is None:
3234                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3235                         return
3236                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3237
3238                 mobj = re.search('<title>([^<]+)</title>', webpage)
3239                 if mobj is None:
3240                         self._downloader.trouble(u'ERROR: unable to extract title')
3241                         return
3242
3243                 video_title = mobj.group(1)
3244                 video_title = sanitize_title(video_title)
3245
3246                 simple_title = _simplify_title(video_title)
3247
3248                 try:
3249                         self._downloader.process_info({
3250                                 'id':           video_id,
3251                                 'url':          video_url,
3252                                 'uploader':     u'NA',
3253                                 'upload_date':  u'NA',
3254                                 'title':        video_title,
3255                                 'stitle':       simple_title,
3256                                 'ext':          u'flv',
3257                                 'format':       u'NA',
3258                                 'player_url':   None,
3259                         })
3260                 except UnavailableVideoError:
3261                         self._downloader.trouble(u'\nERROR: Unable to download video')
3262
class ComedyCentralIE(InfoExtractor):
        """Information extractor for The Daily Show and Colbert Report """

        # Accepts either a ':tds'/':colbert'-style shortcut (download newest
        # episode) or a full-episodes URL with an explicit episode path.
        _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
        IE_NAME = u'comedycentral'

        def report_extraction(self, episode_id):
                """Report the start of information extraction."""
                self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

        def report_config_download(self, episode_id):
                """Report the download of a media configuration file."""
                self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

        def report_index_download(self, episode_id):
                """Report the download of the show's episode index."""
                self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

        def report_player_url(self, episode_id):
                """Report resolution of the Flash player URL."""
                self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

        def _real_extract(self, url):
                """Extract and queue every media item of a Comedy Central episode.

                Flow: resolve shortcuts to a full-episodes URL, follow the
                redirect to a specific episode when needed, locate the Flash
                URI in the page, fetch the MRSS index, then one configuration
                XML per item, and download the highest-bitrate rendition.
                """
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                # Map shortcut names to the show's full-episodes landing page.
                if mobj.group('shortname'):
                        if mobj.group('shortname') in ('tds', 'thedailyshow'):
                                url = u'http://www.thedailyshow.com/full-episodes/'
                        else:
                                url = u'http://www.colbertnation.com/full-episodes/'
                        mobj = re.match(self._VALID_URL, url)
                        assert mobj is not None

                # No explicit episode means "download the newest one".
                dlNewest = not mobj.group('episode')
                if dlNewest:
                        epTitle = mobj.group('showname')
                else:
                        epTitle = mobj.group('episode')

                req = urllib2.Request(url)
                self.report_extraction(epTitle)
                try:
                        htmlHandle = urllib2.urlopen(req)
                        html = htmlHandle.read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                        return
                if dlNewest:
                        # The landing page redirects to the newest episode;
                        # re-parse the final URL to get its episode id.
                        url = htmlHandle.geturl()
                        mobj = re.match(self._VALID_URL, url)
                        if mobj is None:
                                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                                return
                        if mobj.group('episode') == '':
                                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                                return
                        epTitle = mobj.group('episode')

                # Find the Flash player URL and the mtvn media URI in the page.
                mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
                if len(mMovieParams) == 0:
                        self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                        return

                # Resolve the player URL through its redirect chain.
                playerUrl_raw = mMovieParams[0][0]
                self.report_player_url(epTitle)
                try:
                        urlHandle = urllib2.urlopen(playerUrl_raw)
                        playerUrl = urlHandle.geturl()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
                        return

                # Fetch the MRSS index that lists the episode's media items.
                uri = mMovieParams[0][1]
                indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
                self.report_index_download(epTitle)
                try:
                        indexXml = urllib2.urlopen(indexUrl).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
                        return

                idoc = xml.etree.ElementTree.fromstring(indexXml)
                itemEls = idoc.findall('.//item')
                for itemEl in itemEls:
                        # guid looks like '...:<show>.com:...:<shortId>'.
                        mediaId = itemEl.findall('./guid')[0].text
                        shortMediaId = mediaId.split(':')[-1]
                        showId = mediaId.split(':')[-2].replace('.com', '')
                        officialTitle = itemEl.findall('./title')[0].text
                        officialDate = itemEl.findall('./pubDate')[0].text

                        configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                                                urllib.urlencode({'uri': mediaId}))
                        configReq = urllib2.Request(configUrl)
                        self.report_config_download(epTitle)
                        try:
                                configXml = urllib2.urlopen(configReq).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                                return

                        # Collect (bitrate, url) pairs for every rendition.
                        cdoc = xml.etree.ElementTree.fromstring(configXml)
                        turls = []
                        for rendition in cdoc.findall('.//rendition'):
                                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                                turls.append(finfo)

                        if len(turls) == 0:
                                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                                continue

                        # For now, just pick the highest bitrate
                        format,video_url = turls[-1]

                        self._downloader.increment_downloads()

                        effTitle = showId + u'-' + epTitle
                        info = {
                                'id': shortMediaId,
                                'url': video_url,
                                'uploader': showId,
                                'upload_date': officialDate,
                                'title': effTitle,
                                'stitle': _simplify_title(effTitle),
                                'ext': 'mp4',
                                'format': format,
                                'thumbnail': None,
                                'description': officialTitle,
                                'player_url': playerUrl
                        }

                        try:
                                self._downloader.process_info(info)
                        except UnavailableVideoError, err:
                                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
                                continue
3397
3398
3399 class EscapistIE(InfoExtractor):
3400         """Information extractor for The Escapist """
3401
3402         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3403         IE_NAME = u'escapist'
3404
3405         def report_extraction(self, showName):
3406                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3407
3408         def report_config_download(self, showName):
3409                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3410
3411         def _real_extract(self, url):
3412                 htmlParser = HTMLParser.HTMLParser()
3413
3414                 mobj = re.match(self._VALID_URL, url)
3415                 if mobj is None:
3416                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3417                         return
3418                 showName = mobj.group('showname')
3419                 videoId = mobj.group('episode')
3420
3421                 self.report_extraction(showName)
3422                 try:
3423                         webPage = urllib2.urlopen(url).read()
3424                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3425                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3426                         return
3427
3428                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3429                 description = htmlParser.unescape(descMatch.group(1))
3430                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3431                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3432                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3433                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3434                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3435                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3436
3437                 self.report_config_download(showName)
3438                 try:
3439                         configJSON = urllib2.urlopen(configUrl).read()
3440                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3441                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3442                         return
3443
3444                 # Technically, it's JavaScript, not JSON
3445                 configJSON = configJSON.replace("'", '"')
3446
3447                 try:
3448                         config = json.loads(configJSON)
3449                 except (ValueError,), err:
3450                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3451                         return
3452
3453                 playlist = config['playlist']
3454                 videoUrl = playlist[1]['url']
3455
3456                 self._downloader.increment_downloads()
3457                 info = {
3458                         'id': videoId,
3459                         'url': videoUrl,
3460                         'uploader': showName,
3461                         'upload_date': None,
3462                         'title': showName,
3463                         'stitle': _simplify_title(showName),
3464                         'ext': 'flv',
3465                         'format': 'flv',
3466                         'thumbnail': imgUrl,
3467                         'description': description,
3468                         'player_url': playerUrl,
3469                 }
3470
3471                 try:
3472                         self._downloader.process_info(info)
3473                 except UnavailableVideoError, err:
3474                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3475
3476
3477 class CollegeHumorIE(InfoExtractor):
3478         """Information extractor for collegehumor.com"""
3479
3480         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3481         IE_NAME = u'collegehumor'
3482
3483         def report_webpage(self, video_id):
3484                 """Report information extraction."""
3485                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3486
3487         def report_extraction(self, video_id):
3488                 """Report information extraction."""
3489                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3490
3491         def _real_extract(self, url):
3492                 htmlParser = HTMLParser.HTMLParser()
3493
3494                 mobj = re.match(self._VALID_URL, url)
3495                 if mobj is None:
3496                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3497                         return
3498                 video_id = mobj.group('videoid')
3499
3500                 self.report_webpage(video_id)
3501                 request = urllib2.Request(url)
3502                 try:
3503                         webpage = urllib2.urlopen(request).read()
3504                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3505                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3506                         return
3507
3508                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3509                 if m is None:
3510                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3511                         return
3512                 internal_video_id = m.group('internalvideoid')
3513
3514                 info = {
3515                         'id': video_id,
3516                         'internal_id': internal_video_id,
3517                 }
3518
3519                 self.report_extraction(video_id)
3520                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3521                 try:
3522                         metaXml = urllib2.urlopen(xmlUrl).read()
3523                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3524                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3525                         return
3526
3527                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3528                 try:
3529                         videoNode = mdoc.findall('./video')[0]
3530                         info['description'] = videoNode.findall('./description')[0].text
3531                         info['title'] = videoNode.findall('./caption')[0].text
3532                         info['stitle'] = _simplify_title(info['title'])
3533                         info['url'] = videoNode.findall('./file')[0].text
3534                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3535                         info['ext'] = info['url'].rpartition('.')[2]
3536                         info['format'] = info['ext']
3537                 except IndexError:
3538                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3539                         return
3540
3541                 self._downloader.increment_downloads()
3542
3543                 try:
3544                         self._downloader.process_info(info)
3545                 except UnavailableVideoError, err:
3546                         self._downloader.trouble(u'\nERROR: unable to download video')
3547
3548
3549 class XVideosIE(InfoExtractor):
3550         """Information extractor for xvideos.com"""
3551
3552         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3553         IE_NAME = u'xvideos'
3554
3555         def report_webpage(self, video_id):
3556                 """Report information extraction."""
3557                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3558
3559         def report_extraction(self, video_id):
3560                 """Report information extraction."""
3561                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3562
3563         def _real_extract(self, url):
3564                 htmlParser = HTMLParser.HTMLParser()
3565
3566                 mobj = re.match(self._VALID_URL, url)
3567                 if mobj is None:
3568                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3569                         return
3570                 video_id = mobj.group(1).decode('utf-8')
3571
3572                 self.report_webpage(video_id)
3573
3574                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3575                 try:
3576                         webpage = urllib2.urlopen(request).read()
3577                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3578                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3579                         return
3580
3581                 self.report_extraction(video_id)
3582
3583
3584                 # Extract video URL
3585                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3586                 if mobj is None:
3587                         self._downloader.trouble(u'ERROR: unable to extract video url')
3588                         return
3589                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3590
3591
3592                 # Extract title
3593                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3594                 if mobj is None:
3595                         self._downloader.trouble(u'ERROR: unable to extract video title')
3596                         return
3597                 video_title = mobj.group(1).decode('utf-8')
3598
3599
3600                 # Extract video thumbnail
3601                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3602                 if mobj is None:
3603                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3604                         return
3605                 video_thumbnail = mobj.group(1).decode('utf-8')
3606
3607
3608
3609                 self._downloader.increment_downloads()
3610                 info = {
3611                         'id': video_id,
3612                         'url': video_url,
3613                         'uploader': None,
3614                         'upload_date': None,
3615                         'title': video_title,
3616                         'stitle': _simplify_title(video_title),
3617                         'ext': 'flv',
3618                         'format': 'flv',
3619                         'thumbnail': video_thumbnail,
3620                         'description': None,
3621                         'player_url': None,
3622                 }
3623
3624                 try:
3625                         self._downloader.process_info(info)
3626                 except UnavailableVideoError, err:
3627                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3628
3629
3630 class SoundcloudIE(InfoExtractor):
3631         """Information extractor for soundcloud.com
3632            To access the media, the uid of the song and a stream token
3633            must be extracted from the page source and the script must make
3634            a request to media.soundcloud.com/crossdomain.xml. Then
3635            the media can be grabbed by requesting from an url composed
3636            of the stream token and uid
3637          """
3638
3639         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3640         IE_NAME = u'soundcloud'
3641
3642         def __init__(self, downloader=None):
3643                 InfoExtractor.__init__(self, downloader)
3644
3645         def report_webpage(self, video_id):
3646                 """Report information extraction."""
3647                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3648
3649         def report_extraction(self, video_id):
3650                 """Report information extraction."""
3651                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3652
3653         def _real_extract(self, url):
3654                 htmlParser = HTMLParser.HTMLParser()
3655
3656                 mobj = re.match(self._VALID_URL, url)
3657                 if mobj is None:
3658                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3659                         return
3660
3661                 # extract uploader (which is in the url)
3662                 uploader = mobj.group(1).decode('utf-8')
3663                 # extract simple title (uploader + slug of song title)
3664                 slug_title =  mobj.group(2).decode('utf-8')
3665                 simple_title = uploader + '-' + slug_title
3666
3667                 self.report_webpage('%s/%s' % (uploader, slug_title))
3668
3669                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3670                 try:
3671                         webpage = urllib2.urlopen(request).read()
3672                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3673                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3674                         return
3675
3676                 self.report_extraction('%s/%s' % (uploader, slug_title))
3677
3678                 # extract uid and stream token that soundcloud hands out for access
3679                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3680                 if mobj:
3681                         video_id = mobj.group(1)
3682                         stream_token = mobj.group(2)
3683
3684                 # extract unsimplified title
3685                 mobj = re.search('"title":"(.*?)",', webpage)
3686                 if mobj:
3687                         title = mobj.group(1)
3688
3689                 # construct media url (with uid/token)
3690                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3691                 mediaURL = mediaURL % (video_id, stream_token)
3692
3693                 # description
3694                 description = u'No description available'
3695                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3696                 if mobj:
3697                         description = mobj.group(1)
3698                 
3699                 # upload date
3700                 upload_date = None
3701                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3702                 if mobj:
3703                         try:
3704                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3705                         except Exception, e:
3706                                 print str(e)
3707
3708                 # for soundcloud, a request to a cross domain is required for cookies
3709                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3710
3711                 try:
3712                         self._downloader.process_info({
3713                                 'id':           video_id.decode('utf-8'),
3714                                 'url':          mediaURL,
3715                                 'uploader':     uploader.decode('utf-8'),
3716                                 'upload_date':  upload_date,
3717                                 'title':        simple_title.decode('utf-8'),
3718                                 'stitle':       simple_title.decode('utf-8'),
3719                                 'ext':          u'mp3',
3720                                 'format':       u'NA',
3721                                 'player_url':   None,
3722                                 'description': description.decode('utf-8')
3723                         })
3724                 except UnavailableVideoError:
3725                         self._downloader.trouble(u'\nERROR: unable to download video')
3726
3727
3728 class InfoQIE(InfoExtractor):
3729         """Information extractor for infoq.com"""
3730
3731         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3732         IE_NAME = u'infoq'
3733
3734         def report_webpage(self, video_id):
3735                 """Report information extraction."""
3736                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3737
3738         def report_extraction(self, video_id):
3739                 """Report information extraction."""
3740                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3741
3742         def _real_extract(self, url):
3743                 htmlParser = HTMLParser.HTMLParser()
3744
3745                 mobj = re.match(self._VALID_URL, url)
3746                 if mobj is None:
3747                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3748                         return
3749
3750                 self.report_webpage(url)
3751
3752                 request = urllib2.Request(url)
3753                 try:
3754                         webpage = urllib2.urlopen(request).read()
3755                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3756                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3757                         return
3758
3759                 self.report_extraction(url)
3760
3761
3762                 # Extract video URL
3763                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3764                 if mobj is None:
3765                         self._downloader.trouble(u'ERROR: unable to extract video url')
3766                         return
3767                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3768
3769
3770                 # Extract title
3771                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3772                 if mobj is None:
3773                         self._downloader.trouble(u'ERROR: unable to extract video title')
3774                         return
3775                 video_title = mobj.group(1).decode('utf-8')
3776
3777                 # Extract description
3778                 video_description = u'No description available.'
3779                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3780                 if mobj is not None:
3781                         video_description = mobj.group(1).decode('utf-8')
3782
3783                 video_filename = video_url.split('/')[-1]
3784                 video_id, extension = video_filename.split('.')
3785
3786                 self._downloader.increment_downloads()
3787                 info = {
3788                         'id': video_id,
3789                         'url': video_url,
3790                         'uploader': None,
3791                         'upload_date': None,
3792                         'title': video_title,
3793                         'stitle': _simplify_title(video_title),
3794                         'ext': extension,
3795                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3796                         'thumbnail': None,
3797                         'description': video_description,
3798                         'player_url': None,
3799                 }
3800
3801                 try:
3802                         self._downloader.process_info(info)
3803                 except UnavailableVideoError, err:
3804                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3805
3806 class MixcloudIE(InfoExtractor):
3807         """Information extractor for www.mixcloud.com"""
3808         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3809         IE_NAME = u'mixcloud'
3810
3811         def __init__(self, downloader=None):
3812                 InfoExtractor.__init__(self, downloader)
3813
3814         def report_download_json(self, file_id):
3815                 """Report JSON download."""
3816                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3817
3818         def report_extraction(self, file_id):
3819                 """Report information extraction."""
3820                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3821
3822         def get_urls(self, jsonData, fmt, bitrate='best'):
3823                 """Get urls from 'audio_formats' section in json"""
3824                 file_url = None
3825                 try:
3826                         bitrate_list = jsonData[fmt]
3827                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3828                                 bitrate = max(bitrate_list) # select highest
3829
3830                         url_list = jsonData[fmt][bitrate]
3831                 except TypeError: # we have no bitrate info.
3832                         url_list = jsonData[fmt]
3833                                 
3834                 return url_list
3835
3836         def check_urls(self, url_list):
3837                 """Returns 1st active url from list"""
3838                 for url in url_list:
3839                         try:
3840                                 urllib2.urlopen(url)
3841                                 return url
3842                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3843                                 url = None
3844
3845                 return None
3846
3847         def _print_formats(self, formats):
3848                 print 'Available formats:'
3849                 for fmt in formats.keys():
3850                         for b in formats[fmt]:
3851                                 try:
3852                                         ext = formats[fmt][b][0]
3853                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3854                                 except TypeError: # we have no bitrate info
3855                                         ext = formats[fmt][0]
3856                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3857                                         break
3858
3859         def _real_extract(self, url):
3860                 mobj = re.match(self._VALID_URL, url)
3861                 if mobj is None:
3862                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3863                         return
3864                 # extract uploader & filename from url
3865                 uploader = mobj.group(1).decode('utf-8')
3866                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3867
3868                 # construct API request
3869                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3870                 # retrieve .json file with links to files
3871                 request = urllib2.Request(file_url)
3872                 try:
3873                         self.report_download_json(file_url)
3874                         jsonData = urllib2.urlopen(request).read()
3875                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3876                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3877                         return
3878
3879                 # parse JSON
3880                 json_data = json.loads(jsonData)
3881                 player_url = json_data['player_swf_url']
3882                 formats = dict(json_data['audio_formats'])
3883
3884                 req_format = self._downloader.params.get('format', None)
3885                 bitrate = None
3886
3887                 if self._downloader.params.get('listformats', None):
3888                         self._print_formats(formats)
3889                         return
3890
3891                 if req_format is None or req_format == 'best':
3892                         for format_param in formats.keys():
3893                                 url_list = self.get_urls(formats, format_param)
3894                                 # check urls
3895                                 file_url = self.check_urls(url_list)
3896                                 if file_url is not None:
3897                                         break # got it!
3898                 else:
3899                         if req_format not in formats.keys():
3900                                 self._downloader.trouble(u'ERROR: format is not available')
3901                                 return
3902
3903                         url_list = self.get_urls(formats, req_format)
3904                         file_url = self.check_urls(url_list)
3905                         format_param = req_format
3906
3907                 # We have audio
3908                 self._downloader.increment_downloads()
3909                 try:
3910                         # Process file information
3911                         self._downloader.process_info({
3912                                 'id': file_id.decode('utf-8'),
3913                                 'url': file_url.decode('utf-8'),
3914                                 'uploader':     uploader.decode('utf-8'),
3915                                 'upload_date': u'NA',
3916                                 'title': json_data['name'],
3917                                 'stitle': _simplify_title(json_data['name']),
3918                                 'ext': file_url.split('.')[-1].decode('utf-8'),
3919                                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3920                                 'thumbnail': json_data['thumbnail_url'],
3921                                 'description': json_data['description'],
3922                                 'player_url': player_url.decode('utf-8'),
3923                         })
3924                 except UnavailableVideoError, err:
3925                         self._downloader.trouble(u'ERROR: unable to download file')
3926
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report webpage download."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on the URL's query parameters.

		Three cases:
		  course + video -> extract and download a single video,
		  course only    -> a course page: recurse into each VideoPage link,
		  neither        -> the site root: recurse into each CoursePage link.
		The playlist branches build an info dict with type 'playlist' and
		re-enter the extractor via self.extract() for every reference.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Video metadata lives in a per-video XML file next to the media.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				# findall() came back empty: the XML lacks required elements.
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title from the page heading; fall back to the id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Description is optional; absent when the tag is missing.
			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect video page links in page order, de-duplicated.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			# Recurse: each reference is handled by the single-video branch.
			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Collect course page links; each recurses into the branch above.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
4045
4046 class MTVIE(InfoExtractor):
4047         """Information extractor for MTV.com"""
4048
4049         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
4050         IE_NAME = u'mtv'
4051
4052         def report_webpage(self, video_id):
4053                 """Report information extraction."""
4054                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
4055
4056         def report_extraction(self, video_id):
4057                 """Report information extraction."""
4058                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
4059
4060         def _real_extract(self, url):
4061                 mobj = re.match(self._VALID_URL, url)
4062                 if mobj is None:
4063                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4064                         return
4065                 if not mobj.group('proto'):
4066                         url = 'http://' + url
4067                 video_id = mobj.group('videoid')
4068                 self.report_webpage(video_id)
4069
4070                 request = urllib2.Request(url)
4071                 try:
4072                         webpage = urllib2.urlopen(request).read()
4073                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4074                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4075                         return
4076
4077                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4078                 if mobj is None:
4079                         self._downloader.trouble(u'ERROR: unable to extract song name')
4080                         return
4081                 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4082                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4083                 if mobj is None:
4084                         self._downloader.trouble(u'ERROR: unable to extract performer')
4085                         return
4086                 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4087                 video_title = performer + ' - ' + song_name 
4088
4089                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4090                 if mobj is None:
4091                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4092                         return
4093                 mtvn_uri = mobj.group(1)
4094
4095                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4096                 if mobj is None:
4097                         self._downloader.trouble(u'ERROR: unable to extract content id')
4098                         return
4099                 content_id = mobj.group(1)
4100
4101                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4102                 self.report_extraction(video_id)
4103                 request = urllib2.Request(videogen_url)
4104                 try:
4105                         metadataXml = urllib2.urlopen(request).read()
4106                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4107                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4108                         return
4109
4110                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4111                 renditions = mdoc.findall('.//rendition')
4112
4113                 # For now, always pick the highest quality.
4114                 rendition = renditions[-1]
4115
4116                 try:
4117                         _,_,ext = rendition.attrib['type'].partition('/')
4118                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4119                         video_url = rendition.find('./src').text
4120                 except KeyError:
4121                         self._downloader.trouble('Invalid rendition field.')
4122                         return
4123
4124                 self._downloader.increment_downloads()
4125                 info = {
4126                         'id': video_id,
4127                         'url': video_url,
4128                         'uploader': performer,
4129                         'title': video_title,
4130                         'stitle': _simplify_title(video_title),
4131                         'ext': ext,
4132                         'format': format,
4133                 }
4134
4135                 try:
4136                         self._downloader.process_info(info)
4137                 except UnavailableVideoError, err:
4138                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4139
4140
class PostProcessor(object):
	"""Base class for post processors.

	A PostProcessor is registered on a downloader through the
	downloader's add_post_processor() method. After every successful
	download, the downloader walks its chain of post processors and
	invokes run() on each one, feeding it first the download's info
	dictionary and then whatever the previous processor returned.

	Processing stops as soon as one run() call returns None, or when
	the end of the chain is reached.

	Like InfoExtractor objects, PostProcessor objects take part in a
	"mutual registration" with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the downloader this post processor reports to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		The "information" argument is an InfoExtractor-style dictionary
		carrying one extra key, "filepath", which names the downloaded
		file on disk.

		Returning None halts the post-processing chain; returning a
		dictionary (possibly the received one, with some fields changed)
		passes it on to the next processor in the chain. A
		PostProcessingError may also be raised and will be handled by
		the downloader that invoked this processor.
		"""
		return information # the base class performs no processing
4186
class AudioConversionError(Exception):
	"""Raised when ffmpeg/ffprobe fails while extracting or converting audio.

	Derives from Exception rather than BaseException so that generic
	'except Exception' handlers (and cleanup code) catch it; only
	exit-style exceptions like KeyboardInterrupt belong directly under
	BaseException. Existing catchers of AudioConversionError and of
	BaseException still see it, so this is backward compatible.
	"""
	def __init__(self, message):
		# Call the base initializer so args/str(e) carry the message.
		Exception.__init__(self, message)
		# Keep the .message attribute for callers that read it directly.
		self.message = message
4190
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track from a downloaded video.

	Relies on the external ffprobe and ffmpeg binaries: ffprobe detects the
	source audio codec, ffmpeg then remuxes (losslessly, when possible) or
	transcodes the audio into the preferred format.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		# preferredcodec: 'best', 'aac', 'mp3', 'vorbis', 'm4a' or 'wav';
		# None falls back to 'best' (keep the source codec when possible).
		# preferredquality: ffmpeg '-ab' bitrate specification (e.g. '128K').
		# keepvideo: when False, the source video file is deleted after a
		# successful conversion.
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path, or None.

		Runs 'ffprobe -show_streams' and scans its stdout. Returns None if
		ffprobe is missing, exits non-zero, or reports no audio stream.
		"""
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			# ffprobe binary not found / not executable
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				# In ffprobe output codec_name precedes codec_type within a
				# stream section, so audio_codec already holds this stream's
				# codec when we see that the stream is an audio one.
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Convert path into out_path with ffmpeg.

		codec is passed via '-acodec' (None omits the option, letting
		ffmpeg pick); more_opts is a list of extra ffmpeg arguments.
		Raises AudioConversionError on any failure.
		"""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			if isinstance(e, OSError) and e.errno == 2:
				# errno 2 == ENOENT: the ffmpeg binary itself is missing
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			# The last stderr line usually carries ffmpeg's actual error.
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		"""Extract/convert the audio of the downloaded file.

		See PostProcessor.run() for the chain contract; returns None (and
		logs) on any failure so the chain stops without aborting youtube-dl.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		# First branch: the source codec is already acceptable, so prefer a
		# lossless stream copy; otherwise (else branch) transcode.
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			# Deliberately broad: any conversion failure (including
			# unexpected errors) must not abort the whole download run.
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4315
4316
4317 def updateSelf(downloader, filename):
4318         ''' Update the program file with the latest version from the repository '''
4319         # Note: downloader only used for options
4320         if not os.access(filename, os.W_OK):
4321                 sys.exit('ERROR: no write permissions on %s' % filename)
4322
4323         downloader.to_screen(u'Updating to latest version...')
4324
4325         try:
4326                 try:
4327                         urlh = urllib.urlopen(UPDATE_URL)
4328                         newcontent = urlh.read()
4329                         
4330                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
4331                         if vmatch is not None and vmatch.group(1) == __version__:
4332                                 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4333                                 return
4334                 finally:
4335                         urlh.close()
4336         except (IOError, OSError), err:
4337                 sys.exit('ERROR: unable to download latest version')
4338
4339         try:
4340                 outf = open(filename, 'wb')
4341                 try:
4342                         outf.write(newcontent)
4343                 finally:
4344                         outf.close()
4345         except (IOError, OSError), err:
4346                 sys.exit('ERROR: unable to overwrite current version')
4347
4348         downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4349
def parseOpts():
	"""Build the option parser and parse the command line.

	Options from /etc/youtube-dl.conf and the per-user configuration file
	are prepended to sys.argv[1:]. Returns (parser, opts, args).
	"""
	def _readOptions(filename_bytes):
		# Read extra command-line arguments from a configuration file.
		try:
			optionf = open(filename_bytes)
		except IOError:
			return [] # silently skip if file is not present
		try:
			res = []
			for l in optionf:
				res += shlex.split(l, comments=True)
		finally:
			optionf.close()
		return res

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --option METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Best-effort terminal width detection; returns None when unknown.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			try:
				return int(columns)
			except ValueError:
				pass # non-numeric COLUMNS: fall through to stty

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except (IOError, OSError, ValueError, IndexError):
			# stty missing (e.g. Windows) or unparsable output. The previous
			# bare 'except' also swallowed KeyboardInterrupt/SystemExit.
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
	video_format.add_option('--write-srt',
			action='store_true', dest='writesubtitles',
			help='write video closed captions to a .srt file (currently youtube only)', default=False)
	video_format.add_option('--srt-lang',
			action='store', dest='subtitleslang', metavar='LANG',
			help='language of the closed captions to download (optional) use IETF language tags like \'en\'')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)
	verbosity.add_option('-v', '--verbose',
			action='store_true', dest='verbose', help='print various debugging information', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	# Group order here determines the order in --help output.
	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
4562
def gen_extractors():
	"""Return a list with an instance of every supported extractor.

	Order is significant: the first extractor whose suitable() accepts a
	URL handles it, so the catch-all GenericIE must come last.
	"""
	# A few extractors are shared: the search/playlist/user extractors
	# delegate the actual video extraction to these instances.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()

	ies = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
		# Fallback: matches any URL, so it must stay at the end.
		GenericIE(),
	]
	return ies
4599
4600 def _real_main():
4601         parser, opts, args = parseOpts()
4602
4603         # Open appropriate CookieJar
4604         if opts.cookiefile is None:
4605                 jar = cookielib.CookieJar()
4606         else:
4607                 try:
4608                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4609                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4610                                 jar.load()
4611                 except (IOError, OSError), err:
4612                         sys.exit(u'ERROR: unable to open cookie file')
4613
4614         # Dump user agent
4615         if opts.dump_user_agent:
4616                 print std_headers['User-Agent']
4617                 sys.exit(0)
4618
4619         # Batch file verification
4620         batchurls = []
4621         if opts.batchfile is not None:
4622                 try:
4623                         if opts.batchfile == '-':
4624                                 batchfd = sys.stdin
4625                         else:
4626                                 batchfd = open(opts.batchfile, 'r')
4627                         batchurls = batchfd.readlines()
4628                         batchurls = [x.strip() for x in batchurls]
4629                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4630                 except IOError:
4631                         sys.exit(u'ERROR: batch file could not be read')
4632         all_urls = batchurls + args
4633         all_urls = map(lambda url: url.strip(), all_urls)
4634
4635         # General configuration
4636         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4637         proxy_handler = urllib2.ProxyHandler()
4638         opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4639         urllib2.install_opener(opener)
4640         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4641
4642         if opts.verbose:
4643                 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4644
4645         extractors = gen_extractors()
4646
4647         if opts.list_extractors:
4648                 for ie in extractors:
4649                         print(ie.IE_NAME)
4650                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4651                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4652                         for mu in matchedUrls:
4653                                 print(u'  ' + mu)
4654                 sys.exit(0)
4655
4656         # Conflicting, missing and erroneous options
4657         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4658                 parser.error(u'using .netrc conflicts with giving username/password')
4659         if opts.password is not None and opts.username is None:
4660                 parser.error(u'account username missing')
4661         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4662                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4663         if opts.usetitle and opts.useliteral:
4664                 parser.error(u'using title conflicts with using literal title')
4665         if opts.username is not None and opts.password is None:
4666                 opts.password = getpass.getpass(u'Type account password and press return:')
4667         if opts.ratelimit is not None:
4668                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4669                 if numeric_limit is None:
4670                         parser.error(u'invalid rate limit specified')
4671                 opts.ratelimit = numeric_limit
4672         if opts.retries is not None:
4673                 try:
4674                         opts.retries = long(opts.retries)
4675                 except (TypeError, ValueError), err:
4676                         parser.error(u'invalid retry count specified')
4677         try:
4678                 opts.playliststart = int(opts.playliststart)
4679                 if opts.playliststart <= 0:
4680                         raise ValueError(u'Playlist start must be positive')
4681         except (TypeError, ValueError), err:
4682                 parser.error(u'invalid playlist start number specified')
4683         try:
4684                 opts.playlistend = int(opts.playlistend)
4685                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4686                         raise ValueError(u'Playlist end must be greater than playlist start')
4687         except (TypeError, ValueError), err:
4688                 parser.error(u'invalid playlist end number specified')
4689         if opts.extractaudio:
4690                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4691                         parser.error(u'invalid audio format specified')
4692
4693         # File downloader
4694         fd = FileDownloader({
4695                 'usenetrc': opts.usenetrc,
4696                 'username': opts.username,
4697                 'password': opts.password,
4698                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4699                 'forceurl': opts.geturl,
4700                 'forcetitle': opts.gettitle,
4701                 'forcethumbnail': opts.getthumbnail,
4702                 'forcedescription': opts.getdescription,
4703                 'forcefilename': opts.getfilename,
4704                 'forceformat': opts.getformat,
4705                 'simulate': opts.simulate,
4706                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4707                 'format': opts.format,
4708                 'format_limit': opts.format_limit,
4709                 'listformats': opts.listformats,
4710                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4711                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4712                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4713                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4714                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4715                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4716                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4717                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4718                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4719                         or u'%(id)s.%(ext)s'),
4720                 'ignoreerrors': opts.ignoreerrors,
4721                 'ratelimit': opts.ratelimit,
4722                 'nooverwrites': opts.nooverwrites,
4723                 'retries': opts.retries,
4724                 'continuedl': opts.continue_dl,
4725                 'noprogress': opts.noprogress,
4726                 'playliststart': opts.playliststart,
4727                 'playlistend': opts.playlistend,
4728                 'logtostderr': opts.outtmpl == '-',
4729                 'consoletitle': opts.consoletitle,
4730                 'nopart': opts.nopart,
4731                 'updatetime': opts.updatetime,
4732                 'writedescription': opts.writedescription,
4733                 'writeinfojson': opts.writeinfojson,
4734                 'writesubtitles': opts.writesubtitles,
4735                 'subtitleslang': opts.subtitleslang,
4736                 'matchtitle': opts.matchtitle,
4737                 'rejecttitle': opts.rejecttitle,
4738                 'max_downloads': opts.max_downloads,
4739                 'prefer_free_formats': opts.prefer_free_formats,
4740                 'verbose': opts.verbose,
4741                 })
4742         for extractor in extractors:
4743                 fd.add_info_extractor(extractor)
4744
4745         # PostProcessors
4746         if opts.extractaudio:
4747                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4748
4749         # Update version
4750         if opts.update_self:
4751                 updateSelf(fd, sys.argv[0])
4752
4753         # Maybe do nothing
4754         if len(all_urls) < 1:
4755                 if not opts.update_self:
4756                         parser.error(u'you must provide at least one URL')
4757                 else:
4758                         sys.exit()
4759         
4760         try:
4761                 retcode = fd.download(all_urls)
4762         except MaxDownloadsReached:
4763                 fd.to_screen(u'--max-download limit reached, aborting.')
4764                 retcode = 101
4765
4766         # Dump cookie jar if requested
4767         if opts.cookiefile is not None:
4768                 try:
4769                         jar.save()
4770                 except (IOError, OSError), err:
4771                         sys.exit(u'ERROR: unable to save cookie jar')
4772
4773         sys.exit(retcode)
4774
def main():
	"""Command-line entry point.

	Runs _real_main() and translates the known terminal exceptions into
	process exit statuses: DownloadError exits with code 1, SameFileError
	and KeyboardInterrupt exit with an explanatory message on stderr.
	Any SystemExit raised inside _real_main() propagates unchanged.
	"""
	try:
		_real_main()
		return
	except DownloadError:
		# Download errors were already reported; just signal failure.
		status = 1
	except SameFileError:
		status = u'ERROR: fixed output name but more than one file to download'
	except KeyboardInterrupt:
		status = u'\nERROR: Interrupted by user'
	sys.exit(status)
4784
# Invoke the CLI entry point only when executed as a script, not on import.
if __name__ == '__main__':
	main()
4787
4788 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: