_ Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 __authors__  = (
   5         'Ricardo Garcia Gonzalez',
   6         'Danny Colligan',
   7         'Benjamin Johnson',
   8         'Vasyl\' Vavrychuk',
   9         'Witold Baryluk',
  10         'Paweł Paprota',
  11         'Gergely Imreh',
  12         'Rogério Brito',
  13         'Philipp Hagemeister',
  14         'Sören Schulze',
  15         'Kevin Ngo',
  16         'Ori Avtalion',
  17         'shizeeg',
  18         )
  19
  20 __license__ = 'Public Domain'
  21 __version__ = '2012.02.27'
  22
  23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
  24
  25
  26 import cookielib
  27 import datetime
  28 import getpass
  29 import gzip
  30 import htmlentitydefs
  31 import HTMLParser
  32 import httplib
  33 import locale
  34 import math
  35 import netrc
  36 import optparse
  37 import os
  38 import os.path
  39 import re
  40 import shlex
  41 import socket
  42 import string
  43 import subprocess
  44 import sys
  45 import time
  46 import urllib
  47 import urllib2
  48 import warnings
  49 import zlib
  50
  51 if os.name == 'nt':
  52         import ctypes
  53
  54 try:
  55         import email.utils
  56 except ImportError: # Python 2.4
  57         import email.Utils
  58 try:
  59         import cStringIO as StringIO
  60 except ImportError:
  61         import StringIO
  62
  63 # parse_qs was moved from the cgi module to the urlparse module recently.
  64 try:
  65         from urlparse import parse_qs
  66 except ImportError:
  67         from cgi import parse_qs
  68
  69 try:
  70         import lxml.etree
  71 except ImportError:
  72         pass # Handled below
  73
  74 try:
  75         import xml.etree.ElementTree
  76 except ImportError: # Python<2.5: Not officially supported, but let it slip
  77         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
  78
  79 std_headers = {
  80         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
  81         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  82         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  83         'Accept-Encoding': 'gzip, deflate',
  84         'Accept-Language': 'en-us,en;q=0.5',
  85 }
  86
  87 try:
  88         import json
  89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
  90         import re
  91         class json(object):
  92                 @staticmethod
  93                 def loads(s):
  94                         s = s.decode('UTF-8')
  95                         def raiseError(msg, i):
  96                                 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
  97                         def skipSpace(i, expectMore=True):
  98                                 while i < len(s) and s[i] in ' \t\r\n':
  99                                         i += 1
 100                                 if expectMore:
 101                                         if i >= len(s):
 102                                                 raiseError('Premature end', i)
 103                                 return i
 104                         def decodeEscape(match):
 105                                 esc = match.group(1)
 106                                 _STATIC = {
 107                                         '"': '"',
 108                                         '\\': '\\',
 109                                         '/': '/',
 110                                         'b': unichr(0x8),
 111                                         'f': unichr(0xc),
 112                                         'n': '\n',
 113                                         'r': '\r',
 114                                         't': '\t',
 115                                 }
 116                                 if esc in _STATIC:
 117                                         return _STATIC[esc]
 118                                 if esc[0] == 'u':
 119                                         if len(esc) == 1+4:
 120                                                 return unichr(int(esc[1:5], 16))
 121                                         if len(esc) == 5+6 and esc[5:7] == '\\u':
 122                                                 hi = int(esc[1:5], 16)
 123                                                 low = int(esc[7:11], 16)
 124                                                 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
 125                                 raise ValueError('Unknown escape ' + str(esc))
 126                         def parseString(i):
 127                                 i += 1
 128                                 e = i
 129                                 while True:
 130                                         e = s.index('"', e)
 131                                         bslashes = 0
 132                                         while s[e-bslashes-1] == '\\':
 133                                                 bslashes += 1
 134                                         if bslashes % 2 == 1:
 135                                                 e += 1
 136                                                 continue
 137                                         break
 138                                 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
 139                                 stri = rexp.sub(decodeEscape, s[i:e])
 140                                 return (e+1,stri)
 141                         def parseObj(i):
 142                                 i += 1
 143                                 res = {}
 144                                 i = skipSpace(i)
 145                                 if s[i] == '}': # Empty dictionary
 146                                         return (i+1,res)
 147                                 while True:
 148                                         if s[i] != '"':
 149                                                 raiseError('Expected a string object key', i)
 150                                         i,key = parseString(i)
 151                                         i = skipSpace(i)
 152                                         if i >= len(s) or s[i] != ':':
 153                                                 raiseError('Expected a colon', i)
 154                                         i,val = parse(i+1)
 155                                         res[key] = val
 156                                         i = skipSpace(i)
 157                                         if s[i] == '}':
 158                                                 return (i+1, res)
 159                                         if s[i] != ',':
 160                                                 raiseError('Expected comma or closing curly brace', i)
 161                                         i = skipSpace(i+1)
 162                         def parseArray(i):
 163                                 res = []
 164                                 i = skipSpace(i+1)
 165                                 if s[i] == ']': # Empty array
 166                                         return (i+1,res)
 167                                 while True:
 168                                         i,val = parse(i)
 169                                         res.append(val)
 170                                         i = skipSpace(i) # Raise exception if premature end
 171                                         if s[i] == ']':
 172                                                 return (i+1, res)
 173                                         if s[i] != ',':
 174                                                 raiseError('Expected a comma or closing bracket', i)
 175                                         i = skipSpace(i+1)
 176                         def parseDiscrete(i):
 177                                 for k,v in {'true': True, 'false': False, 'null': None}.items():
 178                                         if s.startswith(k, i):
 179                                                 return (i+len(k), v)
 180                                 raiseError('Not a boolean (or null)', i)
 181                         def parseNumber(i):
 182                                 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
 183                                 if mobj is None:
 184                                         raiseError('Not a number', i)
 185                                 nums = mobj.group(1)
 186                                 if '.' in nums or 'e' in nums or 'E' in nums:
 187                                         return (i+len(nums), float(nums))
 188                                 return (i+len(nums), int(nums))
 189                         CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
 190                         def parse(i):
 191                                 i = skipSpace(i)
 192                                 i,res = CHARMAP.get(s[i], parseNumber)(i)
 193                                 i = skipSpace(i, False)
 194                                 return (i,res)
 195                         i,res = parse(0)
 196                         if i < len(s):
 197                                 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
 198                         return res
 199
 200 def preferredencoding():
 201         """Get preferred encoding.
 202
 203         Returns the best encoding scheme for the system, based on
 204         locale.getpreferredencoding() and some further tweaks.
 205         """
 206         def yield_preferredencoding():
 207                 try:
 208                         pref = locale.getpreferredencoding()
 209                         u'TEST'.encode(pref)
 210                 except:
 211                         pref = 'UTF-8'
 212                 while True:
 213                         yield pref
 214         return yield_preferredencoding().next()
 215
 216
 217 def htmlentity_transform(matchobj):
 218         """Transforms an HTML entity to a Unicode character.
 219
 220         This function receives a match object and is intended to be used with
 221         the re.sub() function.
 222         """
 223         entity = matchobj.group(1)
 224
 225         # Known non-numeric HTML entity
 226         if entity in htmlentitydefs.name2codepoint:
 227                 return unichr(htmlentitydefs.name2codepoint[entity])
 228
 229         # Unicode character
 230         mobj = re.match(ur'(?u)#(x?\d+)', entity)
 231         if mobj is not None:
 232                 numstr = mobj.group(1)
 233                 if numstr.startswith(u'x'):
 234                         base = 16
 235                         numstr = u'0%s' % numstr
 236                 else:
 237                         base = 10
 238                 return unichr(long(numstr, base))
 239
 240         # Unknown entity in name, return its literal representation
 241         return (u'&%s;' % entity)
 242
 243
 244 def sanitize_title(utitle):
 245         """Sanitizes a video title so it could be used as part of a filename."""
 246         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 247         return utitle.replace(unicode(os.sep), u'%')
 248
 249
 250 def sanitize_open(filename, open_mode):
 251         """Try to open the given filename, and slightly tweak it if this fails.
 252
 253         Attempts to open the given filename. If this fails, it tries to change
 254         the filename slightly, step by step, until it's either able to open it
 255         or it fails and raises a final exception, like the standard open()
 256         function.
 257
 258         It returns the tuple (stream, definitive_file_name).
 259         """
 260         try:
 261                 if filename == u'-':
 262                         if sys.platform == 'win32':
 263                                 import msvcrt
 264                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 265                         return (sys.stdout, filename)
 266                 stream = open(_encodeFilename(filename), open_mode)
 267                 return (stream, filename)
 268         except (IOError, OSError), err:
 269                 # In case of error, try to remove win32 forbidden chars
 270                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 271
 272                 # An exception here should be caught in the caller
 273                 stream = open(_encodeFilename(filename), open_mode)
 274                 return (stream, filename)
 275
 276
 277 def timeconvert(timestr):
 278         """Convert RFC 2822 defined time string into system timestamp"""
 279         timestamp = None
 280         timetuple = email.utils.parsedate_tz(timestr)
 281         if timetuple is not None:
 282                 timestamp = email.utils.mktime_tz(timetuple)
 283         return timestamp
 284
 285 def _simplify_title(title):
 286         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
 287         return expr.sub(u'_', title).strip(u'_')
 288
 289 def _orderedSet(iterable):
 290         """ Remove all duplicates from the input iterable """
 291         res = []
 292         for el in iterable:
 293                 if el not in res:
 294                         res.append(el)
 295         return res
 296
 297 def _unescapeHTML(s):
 298         """
 299         @param s a string (of type unicode)
 300         """
 301         assert type(s) == type(u'')
 302
 303         htmlParser = HTMLParser.HTMLParser()
 304         return htmlParser.unescape(s)
 305
 306 def _encodeFilename(s):
 307         """
 308         @param s The name of the file (of type unicode)
 309         """
 310
 311         assert type(s) == type(u'')
 312
 313         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
 314                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 315                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 316                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 317                 return s
 318         else:
 319                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 320
 321 class DownloadError(Exception):
 322         """Download Error exception.
 323
 324         This exception may be thrown by FileDownloader objects if they are not
 325         configured to continue on errors. They will contain the appropriate
 326         error message.
 327         """
 328         pass
 329
 330
 331 class SameFileError(Exception):
 332         """Same File exception.
 333
 334         This exception will be thrown by FileDownloader objects if they detect
 335         multiple files would have to be downloaded to the same file on disk.
 336         """
 337         pass
 338
 339
 340 class PostProcessingError(Exception):
 341         """Post Processing exception.
 342
 343         This exception may be raised by PostProcessor's .run() method to
 344         indicate an error in the postprocessing task.
 345         """
 346         pass
 347
 348 class MaxDownloadsReached(Exception):
 349         """ --max-downloads limit has been reached. """
 350         pass
 351
 352
 353 class UnavailableVideoError(Exception):
 354         """Unavailable Format exception.
 355
 356         This exception will be thrown when a video is requested
 357         in a format that is not available for that video.
 358         """
 359         pass
 360
 361
 362 class ContentTooShortError(Exception):
 363         """Content Too Short exception.
 364
 365         This exception may be raised by FileDownloader objects when a file they
 366         download is too small for what the server announced first, indicating
 367         the connection was probably interrupted.
 368         """
 369         # Both in bytes
 370         downloaded = None
 371         expected = None
 372
 373         def __init__(self, downloaded, expected):
 374                 self.downloaded = downloaded
 375                 self.expected = expected
 376
 377
 378 class YoutubeDLHandler(urllib2.HTTPHandler):
 379         """Handler for HTTP requests and responses.
 380
 381         This class, when installed with an OpenerDirector, automatically adds
 382         the standard headers to every HTTP request and handles gzipped and
 383         deflated responses from web servers. If compression is to be avoided in
 384         a particular request, the original request in the program code only has
 385         to include the HTTP header "Youtubedl-No-Compression", which will be
 386         removed before making the real request.
 387
 388         Part of this code was copied from:
 389
 390         http://techknack.net/python-urllib2-handlers/
 391
 392         Andrew Rowls, the author of that code, agreed to release it to the
 393         public domain.
 394         """
 395
 396         @staticmethod
 397         def deflate(data):
 398                 try:
 399                         return zlib.decompress(data, -zlib.MAX_WBITS)
 400                 except zlib.error:
 401                         return zlib.decompress(data)
 402
 403         @staticmethod
 404         def addinfourl_wrapper(stream, headers, url, code):
 405                 if hasattr(urllib2.addinfourl, 'getcode'):
 406                         return urllib2.addinfourl(stream, headers, url, code)
 407                 ret = urllib2.addinfourl(stream, headers, url)
 408                 ret.code = code
 409                 return ret
 410
 411         def http_request(self, req):
 412                 for h in std_headers:
 413                         if h in req.headers:
 414                                 del req.headers[h]
 415                         req.add_header(h, std_headers[h])
 416                 if 'Youtubedl-no-compression' in req.headers:
 417                         if 'Accept-encoding' in req.headers:
 418                                 del req.headers['Accept-encoding']
 419                         del req.headers['Youtubedl-no-compression']
 420                 return req
 421
 422         def http_response(self, req, resp):
 423                 old_resp = resp
 424                 # gzip
 425                 if resp.headers.get('Content-encoding', '') == 'gzip':
 426                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 427                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 428                         resp.msg = old_resp.msg
 429                 # deflate
 430                 if resp.headers.get('Content-encoding', '') == 'deflate':
 431                         gz = StringIO.StringIO(self.deflate(resp.read()))
 432                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 433                         resp.msg = old_resp.msg
 434                 return resp
 435
 436
 437 class FileDownloader(object):
 438         """File Downloader class.
 439
 440         File downloader objects are the ones responsible of downloading the
 441         actual video file and writing it to disk if the user has requested
 442         it, among some other tasks. In most cases there should be one per
 443         program. As, given a video URL, the downloader doesn't know how to
 444         extract all the needed information, task that InfoExtractors do, it
 445         has to pass the URL to one of them.
 446
 447         For this, file downloader objects have a method that allows
 448         InfoExtractors to be registered in a given order. When it is passed
 449         a URL, the file downloader handles it to the first InfoExtractor it
 450         finds that reports being able to handle it. The InfoExtractor extracts
 451         all the information about the video or videos the URL refers to, and
 452         asks the FileDownloader to process the video information, possibly
 453         downloading the video.
 454
 455         File downloaders accept a lot of parameters. In order not to saturate
 456         the object constructor with arguments, it receives a dictionary of
 457         options instead. These options are available through the params
 458         attribute for the InfoExtractors to use. The FileDownloader also
 459         registers itself as the downloader in charge for the InfoExtractors
 460         that are added to it, so this is a "mutual registration".
 461
 462         Available options:
 463
 464         username:         Username for authentication purposes.
 465         password:         Password for authentication purposes.
 466         usenetrc:         Use netrc for authentication instead.
 467         quiet:            Do not print messages to stdout.
 468         forceurl:         Force printing final URL.
 469         forcetitle:       Force printing title.
 470         forcethumbnail:   Force printing thumbnail URL.
 471         forcedescription: Force printing description.
 472         forcefilename:    Force printing final filename.
 473         simulate:         Do not download the video files.
 474         format:           Video format code.
 475         format_limit:     Highest quality format to try.
 476         outtmpl:          Template for output names.
 477         ignoreerrors:     Do not stop on download errors.
 478         ratelimit:        Download speed limit, in bytes/sec.
 479         nooverwrites:     Prevent overwriting files.
 480         retries:          Number of times to retry for HTTP error 5xx
 481         continuedl:       Try to continue downloads if possible.
 482         noprogress:       Do not print the progress bar.
 483         playliststart:    Playlist item to start at.
 484         playlistend:      Playlist item to end at.
 485         matchtitle:       Download only matching titles.
 486         rejecttitle:      Reject downloads for matching titles.
 487         logtostderr:      Log messages to stderr instead of stdout.
 488         consoletitle:     Display progress in console window's titlebar.
 489         nopart:           Do not use temporary .part files.
 490         updatetime:       Use the Last-modified header to set output file timestamps.
 491         writedescription: Write the video description to a .description file
 492         writeinfojson:    Write the video description to a .info.json file
 493         writesubtitles:   Write the video subtitles to a .srt file
 494         subtitleslang:    Language of the subtitles to download
 495         """
 496
 497         params = None
 498         _ies = []
 499         _pps = []
 500         _download_retcode = None
 501         _num_downloads = None
 502         _screen_file = None
 503
 504         def __init__(self, params):
 505                 """Create a FileDownloader object with the given options."""
 506                 self._ies = []
 507                 self._pps = []
 508                 self._download_retcode = 0
 509                 self._num_downloads = 0
 510                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 511                 self.params = params
 512
 513         @staticmethod
 514         def format_bytes(bytes):
 515                 if bytes is None:
 516                         return 'N/A'
 517                 if type(bytes) is str:
 518                         bytes = float(bytes)
 519                 if bytes == 0.0:
 520                         exponent = 0
 521                 else:
 522                         exponent = long(math.log(bytes, 1024.0))
 523                 suffix = 'bkMGTPEZY'[exponent]
 524                 converted = float(bytes) / float(1024 ** exponent)
 525                 return '%.2f%s' % (converted, suffix)
 526
 527         @staticmethod
 528         def calc_percent(byte_counter, data_len):
 529                 if data_len is None:
 530                         return '---.-%'
 531                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 532
 533         @staticmethod
 534         def calc_eta(start, now, total, current):
 535                 if total is None:
 536                         return '--:--'
 537                 dif = now - start
 538                 if current == 0 or dif < 0.001: # One millisecond
 539                         return '--:--'
 540                 rate = float(current) / dif
 541                 eta = long((float(total) - float(current)) / rate)
 542                 (eta_mins, eta_secs) = divmod(eta, 60)
 543                 if eta_mins > 99:
 544                         return '--:--'
 545                 return '%02d:%02d' % (eta_mins, eta_secs)
 546
 547         @staticmethod
 548         def calc_speed(start, now, bytes):
 549                 dif = now - start
 550                 if bytes == 0 or dif < 0.001: # One millisecond
 551                         return '%10s' % '---b/s'
 552                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 553
 554         @staticmethod
 555         def best_block_size(elapsed_time, bytes):
 556                 new_min = max(bytes / 2.0, 1.0)
 557                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 558                 if elapsed_time < 0.001:
 559                         return long(new_max)
 560                 rate = bytes / elapsed_time
 561                 if rate > new_max:
 562                         return long(new_max)
 563                 if rate < new_min:
 564                         return long(new_min)
 565                 return long(rate)
 566
 567         @staticmethod
 568         def parse_bytes(bytestr):
 569                 """Parse a string indicating a byte quantity into a long integer."""
 570                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 571                 if matchobj is None:
 572                         return None
 573                 number = float(matchobj.group(1))
 574                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 575                 return long(round(number * multiplier))
 576
 577         def add_info_extractor(self, ie):
 578                 """Add an InfoExtractor object to the end of the list."""
 579                 self._ies.append(ie)
 580                 ie.set_downloader(self)
 581
 582         def add_post_processor(self, pp):
 583                 """Add a PostProcessor object to the end of the chain."""
 584                 self._pps.append(pp)
 585                 pp.set_downloader(self)
 586
 587         def to_screen(self, message, skip_eol=False):
 588                 """Print message to stdout if not in quiet mode."""
 589                 assert type(message) == type(u'')
 590                 if not self.params.get('quiet', False):
 591                         terminator = [u'\n', u''][skip_eol]
 592                         output = message + terminator
 593
 594                         if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
 595                                 output = output.encode(preferredencoding(), 'ignore')
 596                         self._screen_file.write(output)
 597                         self._screen_file.flush()
 598
 599         def to_stderr(self, message):
 600                 """Print message to stderr."""
 601                 print >>sys.stderr, message.encode(preferredencoding())
 602
 603         def to_cons_title(self, message):
 604                 """Set console/terminal window title to message."""
 605                 if not self.params.get('consoletitle', False):
 606                         return
 607                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 608                         # c_wchar_p() might not be necessary if `message` is
 609                         # already of type unicode()
 610                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 611                 elif 'TERM' in os.environ:
 612                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 613
 614         def fixed_template(self):
 615                 """Checks if the output template is fixed."""
 616                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 617
 618         def trouble(self, message=None):
 619                 """Determine action to take when a download problem appears.
 620
 621                 Depending on if the downloader has been configured to ignore
 622                 download errors or not, this method may throw an exception or
 623                 not when errors are found, after printing the message.
 624                 """
 625                 if message is not None:
 626                         self.to_stderr(message)
 627                 if not self.params.get('ignoreerrors', False):
 628                         raise DownloadError(message)
 629                 self._download_retcode = 1
 630
 631         def slow_down(self, start_time, byte_counter):
 632                 """Sleep if the download speed is over the rate limit."""
 633                 rate_limit = self.params.get('ratelimit', None)
 634                 if rate_limit is None or byte_counter == 0:
 635                         return
 636                 now = time.time()
 637                 elapsed = now - start_time
 638                 if elapsed <= 0.0:
 639                         return
 640                 speed = float(byte_counter) / elapsed
 641                 if speed > rate_limit:
 642                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 643
 644         def temp_name(self, filename):
 645                 """Returns a temporary filename for the given filename."""
 646                 if self.params.get('nopart', False) or filename == u'-' or \
 647                                 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
 648                         return filename
 649                 return filename + u'.part'
 650
 651         def undo_temp_name(self, filename):
 652                 if filename.endswith(u'.part'):
 653                         return filename[:-len(u'.part')]
 654                 return filename
 655
 656         def try_rename(self, old_filename, new_filename):
 657                 try:
 658                         if old_filename == new_filename:
 659                                 return
 660                         os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
 661                 except (IOError, OSError), err:
 662                         self.trouble(u'ERROR: unable to rename file')
 663
 664         def try_utime(self, filename, last_modified_hdr):
 665                 """Try to set the last-modified time of the given file."""
 666                 if last_modified_hdr is None:
 667                         return
 668                 if not os.path.isfile(_encodeFilename(filename)):
 669                         return
 670                 timestr = last_modified_hdr
 671                 if timestr is None:
 672                         return
 673                 filetime = timeconvert(timestr)
 674                 if filetime is None:
 675                         return filetime
 676                 try:
 677                         os.utime(filename, (time.time(), filetime))
 678                 except:
 679                         pass
 680                 return filetime
 681
 682         def report_writedescription(self, descfn):
 683                 """ Report that the description file is being written """
 684                 self.to_screen(u'[info] Writing video description to: ' + descfn)
 685
 686         def report_writesubtitles(self, srtfn):
 687                 """ Report that the subtitles file is being written """
 688                 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
 689
 690         def report_writeinfojson(self, infofn):
 691                 """ Report that the metadata file has been written """
 692                 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
 693
 694         def report_destination(self, filename):
 695                 """Report destination filename."""
 696                 self.to_screen(u'[download] Destination: ' + filename)
 697
 698         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 699                 """Report download progress."""
 700                 if self.params.get('noprogress', False):
 701                         return
 702                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 703                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 704                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 705                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 706
 707         def report_resuming_byte(self, resume_len):
 708                 """Report attempt to resume at given byte."""
 709                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 710
 711         def report_retry(self, count, retries):
 712                 """Report retry in case of HTTP error 5xx"""
 713                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 714
 715         def report_file_already_downloaded(self, file_name):
 716                 """Report file has already been fully downloaded."""
 717                 try:
 718                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 719                 except (UnicodeEncodeError), err:
 720                         self.to_screen(u'[download] The file has already been downloaded')
 721
 722         def report_unable_to_resume(self):
 723                 """Report it was impossible to resume download."""
 724                 self.to_screen(u'[download] Unable to resume')
 725
 726         def report_finish(self):
 727                 """Report download finished."""
 728                 if self.params.get('noprogress', False):
 729                         self.to_screen(u'[download] Download completed')
 730                 else:
 731                         self.to_screen(u'')
 732
 733         def increment_downloads(self):
 734                 """Increment the ordinal that assigns a number to each file."""
 735                 self._num_downloads += 1
 736
 737         def prepare_filename(self, info_dict):
 738                 """Generate the output filename."""
 739                 try:
 740                         template_dict = dict(info_dict)
 741                         template_dict['epoch'] = unicode(long(time.time()))
 742                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 743                         filename = self.params['outtmpl'] % template_dict
 744                         return filename
 745                 except (ValueError, KeyError), err:
 746                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 747                         return None
 748
 749         def _match_entry(self, info_dict):
 750                 """ Returns None iff the file should be downloaded """
 751
 752                 title = info_dict['title']
 753                 matchtitle = self.params.get('matchtitle', False)
 754                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
 755                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
 756                 rejecttitle = self.params.get('rejecttitle', False)
 757                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
 758                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 759                 return None
 760
 761         def process_info(self, info_dict):
 762                 """Process a single dictionary returned by an InfoExtractor."""
 763
 764                 reason = self._match_entry(info_dict)
 765                 if reason is not None:
 766                         self.to_screen(u'[download] ' + reason)
 767                         return
 768
 769                 max_downloads = self.params.get('max_downloads')
 770                 if max_downloads is not None:
 771                         if self._num_downloads > int(max_downloads):
 772                                 raise MaxDownloadsReached()
 773
 774                 filename = self.prepare_filename(info_dict)
 775
 776                 # Forced printings
 777                 if self.params.get('forcetitle', False):
 778                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 779                 if self.params.get('forceurl', False):
 780                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 781                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 782                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 783                 if self.params.get('forcedescription', False) and 'description' in info_dict:
 784                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 785                 if self.params.get('forcefilename', False) and filename is not None:
 786                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 787                 if self.params.get('forceformat', False):
 788                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
 789
 790                 # Do nothing else if in simulate mode
 791                 if self.params.get('simulate', False):
 792                         return
 793
 794                 if filename is None:
 795                         return
 796
 797                 try:
 798                         dn = os.path.dirname(_encodeFilename(filename))
 799                         if dn != '' and not os.path.exists(dn): # dn is already encoded
 800                                 os.makedirs(dn)
 801                 except (OSError, IOError), err:
 802                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
 803                         return
 804
 805                 if self.params.get('writedescription', False):
 806                         try:
 807                                 descfn = filename + u'.description'
 808                                 self.report_writedescription(descfn)
 809                                 descfile = open(_encodeFilename(descfn), 'wb')
 810                                 try:
 811                                         descfile.write(info_dict['description'].encode('utf-8'))
 812                                 finally:
 813                                         descfile.close()
 814                         except (OSError, IOError):
 815                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
 816                                 return
 817
 818                 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
 819                         # subtitles download errors are already managed as troubles in relevant IE
 820                         # that way it will silently go on when used with unsupporting IE
 821                         try:
 822                                 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
 823                                 self.report_writesubtitles(srtfn)
 824                                 srtfile = open(_encodeFilename(srtfn), 'wb')
 825                                 try:
 826                                         srtfile.write(info_dict['subtitles'].encode('utf-8'))
 827                                 finally:
 828                                         srtfile.close()
 829                         except (OSError, IOError):
 830                                 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
 831                                 return
 832
 833                 if self.params.get('writeinfojson', False):
 834                         infofn = filename + u'.info.json'
 835                         self.report_writeinfojson(infofn)
 836                         try:
 837                                 json.dump
 838                         except (NameError,AttributeError):
 839                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
 840                                 return
 841                         try:
 842                                 infof = open(_encodeFilename(infofn), 'wb')
 843                                 try:
 844                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
 845                                         json.dump(json_info_dict, infof)
 846                                 finally:
 847                                         infof.close()
 848                         except (OSError, IOError):
 849                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
 850                                 return
 851
 852                 if not self.params.get('skip_download', False):
 853                         if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
 854                                 success = True
 855                         else:
 856                                 try:
 857                                         success = self._do_download(filename, info_dict)
 858                                 except (OSError, IOError), err:
 859                                         raise UnavailableVideoError
 860                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 861                                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 862                                         return
 863                                 except (ContentTooShortError, ), err:
 864                                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 865                                         return
 866
 867                         if success:
 868                                 try:
 869                                         self.post_process(filename, info_dict)
 870                                 except (PostProcessingError), err:
 871                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
 872                                         return
 873
 874         def download(self, url_list):
 875                 """Download a given list of URLs."""
 876                 if len(url_list) > 1 and self.fixed_template():
 877                         raise SameFileError(self.params['outtmpl'])
 878
 879                 for url in url_list:
 880                         suitable_found = False
 881                         for ie in self._ies:
 882                                 # Go to next InfoExtractor if not suitable
 883                                 if not ie.suitable(url):
 884                                         continue
 885
 886                                 # Suitable InfoExtractor found
 887                                 suitable_found = True
 888
 889                                 # Extract information from URL and process it
 890                                 ie.extract(url)
 891
 892                                 # Suitable InfoExtractor had been found; go to next URL
 893                                 break
 894
 895                         if not suitable_found:
 896                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 897
 898                 return self._download_retcode
 899
 900         def post_process(self, filename, ie_info):
 901                 """Run the postprocessing chain on the given file."""
 902                 info = dict(ie_info)
 903                 info['filepath'] = filename
 904                 for pp in self._pps:
 905                         info = pp.run(info)
 906                         if info is None:
 907                                 break
 908
 909         def _download_with_rtmpdump(self, filename, url, player_url):
 910                 self.report_destination(filename)
 911                 tmpfilename = self.temp_name(filename)
 912
 913                 # Check for rtmpdump first
 914                 try:
 915                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 916                 except (OSError, IOError):
 917                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 918                         return False
 919
 920                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 921                 # the connection was interrumpted and resuming appears to be
 922                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 923                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 924                 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
 925                 if self.params.get('verbose', False):
 926                         try:
 927                                 import pipes
 928                                 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
 929                         except ImportError:
 930                                 shell_quote = repr
 931                         self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
 932                 retval = subprocess.call(args)
 933                 while retval == 2 or retval == 1:
 934                         prevsize = os.path.getsize(_encodeFilename(tmpfilename))
 935                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 936                         time.sleep(5.0) # This seems to be needed
 937                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 938                         cursize = os.path.getsize(_encodeFilename(tmpfilename))
 939                         if prevsize == cursize and retval == 1:
 940                                 break
 941                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
 942                         if prevsize == cursize and retval == 2 and cursize > 1024:
 943                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
 944                                 retval = 0
 945                                 break
 946                 if retval == 0:
 947                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
 948                         self.try_rename(tmpfilename, filename)
 949                         return True
 950                 else:
 951                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 952                         return False
 953
 954         def _do_download(self, filename, info_dict):
 955                 url = info_dict['url']
 956                 player_url = info_dict.get('player_url', None)
 957
 958                 # Check file already present
 959                 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
 960                         self.report_file_already_downloaded(filename)
 961                         return True
 962
 963                 # Attempt to download using rtmpdump
 964                 if url.startswith('rtmp'):
 965                         return self._download_with_rtmpdump(filename, url, player_url)
 966
 967                 tmpfilename = self.temp_name(filename)
 968                 stream = None
 969
 970                 # Do not include the Accept-Encoding header
 971                 headers = {'Youtubedl-no-compression': 'True'}
 972                 basic_request = urllib2.Request(url, None, headers)
 973                 request = urllib2.Request(url, None, headers)
 974
 975                 # Establish possible resume length
 976                 if os.path.isfile(_encodeFilename(tmpfilename)):
 977                         resume_len = os.path.getsize(_encodeFilename(tmpfilename))
 978                 else:
 979                         resume_len = 0
 980
 981                 open_mode = 'wb'
 982                 if resume_len != 0:
 983                         if self.params.get('continuedl', False):
 984                                 self.report_resuming_byte(resume_len)
 985                                 request.add_header('Range','bytes=%d-' % resume_len)
 986                                 open_mode = 'ab'
 987                         else:
 988                                 resume_len = 0
 989
 990                 count = 0
 991                 retries = self.params.get('retries', 0)
 992                 while count <= retries:
 993                         # Establish connection
 994                         try:
 995                                 if count == 0 and 'urlhandle' in info_dict:
 996                                         data = info_dict['urlhandle']
 997                                 data = urllib2.urlopen(request)
 998                                 break
 999                         except (urllib2.HTTPError, ), err:
1000                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
1001                                         # Unexpected HTTP error
1002                                         raise
1003                                 elif err.code == 416:
1004                                         # Unable to resume (requested range not satisfiable)
1005                                         try:
1006                                                 # Open the connection again without the range header
1007                                                 data = urllib2.urlopen(basic_request)
1008                                                 content_length = data.info()['Content-Length']
1009                                         except (urllib2.HTTPError, ), err:
1010                                                 if err.code < 500 or err.code >= 600:
1011                                                         raise
1012                                         else:
1013                                                 # Examine the reported length
1014                                                 if (content_length is not None and
1015                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
1016                                                         # The file had already been fully downloaded.
1017                                                         # Explanation to the above condition: in issue #175 it was revealed that
1018                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
1019                                                         # changing the file size slightly and causing problems for some users. So
1020                                                         # I decided to implement a suggested change and consider the file
1021                                                         # completely downloaded if the file size differs less than 100 bytes from
1022                                                         # the one in the hard drive.
1023                                                         self.report_file_already_downloaded(filename)
1024                                                         self.try_rename(tmpfilename, filename)
1025                                                         return True
1026                                                 else:
1027                                                         # The length does not match, we start the download over
1028                                                         self.report_unable_to_resume()
1029                                                         open_mode = 'wb'
1030                                                         break
1031                         # Retry
1032                         count += 1
1033                         if count <= retries:
1034                                 self.report_retry(count, retries)
1035
1036                 if count > retries:
1037                         self.trouble(u'ERROR: giving up after %s retries' % retries)
1038                         return False
1039
1040                 data_len = data.info().get('Content-length', None)
1041                 if data_len is not None:
1042                         data_len = long(data_len) + resume_len
1043                 data_len_str = self.format_bytes(data_len)
1044                 byte_counter = 0 + resume_len
1045                 block_size = 1024
1046                 start = time.time()
1047                 while True:
1048                         # Download and write
1049                         before = time.time()
1050                         data_block = data.read(block_size)
1051                         after = time.time()
1052                         if len(data_block) == 0:
1053                                 break
1054                         byte_counter += len(data_block)
1055
1056                         # Open file just in time
1057                         if stream is None:
1058                                 try:
1059                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1060                                         assert stream is not None
1061                                         filename = self.undo_temp_name(tmpfilename)
1062                                         self.report_destination(filename)
1063                                 except (OSError, IOError), err:
1064                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1065                                         return False
1066                         try:
1067                                 stream.write(data_block)
1068                         except (IOError, OSError), err:
1069                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1070                                 return False
1071                         block_size = self.best_block_size(after - before, len(data_block))
1072
1073                         # Progress message
1074                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1075                         if data_len is None:
1076                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1077                         else:
1078                                 percent_str = self.calc_percent(byte_counter, data_len)
1079                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1080                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1081
1082                         # Apply rate limit
1083                         self.slow_down(start, byte_counter - resume_len)
1084
1085                 if stream is None:
1086                         self.trouble(u'\nERROR: Did not get any data blocks')
1087                         return False
1088                 stream.close()
1089                 self.report_finish()
1090                 if data_len is not None and byte_counter != data_len:
1091                         raise ContentTooShortError(byte_counter, long(data_len))
1092                 self.try_rename(tmpfilename, filename)
1093
1094                 # Update file modification time
1095                 if self.params.get('updatetime', True):
1096                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1097
1098                 return True
1099
1100
1101 class InfoExtractor(object):
1102         """Information Extractor class.
1103
1104         Information extractors are the classes that, given a URL, extract
1105         information from the video (or videos) the URL refers to. This
1106         information includes the real video URL, the video title and simplified
1107         title, author and others. The information is stored in a dictionary
1108         which is then passed to the FileDownloader. The FileDownloader
1109         processes this information possibly downloading the video to the file
1110         system, among other possible outcomes. The dictionaries must include
1111         the following fields:
1112
1113         id:             Video identifier.
1114         url:            Final video URL.
1115         uploader:       Nickname of the video uploader.
1116         title:          Literal title.
1117         stitle:         Simplified title.
1118         ext:            Video filename extension.
1119         format:         Video format.
1120         player_url:     SWF Player URL (may be None).
1121
1122         The following fields are optional. Their primary purpose is to allow
1123         youtube-dl to serve as the backend for a video search function, such
1124         as the one in youtube2mp3.  They are only used when their respective
1125         forced printing functions are called:
1126
1127         thumbnail:      Full URL to a video thumbnail image.
1128         description:    One-line video description.
1129
1130         Subclasses of this one should re-define the _real_initialize() and
1131         _real_extract() methods and define a _VALID_URL regexp.
1132         Probably, they should also be added to the list of extractors.
1133         """
1134
1135         _ready = False
1136         _downloader = None
1137
1138         def __init__(self, downloader=None):
1139                 """Constructor. Receives an optional downloader."""
1140                 self._ready = False
1141                 self.set_downloader(downloader)
1142
1143         def suitable(self, url):
1144                 """Receives a URL and returns True if suitable for this IE."""
1145                 return re.match(self._VALID_URL, url) is not None
1146
1147         def initialize(self):
1148                 """Initializes an instance (authentication, etc)."""
1149                 if not self._ready:
1150                         self._real_initialize()
1151                         self._ready = True
1152
1153         def extract(self, url):
1154                 """Extracts URL information and returns it in list of dicts."""
1155                 self.initialize()
1156                 return self._real_extract(url)
1157
1158         def set_downloader(self, downloader):
1159                 """Sets the downloader for this IE."""
1160                 self._downloader = downloader
1161
1162         def _real_initialize(self):
1163                 """Real initialization process. Redefine in subclasses."""
1164                 pass
1165
1166         def _real_extract(self, url):
1167                 """Real extraction process. Redefine in subclasses."""
1168                 pass
1169
1170
1171 class YoutubeIE(InfoExtractor):
1172         """Information extractor for youtube.com."""
1173
1174         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1175         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1176         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1177         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1178         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
1179         _NETRC_MACHINE = 'youtube'
1180         # Listed in order of quality
1181         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1182         _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1183         _video_extensions = {
1184                 '13': '3gp',
1185                 '17': 'mp4',
1186                 '18': 'mp4',
1187                 '22': 'mp4',
1188                 '37': 'mp4',
1189                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1190                 '43': 'webm',
1191                 '44': 'webm',
1192                 '45': 'webm',
1193         }
1194         _video_dimensions = {
1195                 '5': '240x400',
1196                 '6': '???',
1197                 '13': '???',
1198                 '17': '144x176',
1199                 '18': '360x640',
1200                 '22': '720x1280',
1201                 '34': '360x640',
1202                 '35': '480x854',
1203                 '37': '1080x1920',
1204                 '38': '3072x4096',
1205                 '43': '360x640',
1206                 '44': '480x854',
1207                 '45': '720x1280',
1208         }
1209         IE_NAME = u'youtube'
1210
1211         def report_lang(self):
1212                 """Report attempt to set language."""
1213                 self._downloader.to_screen(u'[youtube] Setting language')
1214
1215         def report_login(self):
1216                 """Report attempt to log in."""
1217                 self._downloader.to_screen(u'[youtube] Logging in')
1218
1219         def report_age_confirmation(self):
1220                 """Report attempt to confirm age."""
1221                 self._downloader.to_screen(u'[youtube] Confirming age')
1222
1223         def report_video_webpage_download(self, video_id):
1224                 """Report attempt to download video webpage."""
1225                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1226
1227         def report_video_info_webpage_download(self, video_id):
1228                 """Report attempt to download video info webpage."""
1229                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1230
1231         def report_video_subtitles_download(self, video_id):
1232                 """Report attempt to download video info webpage."""
1233                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1234
1235         def report_information_extraction(self, video_id):
1236                 """Report attempt to extract video information."""
1237                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1238
1239         def report_unavailable_format(self, video_id, format):
1240                 """Report extracted video URL."""
1241                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1242
1243         def report_rtmp_download(self):
1244                 """Indicate the download will use the RTMP protocol."""
1245                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1246
1247         def _closed_captions_xml_to_srt(self, xml_string):
1248                 srt = ''
1249                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1250                 # TODO parse xml instead of regex
1251                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1252                         if not dur: dur = '4'
1253                         start = float(start)
1254                         end = start + float(dur)
1255                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1256                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1257                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1258                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1259                         srt += str(n) + '\n'
1260                         srt += start + ' --> ' + end + '\n'
1261                         srt += caption + '\n\n'
1262                 return srt
1263
1264         def _print_formats(self, formats):
1265                 print 'Available formats:'
1266                 for x in formats:
1267                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1268
1269         def _real_initialize(self):
1270                 if self._downloader is None:
1271                         return
1272
1273                 username = None
1274                 password = None
1275                 downloader_params = self._downloader.params
1276
1277                 # Attempt to use provided username and password or .netrc data
1278                 if downloader_params.get('username', None) is not None:
1279                         username = downloader_params['username']
1280                         password = downloader_params['password']
1281                 elif downloader_params.get('usenetrc', False):
1282                         try:
1283                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1284                                 if info is not None:
1285                                         username = info[0]
1286                                         password = info[2]
1287                                 else:
1288                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1289                         except (IOError, netrc.NetrcParseError), err:
1290                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1291                                 return
1292
1293                 # Set language
1294                 request = urllib2.Request(self._LANG_URL)
1295                 try:
1296                         self.report_lang()
1297                         urllib2.urlopen(request).read()
1298                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1299                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1300                         return
1301
1302                 # No authentication to be performed
1303                 if username is None:
1304                         return
1305
1306                 # Log in
1307                 login_form = {
1308                                 'current_form': 'loginForm',
1309                                 'next':         '/',
1310                                 'action_login': 'Log In',
1311                                 'username':     username,
1312                                 'password':     password,
1313                                 }
1314                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1315                 try:
1316                         self.report_login()
1317                         login_results = urllib2.urlopen(request).read()
1318                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1319                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1320                                 return
1321                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1322                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1323                         return
1324
1325                 # Confirm age
1326                 age_form = {
1327                                 'next_url':             '/',
1328                                 'action_confirm':       'Confirm',
1329                                 }
1330                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1331                 try:
1332                         self.report_age_confirmation()
1333                         age_results = urllib2.urlopen(request).read()
1334                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1335                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1336                         return
1337
1338         def _real_extract(self, url):
1339                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1340                 mobj = re.search(self._NEXT_URL_RE, url)
1341                 if mobj:
1342                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
1343
1344                 # Extract video id from URL
1345                 mobj = re.match(self._VALID_URL, url)
1346                 if mobj is None:
1347                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1348                         return
1349                 video_id = mobj.group(2)
1350
1351                 # Get video webpage
1352                 self.report_video_webpage_download(video_id)
1353                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1354                 try:
1355                         video_webpage = urllib2.urlopen(request).read()
1356                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1357                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1358                         return
1359
1360                 # Attempt to extract SWF player URL
1361                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1362                 if mobj is not None:
1363                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1364                 else:
1365                         player_url = None
1366
1367                 # Get video info
1368                 self.report_video_info_webpage_download(video_id)
1369                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1370                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1371                                         % (video_id, el_type))
1372                         request = urllib2.Request(video_info_url)
1373                         try:
1374                                 video_info_webpage = urllib2.urlopen(request).read()
1375                                 video_info = parse_qs(video_info_webpage)
1376                                 if 'token' in video_info:
1377                                         break
1378                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1379                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1380                                 return
1381                 if 'token' not in video_info:
1382                         if 'reason' in video_info:
1383                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1384                         else:
1385                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1386                         return
1387
1388                 # Start extracting information
1389                 self.report_information_extraction(video_id)
1390
1391                 # uploader
1392                 if 'author' not in video_info:
1393                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1394                         return
1395                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1396
1397                 # title
1398                 if 'title' not in video_info:
1399                         self._downloader.trouble(u'ERROR: unable to extract video title')
1400                         return
1401                 video_title = urllib.unquote_plus(video_info['title'][0])
1402                 video_title = video_title.decode('utf-8')
1403                 video_title = sanitize_title(video_title)
1404
1405                 # simplified title
1406                 simple_title = _simplify_title(video_title)
1407
1408                 # thumbnail image
1409                 if 'thumbnail_url' not in video_info:
1410                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1411                         video_thumbnail = ''
1412                 else:   # don't panic if we can't find it
1413                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1414
1415                 # upload date
1416                 upload_date = u'NA'
1417                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1418                 if mobj is not None:
1419                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1420                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1421                         for expression in format_expressions:
1422                                 try:
1423                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1424                                 except:
1425                                         pass
1426
1427                 # description
1428                 try:
1429                         lxml.etree
1430                 except NameError:
1431                         video_description = u'No description available.'
1432                         mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1433                         if mobj is not None:
1434                                 video_description = mobj.group(1).decode('utf-8')
1435                 else:
1436                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1437                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1438                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1439                         # TODO use another parser
1440
1441                 # closed captions
1442                 video_subtitles = None
1443                 if self._downloader.params.get('writesubtitles', False):
1444                         self.report_video_subtitles_download(video_id)
1445                         request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1446                         try:
1447                                 srt_list = urllib2.urlopen(request).read()
1448                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1449                                 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1450                         else:
1451                                 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1452                                 if srt_lang_list:
1453                                         if self._downloader.params.get('subtitleslang', False):
1454                                                 srt_lang = self._downloader.params.get('subtitleslang')
1455                                         elif 'en' in srt_lang_list:
1456                                                 srt_lang = 'en'
1457                                         else:
1458                                                 srt_lang = srt_lang_list[0]
1459                                         if not srt_lang in srt_lang_list:
1460                                                 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1461                                         else:
1462                                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1463                                                 try:
1464                                                         srt_xml = urllib2.urlopen(request).read()
1465                                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1466                                                         self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1467                                                 else:
1468                                                         video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1469                                 else:
1470                                         self._downloader.trouble(u'WARNING: video has no closed captions')
1471
1472                 # token
1473                 video_token = urllib.unquote_plus(video_info['token'][0])
1474
1475                 # Decide which formats to download
1476                 req_format = self._downloader.params.get('format', None)
1477
1478                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1479                         self.report_rtmp_download()
1480                         video_url_list = [(None, video_info['conn'][0])]
1481                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1482                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1483                         url_data = [parse_qs(uds) for uds in url_data_strs]
1484                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1485                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1486
1487                         format_limit = self._downloader.params.get('format_limit', None)
1488                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1489                         if format_limit is not None and format_limit in available_formats:
1490                                 format_list = available_formats[available_formats.index(format_limit):]
1491                         else:
1492                                 format_list = available_formats
1493                         existing_formats = [x for x in format_list if x in url_map]
1494                         if len(existing_formats) == 0:
1495                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1496                                 return
1497                         if self._downloader.params.get('listformats', None):
1498                                 self._print_formats(existing_formats)
1499                                 return
1500                         if req_format is None or req_format == 'best':
1501                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1502                         elif req_format == 'worst':
1503                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1504                         elif req_format in ('-1', 'all'):
1505                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1506                         else:
1507                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1508                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1509                                 req_formats = req_format.split('/')
1510                                 video_url_list = None
1511                                 for rf in req_formats:
1512                                         if rf in url_map:
1513                                                 video_url_list = [(rf, url_map[rf])]
1514                                                 break
1515                                 if video_url_list is None:
1516                                         self._downloader.trouble(u'ERROR: requested format not available')
1517                                         return
1518                 else:
1519                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1520                         return
1521
1522                 for format_param, video_real_url in video_url_list:
1523                         # At this point we have a new video
1524                         self._downloader.increment_downloads()
1525
1526                         # Extension
1527                         video_extension = self._video_extensions.get(format_param, 'flv')
1528
1529                         try:
1530                                 # Process video information
1531                                 self._downloader.process_info({
1532                                         'id':           video_id.decode('utf-8'),
1533                                         'url':          video_real_url.decode('utf-8'),
1534                                         'uploader':     video_uploader.decode('utf-8'),
1535                                         'upload_date':  upload_date,
1536                                         'title':        video_title,
1537                                         'stitle':       simple_title,
1538                                         'ext':          video_extension.decode('utf-8'),
1539                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1540                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1541                                         'description':  video_description,
1542                                         'player_url':   player_url,
1543                                         'subtitles':    video_subtitles
1544                                 })
1545                         except UnavailableVideoError, err:
1546                                 self._downloader.trouble(u'\nERROR: unable to download video')
1547
1548
1549 class MetacafeIE(InfoExtractor):
1550         """Information Extractor for metacafe.com."""
1551
1552         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1553         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1554         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1555         _youtube_ie = None
1556         IE_NAME = u'metacafe'
1557
1558         def __init__(self, youtube_ie, downloader=None):
1559                 InfoExtractor.__init__(self, downloader)
1560                 self._youtube_ie = youtube_ie
1561
1562         def report_disclaimer(self):
1563                 """Report disclaimer retrieval."""
1564                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1565
1566         def report_age_confirmation(self):
1567                 """Report attempt to confirm age."""
1568                 self._downloader.to_screen(u'[metacafe] Confirming age')
1569
1570         def report_download_webpage(self, video_id):
1571                 """Report webpage download."""
1572                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1573
1574         def report_extraction(self, video_id):
1575                 """Report information extraction."""
1576                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1577
1578         def _real_initialize(self):
1579                 # Retrieve disclaimer
1580                 request = urllib2.Request(self._DISCLAIMER)
1581                 try:
1582                         self.report_disclaimer()
1583                         disclaimer = urllib2.urlopen(request).read()
1584                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1585                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1586                         return
1587
1588                 # Confirm age
1589                 disclaimer_form = {
1590                         'filters': '0',
1591                         'submit': "Continue - I'm over 18",
1592                         }
1593                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1594                 try:
1595                         self.report_age_confirmation()
1596                         disclaimer = urllib2.urlopen(request).read()
1597                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1598                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1599                         return
1600
1601         def _real_extract(self, url):
1602                 # Extract id and simplified title from URL
1603                 mobj = re.match(self._VALID_URL, url)
1604                 if mobj is None:
1605                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1606                         return
1607
1608                 video_id = mobj.group(1)
1609
1610                 # Check if video comes from YouTube
1611                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1612                 if mobj2 is not None:
1613                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1614                         return
1615
1616                 # At this point we have a new video
1617                 self._downloader.increment_downloads()
1618
1619                 simple_title = mobj.group(2).decode('utf-8')
1620
1621                 # Retrieve video webpage to extract further information
1622                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1623                 try:
1624                         self.report_download_webpage(video_id)
1625                         webpage = urllib2.urlopen(request).read()
1626                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1627                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1628                         return
1629
1630                 # Extract URL, uploader and title from webpage
1631                 self.report_extraction(video_id)
1632                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1633                 if mobj is not None:
1634                         mediaURL = urllib.unquote(mobj.group(1))
1635                         video_extension = mediaURL[-3:]
1636
1637                         # Extract gdaKey if available
1638                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1639                         if mobj is None:
1640                                 video_url = mediaURL
1641                         else:
1642                                 gdaKey = mobj.group(1)
1643                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1644                 else:
1645                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1646                         if mobj is None:
1647                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1648                                 return
1649                         vardict = parse_qs(mobj.group(1))
1650                         if 'mediaData' not in vardict:
1651                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1652                                 return
1653                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1654                         if mobj is None:
1655                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1656                                 return
1657                         mediaURL = mobj.group(1).replace('\\/', '/')
1658                         video_extension = mediaURL[-3:]
1659                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1660
1661                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1662                 if mobj is None:
1663                         self._downloader.trouble(u'ERROR: unable to extract title')
1664                         return
1665                 video_title = mobj.group(1).decode('utf-8')
1666                 video_title = sanitize_title(video_title)
1667
1668                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1669                 if mobj is None:
1670                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1671                         return
1672                 video_uploader = mobj.group(1)
1673
1674                 try:
1675                         # Process video information
1676                         self._downloader.process_info({
1677                                 'id':           video_id.decode('utf-8'),
1678                                 'url':          video_url.decode('utf-8'),
1679                                 'uploader':     video_uploader.decode('utf-8'),
1680                                 'upload_date':  u'NA',
1681                                 'title':        video_title,
1682                                 'stitle':       simple_title,
1683                                 'ext':          video_extension.decode('utf-8'),
1684                                 'format':       u'NA',
1685                                 'player_url':   None,
1686                         })
1687                 except UnavailableVideoError:
1688                         self._downloader.trouble(u'\nERROR: unable to download video')
1689
1690
1691 class DailymotionIE(InfoExtractor):
1692         """Information Extractor for Dailymotion"""
1693
1694         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1695         IE_NAME = u'dailymotion'
1696
1697         def __init__(self, downloader=None):
1698                 InfoExtractor.__init__(self, downloader)
1699
1700         def report_download_webpage(self, video_id):
1701                 """Report webpage download."""
1702                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1703
1704         def report_extraction(self, video_id):
1705                 """Report information extraction."""
1706                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1707
1708         def _real_extract(self, url):
1709                 # Extract id and simplified title from URL
1710                 mobj = re.match(self._VALID_URL, url)
1711                 if mobj is None:
1712                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1713                         return
1714
1715                 # At this point we have a new video
1716                 self._downloader.increment_downloads()
1717                 video_id = mobj.group(1)
1718
1719                 video_extension = 'flv'
1720
1721                 # Retrieve video webpage to extract further information
1722                 request = urllib2.Request(url)
1723                 request.add_header('Cookie', 'family_filter=off')
1724                 try:
1725                         self.report_download_webpage(video_id)
1726                         webpage = urllib2.urlopen(request).read()
1727                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1728                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1729                         return
1730
1731                 # Extract URL, uploader and title from webpage
1732                 self.report_extraction(video_id)
1733                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1734                 if mobj is None:
1735                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1736                         return
1737                 sequence = urllib.unquote(mobj.group(1))
1738                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1739                 if mobj is None:
1740                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1741                         return
1742                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1743
1744                 # if needed add http://www.dailymotion.com/ if relative URL
1745
1746                 video_url = mediaURL
1747
1748                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1749                 if mobj is None:
1750                         self._downloader.trouble(u'ERROR: unable to extract title')
1751                         return
1752                 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1753                 video_title = sanitize_title(video_title)
1754                 simple_title = _simplify_title(video_title)
1755
1756                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1757                 if mobj is None:
1758                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1759                         return
1760                 video_uploader = mobj.group(1)
1761
1762                 try:
1763                         # Process video information
1764                         self._downloader.process_info({
1765                                 'id':           video_id.decode('utf-8'),
1766                                 'url':          video_url.decode('utf-8'),
1767                                 'uploader':     video_uploader.decode('utf-8'),
1768                                 'upload_date':  u'NA',
1769                                 'title':        video_title,
1770                                 'stitle':       simple_title,
1771                                 'ext':          video_extension.decode('utf-8'),
1772                                 'format':       u'NA',
1773                                 'player_url':   None,
1774                         })
1775                 except UnavailableVideoError:
1776                         self._downloader.trouble(u'\nERROR: unable to download video')
1777
1778
1779 class GoogleIE(InfoExtractor):
1780         """Information extractor for video.google.com."""
1781
1782         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1783         IE_NAME = u'video.google'
1784
1785         def __init__(self, downloader=None):
1786                 InfoExtractor.__init__(self, downloader)
1787
1788         def report_download_webpage(self, video_id):
1789                 """Report webpage download."""
1790                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1791
1792         def report_extraction(self, video_id):
1793                 """Report information extraction."""
1794                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1795
1796         def _real_extract(self, url):
1797                 # Extract id from URL
1798                 mobj = re.match(self._VALID_URL, url)
1799                 if mobj is None:
1800                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1801                         return
1802
1803                 # At this point we have a new video
1804                 self._downloader.increment_downloads()
1805                 video_id = mobj.group(1)
1806
1807                 video_extension = 'mp4'
1808
1809                 # Retrieve video webpage to extract further information
1810                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1811                 try:
1812                         self.report_download_webpage(video_id)
1813                         webpage = urllib2.urlopen(request).read()
1814                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1815                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1816                         return
1817
1818                 # Extract URL, uploader, and title from webpage
1819                 self.report_extraction(video_id)
1820                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1821                 if mobj is None:
1822                         video_extension = 'flv'
1823                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1824                 if mobj is None:
1825                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1826                         return
1827                 mediaURL = urllib.unquote(mobj.group(1))
1828                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1829                 mediaURL = mediaURL.replace('\\x26', '\x26')
1830
1831                 video_url = mediaURL
1832
1833                 mobj = re.search(r'<title>(.*)</title>', webpage)
1834                 if mobj is None:
1835                         self._downloader.trouble(u'ERROR: unable to extract title')
1836                         return
1837                 video_title = mobj.group(1).decode('utf-8')
1838                 video_title = sanitize_title(video_title)
1839                 simple_title = _simplify_title(video_title)
1840
1841                 # Extract video description
1842                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1843                 if mobj is None:
1844                         self._downloader.trouble(u'ERROR: unable to extract video description')
1845                         return
1846                 video_description = mobj.group(1).decode('utf-8')
1847                 if not video_description:
1848                         video_description = 'No description available.'
1849
1850                 # Extract video thumbnail
1851                 if self._downloader.params.get('forcethumbnail', False):
1852                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1853                         try:
1854                                 webpage = urllib2.urlopen(request).read()
1855                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1856                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1857                                 return
1858                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1859                         if mobj is None:
1860                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1861                                 return
1862                         video_thumbnail = mobj.group(1)
1863                 else:   # we need something to pass to process_info
1864                         video_thumbnail = ''
1865
1866                 try:
1867                         # Process video information
1868                         self._downloader.process_info({
1869                                 'id':           video_id.decode('utf-8'),
1870                                 'url':          video_url.decode('utf-8'),
1871                                 'uploader':     u'NA',
1872                                 'upload_date':  u'NA',
1873                                 'title':        video_title,
1874                                 'stitle':       simple_title,
1875                                 'ext':          video_extension.decode('utf-8'),
1876                                 'format':       u'NA',
1877                                 'player_url':   None,
1878                         })
1879                 except UnavailableVideoError:
1880                         self._downloader.trouble(u'\nERROR: unable to download video')
1881
1882
1883 class PhotobucketIE(InfoExtractor):
1884         """Information extractor for photobucket.com."""
1885
1886         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1887         IE_NAME = u'photobucket'
1888
1889         def __init__(self, downloader=None):
1890                 InfoExtractor.__init__(self, downloader)
1891
1892         def report_download_webpage(self, video_id):
1893                 """Report webpage download."""
1894                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1895
1896         def report_extraction(self, video_id):
1897                 """Report information extraction."""
1898                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1899
1900         def _real_extract(self, url):
1901                 # Extract id from URL
1902                 mobj = re.match(self._VALID_URL, url)
1903                 if mobj is None:
1904                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1905                         return
1906
1907                 # At this point we have a new video
1908                 self._downloader.increment_downloads()
1909                 video_id = mobj.group(1)
1910
1911                 video_extension = 'flv'
1912
1913                 # Retrieve video webpage to extract further information
1914                 request = urllib2.Request(url)
1915                 try:
1916                         self.report_download_webpage(video_id)
1917                         webpage = urllib2.urlopen(request).read()
1918                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1919                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1920                         return
1921
1922                 # Extract URL, uploader, and title from webpage
1923                 self.report_extraction(video_id)
1924                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1925                 if mobj is None:
1926                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1927                         return
1928                 mediaURL = urllib.unquote(mobj.group(1))
1929
1930                 video_url = mediaURL
1931
1932                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1933                 if mobj is None:
1934                         self._downloader.trouble(u'ERROR: unable to extract title')
1935                         return
1936                 video_title = mobj.group(1).decode('utf-8')
1937                 video_title = sanitize_title(video_title)
1938                 simple_title = _simplify_title(vide_title)
1939
1940                 video_uploader = mobj.group(2).decode('utf-8')
1941
1942                 try:
1943                         # Process video information
1944                         self._downloader.process_info({
1945                                 'id':           video_id.decode('utf-8'),
1946                                 'url':          video_url.decode('utf-8'),
1947                                 'uploader':     video_uploader,
1948                                 'upload_date':  u'NA',
1949                                 'title':        video_title,
1950                                 'stitle':       simple_title,
1951                                 'ext':          video_extension.decode('utf-8'),
1952                                 'format':       u'NA',
1953                                 'player_url':   None,
1954                         })
1955                 except UnavailableVideoError:
1956                         self._downloader.trouble(u'\nERROR: unable to download video')
1957
1958
1959 class YahooIE(InfoExtractor):
1960         """Information extractor for video.yahoo.com."""
1961
1962         # _VALID_URL matches all Yahoo! Video URLs
1963         # _VPAGE_URL matches only the extractable '/watch/' URLs
1964         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1965         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1966         IE_NAME = u'video.yahoo'
1967
1968         def __init__(self, downloader=None):
1969                 InfoExtractor.__init__(self, downloader)
1970
1971         def report_download_webpage(self, video_id):
1972                 """Report webpage download."""
1973                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1974
1975         def report_extraction(self, video_id):
1976                 """Report information extraction."""
1977                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1978
1979         def _real_extract(self, url, new_video=True):
1980                 # Extract ID from URL
1981                 mobj = re.match(self._VALID_URL, url)
1982                 if mobj is None:
1983                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1984                         return
1985
1986                 # At this point we have a new video
1987                 self._downloader.increment_downloads()
1988                 video_id = mobj.group(2)
1989                 video_extension = 'flv'
1990
1991                 # Rewrite valid but non-extractable URLs as
1992                 # extractable English language /watch/ URLs
1993                 if re.match(self._VPAGE_URL, url) is None:
1994                         request = urllib2.Request(url)
1995                         try:
1996                                 webpage = urllib2.urlopen(request).read()
1997                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1998                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1999                                 return
2000
2001                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2002                         if mobj is None:
2003                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
2004                                 return
2005                         yahoo_id = mobj.group(1)
2006
2007                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2008                         if mobj is None:
2009                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2010                                 return
2011                         yahoo_vid = mobj.group(1)
2012
2013                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2014                         return self._real_extract(url, new_video=False)
2015
2016                 # Retrieve video webpage to extract further information
2017                 request = urllib2.Request(url)
2018                 try:
2019                         self.report_download_webpage(video_id)
2020                         webpage = urllib2.urlopen(request).read()
2021                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2022                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2023                         return
2024
2025                 # Extract uploader and title from webpage
2026                 self.report_extraction(video_id)
2027                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2028                 if mobj is None:
2029                         self._downloader.trouble(u'ERROR: unable to extract video title')
2030                         return
2031                 video_title = mobj.group(1).decode('utf-8')
2032                 simple_title = _simplify_title(video_title)
2033
2034                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2035                 if mobj is None:
2036                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2037                         return
2038                 video_uploader = mobj.group(1).decode('utf-8')
2039
2040                 # Extract video thumbnail
2041                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2042                 if mobj is None:
2043                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2044                         return
2045                 video_thumbnail = mobj.group(1).decode('utf-8')
2046
2047                 # Extract video description
2048                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2049                 if mobj is None:
2050                         self._downloader.trouble(u'ERROR: unable to extract video description')
2051                         return
2052                 video_description = mobj.group(1).decode('utf-8')
2053                 if not video_description:
2054                         video_description = 'No description available.'
2055
2056                 # Extract video height and width
2057                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2058                 if mobj is None:
2059                         self._downloader.trouble(u'ERROR: unable to extract video height')
2060                         return
2061                 yv_video_height = mobj.group(1)
2062
2063                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2064                 if mobj is None:
2065                         self._downloader.trouble(u'ERROR: unable to extract video width')
2066                         return
2067                 yv_video_width = mobj.group(1)
2068
2069                 # Retrieve video playlist to extract media URL
2070                 # I'm not completely sure what all these options are, but we
2071                 # seem to need most of them, otherwise the server sends a 401.
2072                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
2073                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
2074                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2075                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2076                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2077                 try:
2078                         self.report_download_webpage(video_id)
2079                         webpage = urllib2.urlopen(request).read()
2080                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2081                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2082                         return
2083
2084                 # Extract media URL from playlist XML
2085                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2086                 if mobj is None:
2087                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
2088                         return
2089                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2090                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2091
2092                 try:
2093                         # Process video information
2094                         self._downloader.process_info({
2095                                 'id':           video_id.decode('utf-8'),
2096                                 'url':          video_url,
2097                                 'uploader':     video_uploader,
2098                                 'upload_date':  u'NA',
2099                                 'title':        video_title,
2100                                 'stitle':       simple_title,
2101                                 'ext':          video_extension.decode('utf-8'),
2102                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2103                                 'description':  video_description,
2104                                 'thumbnail':    video_thumbnail,
2105                                 'player_url':   None,
2106                         })
2107                 except UnavailableVideoError:
2108                         self._downloader.trouble(u'\nERROR: unable to download video')
2109
2110
2111 class VimeoIE(InfoExtractor):
2112         """Information extractor for vimeo.com."""
2113
2114         # _VALID_URL matches Vimeo URLs
2115         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2116         IE_NAME = u'vimeo'
2117
2118         def __init__(self, downloader=None):
2119                 InfoExtractor.__init__(self, downloader)
2120
2121         def report_download_webpage(self, video_id):
2122                 """Report webpage download."""
2123                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2124
2125         def report_extraction(self, video_id):
2126                 """Report information extraction."""
2127                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2128
2129         def _real_extract(self, url, new_video=True):
2130                 # Extract ID from URL
2131                 mobj = re.match(self._VALID_URL, url)
2132                 if mobj is None:
2133                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2134                         return
2135
2136                 # At this point we have a new video
2137                 self._downloader.increment_downloads()
2138                 video_id = mobj.group(1)
2139
2140                 # Retrieve video webpage to extract further information
2141                 request = urllib2.Request(url, None, std_headers)
2142                 try:
2143                         self.report_download_webpage(video_id)
2144                         webpage = urllib2.urlopen(request).read()
2145                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2146                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2147                         return
2148
2149                 # Now we begin extracting as much information as we can from what we
2150                 # retrieved. First we extract the information common to all extractors,
2151                 # and latter we extract those that are Vimeo specific.
2152                 self.report_extraction(video_id)
2153
2154                 # Extract the config JSON
2155                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2156                 try:
2157                         config = json.loads(config)
2158                 except:
2159                         self._downloader.trouble(u'ERROR: unable to extract info section')
2160                         return
2161
2162                 # Extract title
2163                 video_title = config["video"]["title"]
2164                 simple_title = _simplify_title(video_title)
2165
2166                 # Extract uploader
2167                 video_uploader = config["video"]["owner"]["name"]
2168
2169                 # Extract video thumbnail
2170                 video_thumbnail = config["video"]["thumbnail"]
2171
2172                 # Extract video description
2173                 try:
2174                         lxml.etree
2175                 except NameError:
2176                         video_description = u'No description available.'
2177                         mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2178                         if mobj is not None:
2179                                 video_description = mobj.group(1)
2180                 else:
2181                         html_parser = lxml.etree.HTMLParser()
2182                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2183                         video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2184                         # TODO use another parser
2185
2186                 # Extract upload date
2187                 video_upload_date = u'NA'
2188                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2189                 if mobj is not None:
2190                         video_upload_date = mobj.group(1)
2191
2192                 # Vimeo specific: extract request signature and timestamp
2193                 sig = config['request']['signature']
2194                 timestamp = config['request']['timestamp']
2195
2196                 # Vimeo specific: extract video codec and quality information
2197                 # TODO bind to format param
2198                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2199                 for codec in codecs:
2200                         if codec[0] in config["video"]["files"]:
2201                                 video_codec = codec[0]
2202                                 video_extension = codec[1]
2203                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2204                                 else: quality = 'sd'
2205                                 break
2206                 else:
2207                         self._downloader.trouble(u'ERROR: no known codec found')
2208                         return
2209
2210                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2211                                         %(video_id, sig, timestamp, quality, video_codec.upper())
2212
2213                 try:
2214                         # Process video information
2215                         self._downloader.process_info({
2216                                 'id':           video_id,
2217                                 'url':          video_url,
2218                                 'uploader':     video_uploader,
2219                                 'upload_date':  video_upload_date,
2220                                 'title':        video_title,
2221                                 'stitle':       simple_title,
2222                                 'ext':          video_extension,
2223                                 'thumbnail':    video_thumbnail,
2224                                 'description':  video_description,
2225                                 'player_url':   None,
2226                         })
2227                 except UnavailableVideoError:
2228                         self._downloader.trouble(u'ERROR: unable to download video')
2229
2230
2231 class GenericIE(InfoExtractor):
2232         """Generic last-resort information extractor."""
2233
2234         _VALID_URL = r'.*'
2235         IE_NAME = u'generic'
2236
2237         def __init__(self, downloader=None):
2238                 InfoExtractor.__init__(self, downloader)
2239
2240         def report_download_webpage(self, video_id):
2241                 """Report webpage download."""
2242                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2243                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2244
2245         def report_extraction(self, video_id):
2246                 """Report information extraction."""
2247                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2248
2249         def _real_extract(self, url):
2250                 # At this point we have a new video
2251                 self._downloader.increment_downloads()
2252
2253                 video_id = url.split('/')[-1]
2254                 request = urllib2.Request(url)
2255                 try:
2256                         self.report_download_webpage(video_id)
2257                         webpage = urllib2.urlopen(request).read()
2258                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2259                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2260                         return
2261                 except ValueError, err:
2262                         # since this is the last-resort InfoExtractor, if
2263                         # this error is thrown, it'll be thrown here
2264                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2265                         return
2266
2267                 self.report_extraction(video_id)
2268                 # Start with something easy: JW Player in SWFObject
2269                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2270                 if mobj is None:
2271                         # Broaden the search a little bit
2272                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2273                 if mobj is None:
2274                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2275                         return
2276
2277                 # It's possible that one of the regexes
2278                 # matched, but returned an empty group:
2279                 if mobj.group(1) is None:
2280                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2281                         return
2282
2283                 video_url = urllib.unquote(mobj.group(1))
2284                 video_id = os.path.basename(video_url)
2285
2286                 # here's a fun little line of code for you:
2287                 video_extension = os.path.splitext(video_id)[1][1:]
2288                 video_id = os.path.splitext(video_id)[0]
2289
2290                 # it's tempting to parse this further, but you would
2291                 # have to take into account all the variations like
2292                 #   Video Title - Site Name
2293                 #   Site Name | Video Title
2294                 #   Video Title - Tagline | Site Name
2295                 # and so on and so forth; it's just not practical
2296                 mobj = re.search(r'<title>(.*)</title>', webpage)
2297                 if mobj is None:
2298                         self._downloader.trouble(u'ERROR: unable to extract title')
2299                         return
2300                 video_title = mobj.group(1).decode('utf-8')
2301                 video_title = sanitize_title(video_title)
2302                 simple_title = _simplify_title(video_title)
2303
2304                 # video uploader is domain name
2305                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2306                 if mobj is None:
2307                         self._downloader.trouble(u'ERROR: unable to extract title')
2308                         return
2309                 video_uploader = mobj.group(1).decode('utf-8')
2310
2311                 try:
2312                         # Process video information
2313                         self._downloader.process_info({
2314                                 'id':           video_id.decode('utf-8'),
2315                                 'url':          video_url.decode('utf-8'),
2316                                 'uploader':     video_uploader,
2317                                 'upload_date':  u'NA',
2318                                 'title':        video_title,
2319                                 'stitle':       simple_title,
2320                                 'ext':          video_extension.decode('utf-8'),
2321                                 'format':       u'NA',
2322                                 'player_url':   None,
2323                         })
2324                 except UnavailableVideoError, err:
2325                         self._downloader.trouble(u'\nERROR: unable to download video')
2326
2327
2328 class YoutubeSearchIE(InfoExtractor):
2329         """Information Extractor for YouTube search queries."""
2330         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2331         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2332         _youtube_ie = None
2333         _max_youtube_results = 1000
2334         IE_NAME = u'youtube:search'
2335
2336         def __init__(self, youtube_ie, downloader=None):
2337                 InfoExtractor.__init__(self, downloader)
2338                 self._youtube_ie = youtube_ie
2339
2340         def report_download_page(self, query, pagenum):
2341                 """Report attempt to download playlist page with given number."""
2342                 query = query.decode(preferredencoding())
2343                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2344
2345         def _real_initialize(self):
2346                 self._youtube_ie.initialize()
2347
2348         def _real_extract(self, query):
2349                 mobj = re.match(self._VALID_URL, query)
2350                 if mobj is None:
2351                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2352                         return
2353
2354                 prefix, query = query.split(':')
2355                 prefix = prefix[8:]
2356                 query = query.encode('utf-8')
2357                 if prefix == '':
2358                         self._download_n_results(query, 1)
2359                         return
2360                 elif prefix == 'all':
2361                         self._download_n_results(query, self._max_youtube_results)
2362                         return
2363                 else:
2364                         try:
2365                                 n = long(prefix)
2366                                 if n <= 0:
2367                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2368                                         return
2369                                 elif n > self._max_youtube_results:
2370                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2371                                         n = self._max_youtube_results
2372                                 self._download_n_results(query, n)
2373                                 return
2374                         except ValueError: # parsing prefix as integer fails
2375                                 self._download_n_results(query, 1)
2376                                 return
2377
2378         def _download_n_results(self, query, n):
2379                 """Downloads a specified number of results for a query"""
2380
2381                 video_ids = []
2382                 pagenum = 0
2383                 limit = n
2384
2385                 while (50 * pagenum) < limit:
2386                         self.report_download_page(query, pagenum+1)
2387                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2388                         request = urllib2.Request(result_url)
2389                         try:
2390                                 data = urllib2.urlopen(request).read()
2391                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2392                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2393                                 return
2394                         api_response = json.loads(data)['data']
2395
2396                         new_ids = list(video['id'] for video in api_response['items'])
2397                         video_ids += new_ids
2398
2399                         limit = min(n, api_response['totalItems'])
2400                         pagenum += 1
2401
2402                 if len(video_ids) > n:
2403                         video_ids = video_ids[:n]
2404                 for id in video_ids:
2405                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2406                 return
2407
2408
2409 class GoogleSearchIE(InfoExtractor):
2410         """Information Extractor for Google Video search queries."""
2411         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2412         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2413         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2414         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2415         _google_ie = None
2416         _max_google_results = 1000
2417         IE_NAME = u'video.google:search'
2418
2419         def __init__(self, google_ie, downloader=None):
2420                 InfoExtractor.__init__(self, downloader)
2421                 self._google_ie = google_ie
2422
2423         def report_download_page(self, query, pagenum):
2424                 """Report attempt to download playlist page with given number."""
2425                 query = query.decode(preferredencoding())
2426                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2427
2428         def _real_initialize(self):
2429                 self._google_ie.initialize()
2430
2431         def _real_extract(self, query):
2432                 mobj = re.match(self._VALID_URL, query)
2433                 if mobj is None:
2434                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2435                         return
2436
2437                 prefix, query = query.split(':')
2438                 prefix = prefix[8:]
2439                 query = query.encode('utf-8')
2440                 if prefix == '':
2441                         self._download_n_results(query, 1)
2442                         return
2443                 elif prefix == 'all':
2444                         self._download_n_results(query, self._max_google_results)
2445                         return
2446                 else:
2447                         try:
2448                                 n = long(prefix)
2449                                 if n <= 0:
2450                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2451                                         return
2452                                 elif n > self._max_google_results:
2453                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2454                                         n = self._max_google_results
2455                                 self._download_n_results(query, n)
2456                                 return
2457                         except ValueError: # parsing prefix as integer fails
2458                                 self._download_n_results(query, 1)
2459                                 return
2460
2461         def _download_n_results(self, query, n):
2462                 """Downloads a specified number of results for a query"""
2463
2464                 video_ids = []
2465                 pagenum = 0
2466
2467                 while True:
2468                         self.report_download_page(query, pagenum)
2469                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2470                         request = urllib2.Request(result_url)
2471                         try:
2472                                 page = urllib2.urlopen(request).read()
2473                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2474                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2475                                 return
2476
2477                         # Extract video identifiers
2478                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2479                                 video_id = mobj.group(1)
2480                                 if video_id not in video_ids:
2481                                         video_ids.append(video_id)
2482                                         if len(video_ids) == n:
2483                                                 # Specified n videos reached
2484                                                 for id in video_ids:
2485                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2486                                                 return
2487
2488                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2489                                 for id in video_ids:
2490                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2491                                 return
2492
2493                         pagenum = pagenum + 1
2494
2495
2496 class YahooSearchIE(InfoExtractor):
2497         """Information Extractor for Yahoo! Video search queries."""
2498         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2499         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2500         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2501         _MORE_PAGES_INDICATOR = r'\s*Next'
2502         _yahoo_ie = None
2503         _max_yahoo_results = 1000
2504         IE_NAME = u'video.yahoo:search'
2505
2506         def __init__(self, yahoo_ie, downloader=None):
2507                 InfoExtractor.__init__(self, downloader)
2508                 self._yahoo_ie = yahoo_ie
2509
2510         def report_download_page(self, query, pagenum):
2511                 """Report attempt to download playlist page with given number."""
2512                 query = query.decode(preferredencoding())
2513                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2514
2515         def _real_initialize(self):
2516                 self._yahoo_ie.initialize()
2517
2518         def _real_extract(self, query):
2519                 mobj = re.match(self._VALID_URL, query)
2520                 if mobj is None:
2521                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2522                         return
2523
2524                 prefix, query = query.split(':')
2525                 prefix = prefix[8:]
2526                 query = query.encode('utf-8')
2527                 if prefix == '':
2528                         self._download_n_results(query, 1)
2529                         return
2530                 elif prefix == 'all':
2531                         self._download_n_results(query, self._max_yahoo_results)
2532                         return
2533                 else:
2534                         try:
2535                                 n = long(prefix)
2536                                 if n <= 0:
2537                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2538                                         return
2539                                 elif n > self._max_yahoo_results:
2540                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2541                                         n = self._max_yahoo_results
2542                                 self._download_n_results(query, n)
2543                                 return
2544                         except ValueError: # parsing prefix as integer fails
2545                                 self._download_n_results(query, 1)
2546                                 return
2547
2548         def _download_n_results(self, query, n):
2549                 """Downloads a specified number of results for a query"""
2550
2551                 video_ids = []
2552                 already_seen = set()
2553                 pagenum = 1
2554
2555                 while True:
2556                         self.report_download_page(query, pagenum)
2557                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2558                         request = urllib2.Request(result_url)
2559                         try:
2560                                 page = urllib2.urlopen(request).read()
2561                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2562                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2563                                 return
2564
2565                         # Extract video identifiers
2566                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2567                                 video_id = mobj.group(1)
2568                                 if video_id not in already_seen:
2569                                         video_ids.append(video_id)
2570                                         already_seen.add(video_id)
2571                                         if len(video_ids) == n:
2572                                                 # Specified n videos reached
2573                                                 for id in video_ids:
2574                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2575                                                 return
2576
2577                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2578                                 for id in video_ids:
2579                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2580                                 return
2581
2582                         pagenum = pagenum + 1
2583
2584
2585 class YoutubePlaylistIE(InfoExtractor):
2586         """Information Extractor for YouTube playlists."""
2587
2588         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2589         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2590         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
2591         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2592         _youtube_ie = None
2593         IE_NAME = u'youtube:playlist'
2594
2595         def __init__(self, youtube_ie, downloader=None):
2596                 InfoExtractor.__init__(self, downloader)
2597                 self._youtube_ie = youtube_ie
2598
2599         def report_download_page(self, playlist_id, pagenum):
2600                 """Report attempt to download playlist page with given number."""
2601                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2602
2603         def _real_initialize(self):
2604                 self._youtube_ie.initialize()
2605
2606         def _real_extract(self, url):
2607                 # Extract playlist id
2608                 mobj = re.match(self._VALID_URL, url)
2609                 if mobj is None:
2610                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2611                         return
2612
2613                 # Single video case
2614                 if mobj.group(3) is not None:
2615                         self._youtube_ie.extract(mobj.group(3))
2616                         return
2617
2618                 # Download playlist pages
2619                 # prefix is 'p' as default for playlists but there are other types that need extra care
2620                 playlist_prefix = mobj.group(1)
2621                 if playlist_prefix == 'a':
2622                         playlist_access = 'artist'
2623                 else:
2624                         playlist_prefix = 'p'
2625                         playlist_access = 'view_play_list'
2626                 playlist_id = mobj.group(2)
2627                 video_ids = []
2628                 pagenum = 1
2629
2630                 while True:
2631                         self.report_download_page(playlist_id, pagenum)
2632                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2633                         request = urllib2.Request(url)
2634                         try:
2635                                 page = urllib2.urlopen(request).read()
2636                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2637                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2638                                 return
2639
2640                         # Extract video identifiers
2641                         ids_in_page = []
2642                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2643                                 if mobj.group(1) not in ids_in_page:
2644                                         ids_in_page.append(mobj.group(1))
2645                         video_ids.extend(ids_in_page)
2646
2647                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2648                                 break
2649                         pagenum = pagenum + 1
2650
2651                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2652                 playlistend = self._downloader.params.get('playlistend', -1)
2653                 if playlistend == -1:
2654                         video_ids = video_ids[playliststart:]
2655                 else:
2656                         video_ids = video_ids[playliststart:playlistend]
2657
2658                 for id in video_ids:
2659                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2660                 return
2661
2662
2663 class YoutubeUserIE(InfoExtractor):
2664         """Information Extractor for YouTube users."""
2665
2666         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2667         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2668         _GDATA_PAGE_SIZE = 50
2669         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2670         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2671         _youtube_ie = None
2672         IE_NAME = u'youtube:user'
2673
2674         def __init__(self, youtube_ie, downloader=None):
2675                 InfoExtractor.__init__(self, downloader)
2676                 self._youtube_ie = youtube_ie
2677
2678         def report_download_page(self, username, start_index):
2679                 """Report attempt to download user page."""
2680                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2681                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2682
2683         def _real_initialize(self):
2684                 self._youtube_ie.initialize()
2685
2686         def _real_extract(self, url):
2687                 # Extract username
2688                 mobj = re.match(self._VALID_URL, url)
2689                 if mobj is None:
2690                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2691                         return
2692
2693                 username = mobj.group(1)
2694
2695                 # Download video ids using YouTube Data API. Result size per
2696                 # query is limited (currently to 50 videos) so we need to query
2697                 # page by page until there are no video ids - it means we got
2698                 # all of them.
2699
2700                 video_ids = []
2701                 pagenum = 0
2702
2703                 while True:
2704                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2705                         self.report_download_page(username, start_index)
2706
2707                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2708
2709                         try:
2710                                 page = urllib2.urlopen(request).read()
2711                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2712                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2713                                 return
2714
2715                         # Extract video identifiers
2716                         ids_in_page = []
2717
2718                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2719                                 if mobj.group(1) not in ids_in_page:
2720                                         ids_in_page.append(mobj.group(1))
2721
2722                         video_ids.extend(ids_in_page)
2723
2724                         # A little optimization - if current page is not
2725                         # "full", ie. does not contain PAGE_SIZE video ids then
2726                         # we can assume that this page is the last one - there
2727                         # are no more ids on further pages - no need to query
2728                         # again.
2729
2730                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2731                                 break
2732
2733                         pagenum += 1
2734
2735                 all_ids_count = len(video_ids)
2736                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2737                 playlistend = self._downloader.params.get('playlistend', -1)
2738
2739                 if playlistend == -1:
2740                         video_ids = video_ids[playliststart:]
2741                 else:
2742                         video_ids = video_ids[playliststart:playlistend]
2743
2744                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2745                                 (username, all_ids_count, len(video_ids)))
2746
2747                 for video_id in video_ids:
2748                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2749
2750
2751 class DepositFilesIE(InfoExtractor):
2752         """Information extractor for depositfiles.com"""
2753
2754         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2755         IE_NAME = u'DepositFiles'
2756
2757         def __init__(self, downloader=None):
2758                 InfoExtractor.__init__(self, downloader)
2759
2760         def report_download_webpage(self, file_id):
2761                 """Report webpage download."""
2762                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2763
2764         def report_extraction(self, file_id):
2765                 """Report information extraction."""
2766                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2767
2768         def _real_extract(self, url):
2769                 # At this point we have a new file
2770                 self._downloader.increment_downloads()
2771
2772                 file_id = url.split('/')[-1]
2773                 # Rebuild url in english locale
2774                 url = 'http://depositfiles.com/en/files/' + file_id
2775
2776                 # Retrieve file webpage with 'Free download' button pressed
2777                 free_download_indication = { 'gateway_result' : '1' }
2778                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2779                 try:
2780                         self.report_download_webpage(file_id)
2781                         webpage = urllib2.urlopen(request).read()
2782                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2783                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2784                         return
2785
2786                 # Search for the real file URL
2787                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2788                 if (mobj is None) or (mobj.group(1) is None):
2789                         # Try to figure out reason of the error.
2790                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2791                         if (mobj is not None) and (mobj.group(1) is not None):
2792                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2793                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2794                         else:
2795                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2796                         return
2797
2798                 file_url = mobj.group(1)
2799                 file_extension = os.path.splitext(file_url)[1][1:]
2800
2801                 # Search for file title
2802                 mobj = re.search(r'<b title="(.*?)">', webpage)
2803                 if mobj is None:
2804                         self._downloader.trouble(u'ERROR: unable to extract title')
2805                         return
2806                 file_title = mobj.group(1).decode('utf-8')
2807
2808                 try:
2809                         # Process file information
2810                         self._downloader.process_info({
2811                                 'id':           file_id.decode('utf-8'),
2812                                 'url':          file_url.decode('utf-8'),
2813                                 'uploader':     u'NA',
2814                                 'upload_date':  u'NA',
2815                                 'title':        file_title,
2816                                 'stitle':       file_title,
2817                                 'ext':          file_extension.decode('utf-8'),
2818                                 'format':       u'NA',
2819                                 'player_url':   None,
2820                         })
2821                 except UnavailableVideoError, err:
2822                         self._downloader.trouble(u'ERROR: unable to download file')
2823
2824
2825 class FacebookIE(InfoExtractor):
2826         """Information Extractor for Facebook"""
2827
2828         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2829         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2830         _NETRC_MACHINE = 'facebook'
2831         _available_formats = ['video', 'highqual', 'lowqual']
2832         _video_extensions = {
2833                 'video': 'mp4',
2834                 'highqual': 'mp4',
2835                 'lowqual': 'mp4',
2836         }
2837         IE_NAME = u'facebook'
2838
2839         def __init__(self, downloader=None):
2840                 InfoExtractor.__init__(self, downloader)
2841
2842         def _reporter(self, message):
2843                 """Add header and report message."""
2844                 self._downloader.to_screen(u'[facebook] %s' % message)
2845
2846         def report_login(self):
2847                 """Report attempt to log in."""
2848                 self._reporter(u'Logging in')
2849
2850         def report_video_webpage_download(self, video_id):
2851                 """Report attempt to download video webpage."""
2852                 self._reporter(u'%s: Downloading video webpage' % video_id)
2853
2854         def report_information_extraction(self, video_id):
2855                 """Report attempt to extract video information."""
2856                 self._reporter(u'%s: Extracting video information' % video_id)
2857
2858         def _parse_page(self, video_webpage):
2859                 """Extract video information from page"""
2860                 # General data
2861                 data = {'title': r'\("video_title", "(.*?)"\)',
2862                         'description': r'<div class="datawrap">(.*?)</div>',
2863                         'owner': r'\("video_owner_name", "(.*?)"\)',
2864                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2865                         }
2866                 video_info = {}
2867                 for piece in data.keys():
2868                         mobj = re.search(data[piece], video_webpage)
2869                         if mobj is not None:
2870                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2871
2872                 # Video urls
2873                 video_urls = {}
2874                 for fmt in self._available_formats:
2875                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2876                         if mobj is not None:
2877                                 # URL is in a Javascript segment inside an escaped Unicode format within
2878                                 # the generally utf-8 page
2879                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2880                 video_info['video_urls'] = video_urls
2881
2882                 return video_info
2883
2884         def _real_initialize(self):
2885                 if self._downloader is None:
2886                         return
2887
2888                 useremail = None
2889                 password = None
2890                 downloader_params = self._downloader.params
2891
2892                 # Attempt to use provided username and password or .netrc data
2893                 if downloader_params.get('username', None) is not None:
2894                         useremail = downloader_params['username']
2895                         password = downloader_params['password']
2896                 elif downloader_params.get('usenetrc', False):
2897                         try:
2898                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2899                                 if info is not None:
2900                                         useremail = info[0]
2901                                         password = info[2]
2902                                 else:
2903                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2904                         except (IOError, netrc.NetrcParseError), err:
2905                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2906                                 return
2907
2908                 if useremail is None:
2909                         return
2910
2911                 # Log in
2912                 login_form = {
2913                         'email': useremail,
2914                         'pass': password,
2915                         'login': 'Log+In'
2916                         }
2917                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2918                 try:
2919                         self.report_login()
2920                         login_results = urllib2.urlopen(request).read()
2921                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2922                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2923                                 return
2924                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2925                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2926                         return
2927
2928         def _real_extract(self, url):
2929                 mobj = re.match(self._VALID_URL, url)
2930                 if mobj is None:
2931                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2932                         return
2933                 video_id = mobj.group('ID')
2934
2935                 # Get video webpage
2936                 self.report_video_webpage_download(video_id)
2937                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2938                 try:
2939                         page = urllib2.urlopen(request)
2940                         video_webpage = page.read()
2941                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2942                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2943                         return
2944
2945                 # Start extracting information
2946                 self.report_information_extraction(video_id)
2947
2948                 # Extract information
2949                 video_info = self._parse_page(video_webpage)
2950
2951                 # uploader
2952                 if 'owner' not in video_info:
2953                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2954                         return
2955                 video_uploader = video_info['owner']
2956
2957                 # title
2958                 if 'title' not in video_info:
2959                         self._downloader.trouble(u'ERROR: unable to extract video title')
2960                         return
2961                 video_title = video_info['title']
2962                 video_title = video_title.decode('utf-8')
2963                 video_title = sanitize_title(video_title)
2964
2965                 simple_title = _simplify_title(video_title)
2966
2967                 # thumbnail image
2968                 if 'thumbnail' not in video_info:
2969                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2970                         video_thumbnail = ''
2971                 else:
2972                         video_thumbnail = video_info['thumbnail']
2973
2974                 # upload date
2975                 upload_date = u'NA'
2976                 if 'upload_date' in video_info:
2977                         upload_time = video_info['upload_date']
2978                         timetuple = email.utils.parsedate_tz(upload_time)
2979                         if timetuple is not None:
2980                                 try:
2981                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2982                                 except:
2983                                         pass
2984
2985                 # description
2986                 video_description = video_info.get('description', 'No description available.')
2987
2988                 url_map = video_info['video_urls']
2989                 if len(url_map.keys()) > 0:
2990                         # Decide which formats to download
2991                         req_format = self._downloader.params.get('format', None)
2992                         format_limit = self._downloader.params.get('format_limit', None)
2993
2994                         if format_limit is not None and format_limit in self._available_formats:
2995                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2996                         else:
2997                                 format_list = self._available_formats
2998                         existing_formats = [x for x in format_list if x in url_map]
2999                         if len(existing_formats) == 0:
3000                                 self._downloader.trouble(u'ERROR: no known formats available for video')
3001                                 return
3002                         if req_format is None:
3003                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3004                         elif req_format == 'worst':
3005                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3006                         elif req_format == '-1':
3007                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3008                         else:
3009                                 # Specific format
3010                                 if req_format not in url_map:
3011                                         self._downloader.trouble(u'ERROR: requested format not available')
3012                                         return
3013                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
3014
3015                 for format_param, video_real_url in video_url_list:
3016
3017                         # At this point we have a new video
3018                         self._downloader.increment_downloads()
3019
3020                         # Extension
3021                         video_extension = self._video_extensions.get(format_param, 'mp4')
3022
3023                         try:
3024                                 # Process video information
3025                                 self._downloader.process_info({
3026                                         'id':           video_id.decode('utf-8'),
3027                                         'url':          video_real_url.decode('utf-8'),
3028                                         'uploader':     video_uploader.decode('utf-8'),
3029                                         'upload_date':  upload_date,
3030                                         'title':        video_title,
3031                                         'stitle':       simple_title,
3032                                         'ext':          video_extension.decode('utf-8'),
3033                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3034                                         'thumbnail':    video_thumbnail.decode('utf-8'),
3035                                         'description':  video_description.decode('utf-8'),
3036                                         'player_url':   None,
3037                                 })
3038                         except UnavailableVideoError, err:
3039                                 self._downloader.trouble(u'\nERROR: unable to download video')
3040
3041 class BlipTVIE(InfoExtractor):
3042         """Information extractor for blip.tv"""
3043
3044         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
3045         _URL_EXT = r'^.*\.([a-z0-9]+)$'
3046         IE_NAME = u'blip.tv'
3047
3048         def report_extraction(self, file_id):
3049                 """Report information extraction."""
3050                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3051
3052         def report_direct_download(self, title):
3053                 """Report information extraction."""
3054                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3055
3056         def _real_extract(self, url):
3057                 mobj = re.match(self._VALID_URL, url)
3058                 if mobj is None:
3059                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3060                         return
3061
3062                 if '?' in url:
3063                         cchar = '&'
3064                 else:
3065                         cchar = '?'
3066                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3067                 request = urllib2.Request(json_url)
3068                 self.report_extraction(mobj.group(1))
3069                 info = None
3070                 try:
3071                         urlh = urllib2.urlopen(request)
3072                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3073                                 basename = url.split('/')[-1]
3074                                 title,ext = os.path.splitext(basename)
3075                                 title = title.decode('UTF-8')
3076                                 ext = ext.replace('.', '')
3077                                 self.report_direct_download(title)
3078                                 info = {
3079                                         'id': title,
3080                                         'url': url,
3081                                         'title': title,
3082                                         'stitle': _simplify_title(title),
3083                                         'ext': ext,
3084                                         'urlhandle': urlh
3085                                 }
3086                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3087                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3088                         return
3089                 if info is None: # Regular URL
3090                         try:
3091                                 json_code = urlh.read()
3092                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3093                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3094                                 return
3095
3096                         try:
3097                                 json_data = json.loads(json_code)
3098                                 if 'Post' in json_data:
3099                                         data = json_data['Post']
3100                                 else:
3101                                         data = json_data
3102
3103                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3104                                 video_url = data['media']['url']
3105                                 umobj = re.match(self._URL_EXT, video_url)
3106                                 if umobj is None:
3107                                         raise ValueError('Can not determine filename extension')
3108                                 ext = umobj.group(1)
3109
3110                                 info = {
3111                                         'id': data['item_id'],
3112                                         'url': video_url,
3113                                         'uploader': data['display_name'],
3114                                         'upload_date': upload_date,
3115                                         'title': data['title'],
3116                                         'stitle': _simplify_title(data['title']),
3117                                         'ext': ext,
3118                                         'format': data['media']['mimeType'],
3119                                         'thumbnail': data['thumbnailUrl'],
3120                                         'description': data['description'],
3121                                         'player_url': data['embedUrl']
3122                                 }
3123                         except (ValueError,KeyError), err:
3124                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3125                                 return
3126
3127                 self._downloader.increment_downloads()
3128
3129                 try:
3130                         self._downloader.process_info(info)
3131                 except UnavailableVideoError, err:
3132                         self._downloader.trouble(u'\nERROR: unable to download video')
3133
3134
3135 class MyVideoIE(InfoExtractor):
3136         """Information Extractor for myvideo.de."""
3137
3138         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3139         IE_NAME = u'myvideo'
3140
3141         def __init__(self, downloader=None):
3142                 InfoExtractor.__init__(self, downloader)
3143
3144         def report_download_webpage(self, video_id):
3145                 """Report webpage download."""
3146                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3147
3148         def report_extraction(self, video_id):
3149                 """Report information extraction."""
3150                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3151
3152         def _real_extract(self,url):
3153                 mobj = re.match(self._VALID_URL, url)
3154                 if mobj is None:
3155                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3156                         return
3157
3158                 video_id = mobj.group(1)
3159
3160                 # Get video webpage
3161                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3162                 try:
3163                         self.report_download_webpage(video_id)
3164                         webpage = urllib2.urlopen(request).read()
3165                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3166                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3167                         return
3168
3169                 self.report_extraction(video_id)
3170                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3171                                  webpage)
3172                 if mobj is None:
3173                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3174                         return
3175                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3176
3177                 mobj = re.search('<title>([^<]+)</title>', webpage)
3178                 if mobj is None:
3179                         self._downloader.trouble(u'ERROR: unable to extract title')
3180                         return
3181
3182                 video_title = mobj.group(1)
3183                 video_title = sanitize_title(video_title)
3184
3185                 simple_title = _simplify_title(video_title)
3186
3187                 try:
3188                         self._downloader.process_info({
3189                                 'id':           video_id,
3190                                 'url':          video_url,
3191                                 'uploader':     u'NA',
3192                                 'upload_date':  u'NA',
3193                                 'title':        video_title,
3194                                 'stitle':       simple_title,
3195                                 'ext':          u'flv',
3196                                 'format':       u'NA',
3197                                 'player_url':   None,
3198                         })
3199                 except UnavailableVideoError:
3200                         self._downloader.trouble(u'\nERROR: Unable to download video')
3201
3202 class ComedyCentralIE(InfoExtractor):
3203         """Information extractor for The Daily Show and Colbert Report """
3204
3205         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3206         IE_NAME = u'comedycentral'
3207
3208         def report_extraction(self, episode_id):
3209                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3210
3211         def report_config_download(self, episode_id):
3212                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3213
3214         def report_index_download(self, episode_id):
3215                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3216
3217         def report_player_url(self, episode_id):
3218                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3219
3220         def _real_extract(self, url):
3221                 mobj = re.match(self._VALID_URL, url)
3222                 if mobj is None:
3223                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3224                         return
3225
3226                 if mobj.group('shortname'):
3227                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
3228                                 url = u'http://www.thedailyshow.com/full-episodes/'
3229                         else:
3230                                 url = u'http://www.colbertnation.com/full-episodes/'
3231                         mobj = re.match(self._VALID_URL, url)
3232                         assert mobj is not None
3233
3234                 dlNewest = not mobj.group('episode')
3235                 if dlNewest:
3236                         epTitle = mobj.group('showname')
3237                 else:
3238                         epTitle = mobj.group('episode')
3239
3240                 req = urllib2.Request(url)
3241                 self.report_extraction(epTitle)
3242                 try:
3243                         htmlHandle = urllib2.urlopen(req)
3244                         html = htmlHandle.read()
3245                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3246                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3247                         return
3248                 if dlNewest:
3249                         url = htmlHandle.geturl()
3250                         mobj = re.match(self._VALID_URL, url)
3251                         if mobj is None:
3252                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3253                                 return
3254                         if mobj.group('episode') == '':
3255                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3256                                 return
3257                         epTitle = mobj.group('episode')
3258
3259                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3260                 if len(mMovieParams) == 0:
3261                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3262                         return
3263
3264                 playerUrl_raw = mMovieParams[0][0]
3265                 self.report_player_url(epTitle)
3266                 try:
3267                         urlHandle = urllib2.urlopen(playerUrl_raw)
3268                         playerUrl = urlHandle.geturl()
3269                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3270                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3271                         return
3272
3273                 uri = mMovieParams[0][1]
3274                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3275                 self.report_index_download(epTitle)
3276                 try:
3277                         indexXml = urllib2.urlopen(indexUrl).read()
3278                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3279                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3280                         return
3281
3282                 idoc = xml.etree.ElementTree.fromstring(indexXml)
3283                 itemEls = idoc.findall('.//item')
3284                 for itemEl in itemEls:
3285                         mediaId = itemEl.findall('./guid')[0].text
3286                         shortMediaId = mediaId.split(':')[-1]
3287                         showId = mediaId.split(':')[-2].replace('.com', '')
3288                         officialTitle = itemEl.findall('./title')[0].text
3289                         officialDate = itemEl.findall('./pubDate')[0].text
3290
3291                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3292                                                 urllib.urlencode({'uri': mediaId}))
3293                         configReq = urllib2.Request(configUrl)
3294                         self.report_config_download(epTitle)
3295                         try:
3296                                 configXml = urllib2.urlopen(configReq).read()
3297                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3298                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3299                                 return
3300
3301                         cdoc = xml.etree.ElementTree.fromstring(configXml)
3302                         turls = []
3303                         for rendition in cdoc.findall('.//rendition'):
3304                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3305                                 turls.append(finfo)
3306
3307                         if len(turls) == 0:
3308                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3309                                 continue
3310
3311                         # For now, just pick the highest bitrate
3312                         format,video_url = turls[-1]
3313
3314                         self._downloader.increment_downloads()
3315
3316                         effTitle = showId + u'-' + epTitle
3317                         info = {
3318                                 'id': shortMediaId,
3319                                 'url': video_url,
3320                                 'uploader': showId,
3321                                 'upload_date': officialDate,
3322                                 'title': effTitle,
3323                                 'stitle': _simplify_title(effTitle),
3324                                 'ext': 'mp4',
3325                                 'format': format,
3326                                 'thumbnail': None,
3327                                 'description': officialTitle,
3328                                 'player_url': playerUrl
3329                         }
3330
3331                         try:
3332                                 self._downloader.process_info(info)
3333                         except UnavailableVideoError, err:
3334                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3335                                 continue
3336
3337
3338 class EscapistIE(InfoExtractor):
3339         """Information extractor for The Escapist """
3340
3341         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3342         IE_NAME = u'escapist'
3343
3344         def report_extraction(self, showName):
3345                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3346
3347         def report_config_download(self, showName):
3348                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3349
3350         def _real_extract(self, url):
3351                 htmlParser = HTMLParser.HTMLParser()
3352
3353                 mobj = re.match(self._VALID_URL, url)
3354                 if mobj is None:
3355                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3356                         return
3357                 showName = mobj.group('showname')
3358                 videoId = mobj.group('episode')
3359
3360                 self.report_extraction(showName)
3361                 try:
3362                         webPage = urllib2.urlopen(url).read()
3363                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3364                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3365                         return
3366
3367                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3368                 description = htmlParser.unescape(descMatch.group(1))
3369                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3370                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3371                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3372                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3373                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3374                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3375
3376                 self.report_config_download(showName)
3377                 try:
3378                         configJSON = urllib2.urlopen(configUrl).read()
3379                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3380                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3381                         return
3382
3383                 # Technically, it's JavaScript, not JSON
3384                 configJSON = configJSON.replace("'", '"')
3385
3386                 try:
3387                         config = json.loads(configJSON)
3388                 except (ValueError,), err:
3389                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3390                         return
3391
3392                 playlist = config['playlist']
3393                 videoUrl = playlist[1]['url']
3394
3395                 self._downloader.increment_downloads()
3396                 info = {
3397                         'id': videoId,
3398                         'url': videoUrl,
3399                         'uploader': showName,
3400                         'upload_date': None,
3401                         'title': showName,
3402                         'stitle': _simplify_title(showName),
3403                         'ext': 'flv',
3404                         'format': 'flv',
3405                         'thumbnail': imgUrl,
3406                         'description': description,
3407                         'player_url': playerUrl,
3408                 }
3409
3410                 try:
3411                         self._downloader.process_info(info)
3412                 except UnavailableVideoError, err:
3413                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3414
3415
3416 class CollegeHumorIE(InfoExtractor):
3417         """Information extractor for collegehumor.com"""
3418
3419         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3420         IE_NAME = u'collegehumor'
3421
3422         def report_webpage(self, video_id):
3423                 """Report information extraction."""
3424                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3425
3426         def report_extraction(self, video_id):
3427                 """Report information extraction."""
3428                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3429
3430         def _real_extract(self, url):
3431                 htmlParser = HTMLParser.HTMLParser()
3432
3433                 mobj = re.match(self._VALID_URL, url)
3434                 if mobj is None:
3435                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3436                         return
3437                 video_id = mobj.group('videoid')
3438
3439                 self.report_webpage(video_id)
3440                 request = urllib2.Request(url)
3441                 try:
3442                         webpage = urllib2.urlopen(request).read()
3443                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3444                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3445                         return
3446
3447                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3448                 if m is None:
3449                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3450                         return
3451                 internal_video_id = m.group('internalvideoid')
3452
3453                 info = {
3454                         'id': video_id,
3455                         'internal_id': internal_video_id,
3456                 }
3457
3458                 self.report_extraction(video_id)
3459                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3460                 try:
3461                         metaXml = urllib2.urlopen(xmlUrl).read()
3462                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3463                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3464                         return
3465
3466                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3467                 try:
3468                         videoNode = mdoc.findall('./video')[0]
3469                         info['description'] = videoNode.findall('./description')[0].text
3470                         info['title'] = videoNode.findall('./caption')[0].text
3471                         info['stitle'] = _simplify_title(info['title'])
3472                         info['url'] = videoNode.findall('./file')[0].text
3473                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3474                         info['ext'] = info['url'].rpartition('.')[2]
3475                         info['format'] = info['ext']
3476                 except IndexError:
3477                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3478                         return
3479
3480                 self._downloader.increment_downloads()
3481
3482                 try:
3483                         self._downloader.process_info(info)
3484                 except UnavailableVideoError, err:
3485                         self._downloader.trouble(u'\nERROR: unable to download video')
3486
3487
3488 class XVideosIE(InfoExtractor):
3489         """Information extractor for xvideos.com"""
3490
3491         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3492         IE_NAME = u'xvideos'
3493
3494         def report_webpage(self, video_id):
3495                 """Report information extraction."""
3496                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3497
3498         def report_extraction(self, video_id):
3499                 """Report information extraction."""
3500                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3501
3502         def _real_extract(self, url):
3503                 htmlParser = HTMLParser.HTMLParser()
3504
3505                 mobj = re.match(self._VALID_URL, url)
3506                 if mobj is None:
3507                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3508                         return
3509                 video_id = mobj.group(1).decode('utf-8')
3510
3511                 self.report_webpage(video_id)
3512
3513                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3514                 try:
3515                         webpage = urllib2.urlopen(request).read()
3516                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3517                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3518                         return
3519
3520                 self.report_extraction(video_id)
3521
3522
3523                 # Extract video URL
3524                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3525                 if mobj is None:
3526                         self._downloader.trouble(u'ERROR: unable to extract video url')
3527                         return
3528                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3529
3530
3531                 # Extract title
3532                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3533                 if mobj is None:
3534                         self._downloader.trouble(u'ERROR: unable to extract video title')
3535                         return
3536                 video_title = mobj.group(1).decode('utf-8')
3537
3538
3539                 # Extract video thumbnail
3540                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3541                 if mobj is None:
3542                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3543                         return
3544                 video_thumbnail = mobj.group(1).decode('utf-8')
3545
3546
3547
3548                 self._downloader.increment_downloads()
3549                 info = {
3550                         'id': video_id,
3551                         'url': video_url,
3552                         'uploader': None,
3553                         'upload_date': None,
3554                         'title': video_title,
3555                         'stitle': _simplify_title(video_title),
3556                         'ext': 'flv',
3557                         'format': 'flv',
3558                         'thumbnail': video_thumbnail,
3559                         'description': None,
3560                         'player_url': None,
3561                 }
3562
3563                 try:
3564                         self._downloader.process_info(info)
3565                 except UnavailableVideoError, err:
3566                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3567
3568
3569 class SoundcloudIE(InfoExtractor):
3570         """Information extractor for soundcloud.com
3571            To access the media, the uid of the song and a stream token
3572            must be extracted from the page source and the script must make
3573            a request to media.soundcloud.com/crossdomain.xml. Then
3574            the media can be grabbed by requesting from an url composed
3575            of the stream token and uid
3576          """
3577
3578         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3579         IE_NAME = u'soundcloud'
3580
3581         def __init__(self, downloader=None):
3582                 InfoExtractor.__init__(self, downloader)
3583
3584         def report_webpage(self, video_id):
3585                 """Report information extraction."""
3586                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3587
3588         def report_extraction(self, video_id):
3589                 """Report information extraction."""
3590                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3591
3592         def _real_extract(self, url):
3593                 htmlParser = HTMLParser.HTMLParser()
3594
3595                 mobj = re.match(self._VALID_URL, url)
3596                 if mobj is None:
3597                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3598                         return
3599
3600                 # extract uploader (which is in the url)
3601                 uploader = mobj.group(1).decode('utf-8')
3602                 # extract simple title (uploader + slug of song title)
3603                 slug_title =  mobj.group(2).decode('utf-8')
3604                 simple_title = uploader + '-' + slug_title
3605
3606                 self.report_webpage('%s/%s' % (uploader, slug_title))
3607
3608                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3609                 try:
3610                         webpage = urllib2.urlopen(request).read()
3611                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3612                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3613                         return
3614
3615                 self.report_extraction('%s/%s' % (uploader, slug_title))
3616
3617                 # extract uid and stream token that soundcloud hands out for access
3618                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3619                 if mobj:
3620                         video_id = mobj.group(1)
3621                         stream_token = mobj.group(2)
3622
3623                 # extract unsimplified title
3624                 mobj = re.search('"title":"(.*?)",', webpage)
3625                 if mobj:
3626                         title = mobj.group(1)
3627
3628                 # construct media url (with uid/token)
3629                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3630                 mediaURL = mediaURL % (video_id, stream_token)
3631
3632                 # description
3633                 description = u'No description available'
3634                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3635                 if mobj:
3636                         description = mobj.group(1)
3637
3638                 # upload date
3639                 upload_date = None
3640                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3641                 if mobj:
3642                         try:
3643                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3644                         except Exception, e:
3645                                 print str(e)
3646
3647                 # for soundcloud, a request to a cross domain is required for cookies
3648                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3649
3650                 try:
3651                         self._downloader.process_info({
3652                                 'id':           video_id.decode('utf-8'),
3653                                 'url':          mediaURL,
3654                                 'uploader':     uploader.decode('utf-8'),
3655                                 'upload_date':  upload_date,
3656                                 'title':        simple_title.decode('utf-8'),
3657                                 'stitle':       simple_title.decode('utf-8'),
3658                                 'ext':          u'mp3',
3659                                 'format':       u'NA',
3660                                 'player_url':   None,
3661                                 'description': description.decode('utf-8')
3662                         })
3663                 except UnavailableVideoError:
3664                         self._downloader.trouble(u'\nERROR: unable to download video')
3665
3666
3667 class InfoQIE(InfoExtractor):
3668         """Information extractor for infoq.com"""
3669
3670         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3671         IE_NAME = u'infoq'
3672
3673         def report_webpage(self, video_id):
3674                 """Report information extraction."""
3675                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3676
3677         def report_extraction(self, video_id):
3678                 """Report information extraction."""
3679                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3680
3681         def _real_extract(self, url):
3682                 htmlParser = HTMLParser.HTMLParser()
3683
3684                 mobj = re.match(self._VALID_URL, url)
3685                 if mobj is None:
3686                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3687                         return
3688
3689                 self.report_webpage(url)
3690
3691                 request = urllib2.Request(url)
3692                 try:
3693                         webpage = urllib2.urlopen(request).read()
3694                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3695                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3696                         return
3697
3698                 self.report_extraction(url)
3699
3700
3701                 # Extract video URL
3702                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3703                 if mobj is None:
3704                         self._downloader.trouble(u'ERROR: unable to extract video url')
3705                         return
3706                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3707
3708
3709                 # Extract title
3710                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3711                 if mobj is None:
3712                         self._downloader.trouble(u'ERROR: unable to extract video title')
3713                         return
3714                 video_title = mobj.group(1).decode('utf-8')
3715
3716                 # Extract description
3717                 video_description = u'No description available.'
3718                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3719                 if mobj is not None:
3720                         video_description = mobj.group(1).decode('utf-8')
3721
3722                 video_filename = video_url.split('/')[-1]
3723                 video_id, extension = video_filename.split('.')
3724
3725                 self._downloader.increment_downloads()
3726                 info = {
3727                         'id': video_id,
3728                         'url': video_url,
3729                         'uploader': None,
3730                         'upload_date': None,
3731                         'title': video_title,
3732                         'stitle': _simplify_title(video_title),
3733                         'ext': extension,
3734                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3735                         'thumbnail': None,
3736                         'description': video_description,
3737                         'player_url': None,
3738                 }
3739
3740                 try:
3741                         self._downloader.process_info(info)
3742                 except UnavailableVideoError, err:
3743                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3744
3745 class MixcloudIE(InfoExtractor):
3746         """Information extractor for www.mixcloud.com"""
3747         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3748         IE_NAME = u'mixcloud'
3749
3750         def __init__(self, downloader=None):
3751                 InfoExtractor.__init__(self, downloader)
3752
3753         def report_download_json(self, file_id):
3754                 """Report JSON download."""
3755                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3756
3757         def report_extraction(self, file_id):
3758                 """Report information extraction."""
3759                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3760
3761         def get_urls(self, jsonData, fmt, bitrate='best'):
3762                 """Get urls from 'audio_formats' section in json"""
3763                 file_url = None
3764                 try:
3765                         bitrate_list = jsonData[fmt]
3766                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3767                                 bitrate = max(bitrate_list) # select highest
3768
3769                         url_list = jsonData[fmt][bitrate]
3770                 except TypeError: # we have no bitrate info.
3771                         url_list = jsonData[fmt]
3772
3773                 return url_list
3774
3775         def check_urls(self, url_list):
3776                 """Returns 1st active url from list"""
3777                 for url in url_list:
3778                         try:
3779                                 urllib2.urlopen(url)
3780                                 return url
3781                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3782                                 url = None
3783
3784                 return None
3785
3786         def _print_formats(self, formats):
3787                 print 'Available formats:'
3788                 for fmt in formats.keys():
3789                         for b in formats[fmt]:
3790                                 try:
3791                                         ext = formats[fmt][b][0]
3792                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3793                                 except TypeError: # we have no bitrate info
3794                                         ext = formats[fmt][0]
3795                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3796                                         break
3797
3798         def _real_extract(self, url):
3799                 mobj = re.match(self._VALID_URL, url)
3800                 if mobj is None:
3801                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3802                         return
3803                 # extract uploader & filename from url
3804                 uploader = mobj.group(1).decode('utf-8')
3805                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3806
3807                 # construct API request
3808                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3809                 # retrieve .json file with links to files
3810                 request = urllib2.Request(file_url)
3811                 try:
3812                         self.report_download_json(file_url)
3813                         jsonData = urllib2.urlopen(request).read()
3814                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3815                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3816                         return
3817
3818                 # parse JSON
3819                 json_data = json.loads(jsonData)
3820                 player_url = json_data['player_swf_url']
3821                 formats = dict(json_data['audio_formats'])
3822
3823                 req_format = self._downloader.params.get('format', None)
3824                 bitrate = None
3825
3826                 if self._downloader.params.get('listformats', None):
3827                         self._print_formats(formats)
3828                         return
3829
3830                 if req_format is None or req_format == 'best':
3831                         for format_param in formats.keys():
3832                                 url_list = self.get_urls(formats, format_param)
3833                                 # check urls
3834                                 file_url = self.check_urls(url_list)
3835                                 if file_url is not None:
3836                                         break # got it!
3837                 else:
3838                         if req_format not in formats.keys():
3839                                 self._downloader.trouble(u'ERROR: format is not available')
3840                                 return
3841
3842                         url_list = self.get_urls(formats, req_format)
3843                         file_url = self.check_urls(url_list)
3844                         format_param = req_format
3845
3846                 # We have audio
3847                 self._downloader.increment_downloads()
3848                 try:
3849                         # Process file information
3850                         self._downloader.process_info({
3851                                 'id': file_id.decode('utf-8'),
3852                                 'url': file_url.decode('utf-8'),
3853                                 'uploader':     uploader.decode('utf-8'),
3854                                 'upload_date': u'NA',
3855                                 'title': json_data['name'],
3856                                 'stitle': _simplify_title(json_data['name']),
3857                                 'ext': file_url.split('.')[-1].decode('utf-8'),
3858                                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3859                                 'thumbnail': json_data['thumbnail_url'],
3860                                 'description': json_data['description'],
3861                                 'player_url': player_url.decode('utf-8'),
3862                         })
3863                 except UnavailableVideoError, err:
3864                         self._downloader.trouble(u'ERROR: unable to download file')
3865
3866 class StanfordOpenClassroomIE(InfoExtractor):
3867         """Information extractor for Stanford's Open ClassRoom"""
3868
3869         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3870         IE_NAME = u'stanfordoc'
3871
3872         def report_download_webpage(self, objid):
3873                 """Report information extraction."""
3874                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3875
3876         def report_extraction(self, video_id):
3877                 """Report information extraction."""
3878                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3879
3880         def _real_extract(self, url):
3881                 mobj = re.match(self._VALID_URL, url)
3882                 if mobj is None:
3883                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3884                         return
3885
3886                 if mobj.group('course') and mobj.group('video'): # A specific video
3887                         course = mobj.group('course')
3888                         video = mobj.group('video')
3889                         info = {
3890                                 'id': _simplify_title(course + '_' + video),
3891                         }
3892
3893                         self.report_extraction(info['id'])
3894                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3895                         xmlUrl = baseUrl + video + '.xml'
3896                         try:
3897                                 metaXml = urllib2.urlopen(xmlUrl).read()
3898                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3899                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3900                                 return
3901                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
3902                         try:
3903                                 info['title'] = mdoc.findall('./title')[0].text
3904                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3905                         except IndexError:
3906                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3907                                 return
3908                         info['stitle'] = _simplify_title(info['title'])
3909                         info['ext'] = info['url'].rpartition('.')[2]
3910                         info['format'] = info['ext']
3911                         self._downloader.increment_downloads()
3912                         try:
3913                                 self._downloader.process_info(info)
3914                         except UnavailableVideoError, err:
3915                                 self._downloader.trouble(u'\nERROR: unable to download video')
3916                 elif mobj.group('course'): # A course page
3917                         unescapeHTML = HTMLParser.HTMLParser().unescape
3918
3919                         course = mobj.group('course')
3920                         info = {
3921                                 'id': _simplify_title(course),
3922                                 'type': 'playlist',
3923                         }
3924
3925                         self.report_download_webpage(info['id'])
3926                         try:
3927                                 coursepage = urllib2.urlopen(url).read()
3928                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3929                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3930                                 return
3931
3932                         m = re.search('<h1>([^<]+)</h1>', coursepage)
3933                         if m:
3934                                 info['title'] = unescapeHTML(m.group(1))
3935                         else:
3936                                 info['title'] = info['id']
3937                         info['stitle'] = _simplify_title(info['title'])
3938
3939                         m = re.search('<description>([^<]+)</description>', coursepage)
3940                         if m:
3941                                 info['description'] = unescapeHTML(m.group(1))
3942
3943                         links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3944                         info['list'] = [
3945                                 {
3946                                         'type': 'reference',
3947                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3948                                 }
3949                                         for vpage in links]
3950
3951                         for entry in info['list']:
3952                                 assert entry['type'] == 'reference'
3953                                 self.extract(entry['url'])
3954                 else: # Root page
3955                         unescapeHTML = HTMLParser.HTMLParser().unescape
3956
3957                         info = {
3958                                 'id': 'Stanford OpenClassroom',
3959                                 'type': 'playlist',
3960                         }
3961
3962                         self.report_download_webpage(info['id'])
3963                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3964                         try:
3965                                 rootpage = urllib2.urlopen(rootURL).read()
3966                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3967                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3968                                 return
3969
3970                         info['title'] = info['id']
3971                         info['stitle'] = _simplify_title(info['title'])
3972
3973                         links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3974                         info['list'] = [
3975                                 {
3976                                         'type': 'reference',
3977                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3978                                 }
3979                                         for cpage in links]
3980
3981                         for entry in info['list']:
3982                                 assert entry['type'] == 'reference'
3983                                 self.extract(entry['url'])
3984
3985 class MTVIE(InfoExtractor):
3986         """Information extractor for MTV.com"""
3987
3988         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3989         IE_NAME = u'mtv'
3990
3991         def report_webpage(self, video_id):
3992                 """Report information extraction."""
3993                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3994
3995         def report_extraction(self, video_id):
3996                 """Report information extraction."""
3997                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3998
3999         def _real_extract(self, url):
4000                 mobj = re.match(self._VALID_URL, url)
4001                 if mobj is None:
4002                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4003                         return
4004                 if not mobj.group('proto'):
4005                         url = 'http://' + url
4006                 video_id = mobj.group('videoid')
4007                 self.report_webpage(video_id)
4008
4009                 request = urllib2.Request(url)
4010                 try:
4011                         webpage = urllib2.urlopen(request).read()
4012                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4013                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4014                         return
4015
4016                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4017                 if mobj is None:
4018                         self._downloader.trouble(u'ERROR: unable to extract song name')
4019                         return
4020                 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4021                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4022                 if mobj is None:
4023                         self._downloader.trouble(u'ERROR: unable to extract performer')
4024                         return
4025                 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4026                 video_title = performer + ' - ' + song_name
4027
4028                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4029                 if mobj is None:
4030                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4031                         return
4032                 mtvn_uri = mobj.group(1)
4033
4034                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4035                 if mobj is None:
4036                         self._downloader.trouble(u'ERROR: unable to extract content id')
4037                         return
4038                 content_id = mobj.group(1)
4039
4040                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4041                 self.report_extraction(video_id)
4042                 request = urllib2.Request(videogen_url)
4043                 try:
4044                         metadataXml = urllib2.urlopen(request).read()
4045                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4046                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4047                         return
4048
4049                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4050                 renditions = mdoc.findall('.//rendition')
4051
4052                 # For now, always pick the highest quality.
4053                 rendition = renditions[-1]
4054
4055                 try:
4056                         _,_,ext = rendition.attrib['type'].partition('/')
4057                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4058                         video_url = rendition.find('./src').text
4059                 except KeyError:
4060                         self._downloader.trouble('Invalid rendition field.')
4061                         return
4062
4063                 self._downloader.increment_downloads()
4064                 info = {
4065                         'id': video_id,
4066                         'url': video_url,
4067                         'uploader': performer,
4068                         'title': video_title,
4069                         'stitle': _simplify_title(video_title),
4070                         'ext': ext,
4071                         'format': format,
4072                 }
4073
4074                 try:
4075                         self._downloader.process_info(info)
4076                 except UnavailableVideoError, err:
4077                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4078
4079
4080 class PostProcessor(object):
4081         """Post Processor class.
4082
4083         PostProcessor objects can be added to downloaders with their
4084         add_post_processor() method. When the downloader has finished a
4085         successful download, it will take its internal chain of PostProcessors
4086         and start calling the run() method on each one of them, first with
4087         an initial argument and then with the returned value of the previous
4088         PostProcessor.
4089
4090         The chain will be stopped if one of them ever returns None or the end
4091         of the chain is reached.
4092
4093         PostProcessor objects follow a "mutual registration" process similar
4094         to InfoExtractor objects.
4095         """
4096
4097         _downloader = None
4098
4099         def __init__(self, downloader=None):
4100                 self._downloader = downloader
4101
4102         def set_downloader(self, downloader):
4103                 """Sets the downloader for this PP."""
4104                 self._downloader = downloader
4105
4106         def run(self, information):
4107                 """Run the PostProcessor.
4108
4109                 The "information" argument is a dictionary like the ones
4110                 composed by InfoExtractors. The only difference is that this
4111                 one has an extra field called "filepath" that points to the
4112                 downloaded file.
4113
4114                 When this method returns None, the postprocessing chain is
4115                 stopped. However, this method may return an information
4116                 dictionary that will be passed to the next postprocessing
4117                 object in the chain. It can be the one it received after
4118                 changing some fields.
4119
4120                 In addition, this method may raise a PostProcessingError
4121                 exception that will be taken into account by the downloader
4122                 it was called from.
4123                 """
4124                 return information # by default, do nothing
4125
4126 class AudioConversionError(BaseException):
4127         def __init__(self, message):
4128                 self.message = message
4129
4130 class FFmpegExtractAudioPP(PostProcessor):
4131
4132         def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4133                 PostProcessor.__init__(self, downloader)
4134                 if preferredcodec is None:
4135                         preferredcodec = 'best'
4136                 self._preferredcodec = preferredcodec
4137                 self._preferredquality = preferredquality
4138                 self._keepvideo = keepvideo
4139
4140         @staticmethod
4141         def get_audio_codec(path):
4142                 try:
4143                         cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4144                         handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4145                         output = handle.communicate()[0]
4146                         if handle.wait() != 0:
4147                                 return None
4148                 except (IOError, OSError):
4149                         return None
4150                 audio_codec = None
4151                 for line in output.split('\n'):
4152                         if line.startswith('codec_name='):
4153                                 audio_codec = line.split('=')[1].strip()
4154                         elif line.strip() == 'codec_type=audio' and audio_codec is not None:
4155                                 return audio_codec
4156                 return None
4157
4158         @staticmethod
4159         def run_ffmpeg(path, out_path, codec, more_opts):
4160                 if codec is None:
4161                         acodec_opts = []
4162                 else:
4163                         acodec_opts = ['-acodec', codec]
4164                 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4165                 try:
4166                         p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4167                         stdout,stderr = p.communicate()
4168                 except (IOError, OSError):
4169                         e = sys.exc_info()[1]
4170                         if isinstance(e, OSError) and e.errno == 2:
4171                                 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
4172                         else:
4173                                 raise e
4174                 if p.returncode != 0:
4175                         msg = stderr.strip().split('\n')[-1]
4176                         raise AudioConversionError(msg)
4177
4178         def run(self, information):
4179                 path = information['filepath']
4180
4181                 filecodec = self.get_audio_codec(path)
4182                 if filecodec is None:
4183                         self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
4184                         return None
4185
4186                 more_opts = []
4187                 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4188                         if self._preferredcodec == 'm4a' and filecodec == 'aac':
4189                                 # Lossless, but in another container
4190                                 acodec = 'copy'
4191                                 extension = self._preferredcodec
4192                                 more_opts = ['-absf', 'aac_adtstoasc']
4193                         elif filecodec in ['aac', 'mp3', 'vorbis']:
4194                                 # Lossless if possible
4195                                 acodec = 'copy'
4196                                 extension = filecodec
4197                                 if filecodec == 'aac':
4198                                         more_opts = ['-f', 'adts']
4199                                 if filecodec == 'vorbis':
4200                                         extension = 'ogg'
4201                         else:
4202                                 # MP3 otherwise.
4203                                 acodec = 'libmp3lame'
4204                                 extension = 'mp3'
4205                                 more_opts = []
4206                                 if self._preferredquality is not None:
4207                                         more_opts += ['-ab', self._preferredquality]
4208                 else:
4209                         # We convert the audio (lossy)
4210                         acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4211                         extension = self._preferredcodec
4212                         more_opts = []
4213                         if self._preferredquality is not None:
4214                                 more_opts += ['-ab', self._preferredquality]
4215                         if self._preferredcodec == 'aac':
4216                                 more_opts += ['-f', 'adts']
4217                         if self._preferredcodec == 'm4a':
4218                                 more_opts += ['-absf', 'aac_adtstoasc']
4219                         if self._preferredcodec == 'vorbis':
4220                                 extension = 'ogg'
4221                         if self._preferredcodec == 'wav':
4222                                 extension = 'wav'
4223                                 more_opts += ['-f', 'wav']
4224
4225                 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4226                 new_path = prefix + sep + extension
4227                 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4228                 try:
4229                         self.run_ffmpeg(path, new_path, acodec, more_opts)
4230                 except:
4231                         etype,e,tb = sys.exc_info()
4232                         if isinstance(e, AudioConversionError):
4233                                 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4234                         else:
4235                                 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4236                         return None
4237
4238                 # Try to update the date time for extracted audio file.
4239                 if information.get('filetime') is not None:
4240                         try:
4241                                 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4242                         except:
4243                                 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4244
4245                 if not self._keepvideo:
4246                         try:
4247                                 os.remove(_encodeFilename(path))
4248                         except (IOError, OSError):
4249                                 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
4250                                 return None
4251
4252                 information['filepath'] = new_path
4253                 return information
4254
4255
4256 def updateSelf(downloader, filename):
4257         ''' Update the program file with the latest version from the repository '''
4258         # Note: downloader only used for options
4259         if not os.access(filename, os.W_OK):
4260                 sys.exit('ERROR: no write permissions on %s' % filename)
4261
4262         downloader.to_screen(u'Updating to latest version...')
4263
4264         try:
4265                 try:
4266                         urlh = urllib.urlopen(UPDATE_URL)
4267                         newcontent = urlh.read()
4268
4269                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
4270                         if vmatch is not None and vmatch.group(1) == __version__:
4271                                 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4272                                 return
4273                 finally:
4274                         urlh.close()
4275         except (IOError, OSError), err:
4276                 sys.exit('ERROR: unable to download latest version')
4277
4278         try:
4279                 outf = open(filename, 'wb')
4280                 try:
4281                         outf.write(newcontent)
4282                 finally:
4283                         outf.close()
4284         except (IOError, OSError), err:
4285                 sys.exit('ERROR: unable to overwrite current version')
4286
4287         downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4288
4289 def parseOpts():
4290         def _readOptions(filename_bytes):
4291                 try:
4292                         optionf = open(filename_bytes)
4293                 except IOError:
4294                         return [] # silently skip if file is not present
4295                 try:
4296                         res = []
4297                         for l in optionf:
4298                                 res += shlex.split(l, comments=True)
4299                 finally:
4300                         optionf.close()
4301                 return res
4302
4303         def _format_option_string(option):
4304                 ''' ('-o', '--option') -> -o, --format METAVAR'''
4305
4306                 opts = []
4307
4308                 if option._short_opts: opts.append(option._short_opts[0])
4309                 if option._long_opts: opts.append(option._long_opts[0])
4310                 if len(opts) > 1: opts.insert(1, ', ')
4311
4312                 if option.takes_value(): opts.append(' %s' % option.metavar)
4313
4314                 return "".join(opts)
4315
4316         def _find_term_columns():
4317                 columns = os.environ.get('COLUMNS', None)
4318                 if columns:
4319                         return int(columns)
4320
4321                 try:
4322                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4323                         out,err = sp.communicate()
4324                         return int(out.split()[1])
4325                 except:
4326                         pass
4327                 return None
4328
4329         max_width = 80
4330         max_help_position = 80
4331
4332         # No need to wrap help messages if we're on a wide console
4333         columns = _find_term_columns()
4334         if columns: max_width = columns
4335
4336         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4337         fmt.format_option_strings = _format_option_string
4338
4339         kw = {
4340                 'version'   : __version__,
4341                 'formatter' : fmt,
4342                 'usage' : '%prog [options] url [url...]',
4343                 'conflict_handler' : 'resolve',
4344         }
4345
4346         parser = optparse.OptionParser(**kw)
4347
4348         # option groups
4349         general        = optparse.OptionGroup(parser, 'General Options')
4350         selection      = optparse.OptionGroup(parser, 'Video Selection')
4351         authentication = optparse.OptionGroup(parser, 'Authentication Options')
4352         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
4353         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
4354         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
4355         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4356
4357         general.add_option('-h', '--help',
4358                         action='help', help='print this help text and exit')
4359         general.add_option('-v', '--version',
4360                         action='version', help='print program version and exit')
4361         general.add_option('-U', '--update',
4362                         action='store_true', dest='update_self', help='update this program to latest version')
4363         general.add_option('-i', '--ignore-errors',
4364                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4365         general.add_option('-r', '--rate-limit',
4366                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4367         general.add_option('-R', '--retries',
4368                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4369         general.add_option('--dump-user-agent',
4370                         action='store_true', dest='dump_user_agent',
4371                         help='display the current browser identification', default=False)
4372         general.add_option('--list-extractors',
4373                         action='store_true', dest='list_extractors',
4374                         help='List all supported extractors and the URLs they would handle', default=False)
4375
4376         selection.add_option('--playlist-start',
4377                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4378         selection.add_option('--playlist-end',
4379                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4380         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4381         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4382         selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4383
4384         authentication.add_option('-u', '--username',
4385                         dest='username', metavar='USERNAME', help='account username')
4386         authentication.add_option('-p', '--password',
4387                         dest='password', metavar='PASSWORD', help='account password')
4388         authentication.add_option('-n', '--netrc',
4389                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4390
4391
4392         video_format.add_option('-f', '--format',
4393                         action='store', dest='format', metavar='FORMAT', help='video format code')
4394         video_format.add_option('--all-formats',
4395                         action='store_const', dest='format', help='download all available video formats', const='all')
4396         video_format.add_option('--prefer-free-formats',
4397                         action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4398         video_format.add_option('--max-quality',
4399                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4400         video_format.add_option('-F', '--list-formats',
4401                         action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4402         video_format.add_option('--write-srt',
4403                         action='store_true', dest='writesubtitles',
4404                         help='write video closed captions to a .srt file (currently youtube only)', default=False)
4405         video_format.add_option('--srt-lang',
4406                         action='store', dest='subtitleslang', metavar='LANG',
4407                         help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4408
4409
4410         verbosity.add_option('-q', '--quiet',
4411                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
4412         verbosity.add_option('-s', '--simulate',
4413                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4414         verbosity.add_option('--skip-download',
4415                         action='store_true', dest='skip_download', help='do not download the video', default=False)
4416         verbosity.add_option('-g', '--get-url',
4417                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4418         verbosity.add_option('-e', '--get-title',
4419                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4420         verbosity.add_option('--get-thumbnail',
4421                         action='store_true', dest='getthumbnail',
4422                         help='simulate, quiet but print thumbnail URL', default=False)
4423         verbosity.add_option('--get-description',
4424                         action='store_true', dest='getdescription',
4425                         help='simulate, quiet but print video description', default=False)
4426         verbosity.add_option('--get-filename',
4427                         action='store_true', dest='getfilename',
4428                         help='simulate, quiet but print output filename', default=False)
4429         verbosity.add_option('--get-format',
4430                         action='store_true', dest='getformat',
4431                         help='simulate, quiet but print output format', default=False)
4432         verbosity.add_option('--no-progress',
4433                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4434         verbosity.add_option('--console-title',
4435                         action='store_true', dest='consoletitle',
4436                         help='display progress in console titlebar', default=False)
4437         verbosity.add_option('-v', '--verbose',
4438                         action='store_true', dest='verbose', help='print various debugging information', default=False)
4439
4440
4441         filesystem.add_option('-t', '--title',
4442                         action='store_true', dest='usetitle', help='use title in file name', default=False)
4443         filesystem.add_option('-l', '--literal',
4444                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4445         filesystem.add_option('-A', '--auto-number',
4446                         action='store_true', dest='autonumber',
4447                         help='number downloaded files starting from 00000', default=False)
4448         filesystem.add_option('-o', '--output',
4449                         dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4450         filesystem.add_option('-a', '--batch-file',
4451                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4452         filesystem.add_option('-w', '--no-overwrites',
4453                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4454         filesystem.add_option('-c', '--continue',
4455                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4456         filesystem.add_option('--no-continue',
4457                         action='store_false', dest='continue_dl',
4458                         help='do not resume partially downloaded files (restart from beginning)')
4459         filesystem.add_option('--cookies',
4460                         dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4461         filesystem.add_option('--no-part',
4462                         action='store_true', dest='nopart', help='do not use .part files', default=False)
4463         filesystem.add_option('--no-mtime',
4464                         action='store_false', dest='updatetime',
4465                         help='do not use the Last-modified header to set the file modification time', default=True)
4466         filesystem.add_option('--write-description',
4467                         action='store_true', dest='writedescription',
4468                         help='write video description to a .description file', default=False)
4469         filesystem.add_option('--write-info-json',
4470                         action='store_true', dest='writeinfojson',
4471                         help='write video metadata to a .info.json file', default=False)
4472
4473
4474         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4475                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4476         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4477                         help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4478         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4479                         help='ffmpeg audio bitrate specification, 128k by default')
4480         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4481                         help='keeps the video file on disk after the post-processing; the video is erased by default')
4482
4483
4484         parser.add_option_group(general)
4485         parser.add_option_group(selection)
4486         parser.add_option_group(filesystem)
4487         parser.add_option_group(verbosity)
4488         parser.add_option_group(video_format)
4489         parser.add_option_group(authentication)
4490         parser.add_option_group(postproc)
4491
4492         xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4493         if xdg_config_home:
4494                 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4495         else:
4496                 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4497         argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4498         opts, args = parser.parse_args(argv)
4499
4500         return parser, opts, args
4501
4502 def gen_extractors():
4503         """ Return a list of an instance of every supported extractor.
4504         The order does matter; the first extractor matched is the one handling the URL.
4505         """
4506         youtube_ie = YoutubeIE()
4507         google_ie = GoogleIE()
4508         yahoo_ie = YahooIE()
4509         return [
4510                 YoutubePlaylistIE(youtube_ie),
4511                 YoutubeUserIE(youtube_ie),
4512                 YoutubeSearchIE(youtube_ie),
4513                 youtube_ie,
4514                 MetacafeIE(youtube_ie),
4515                 DailymotionIE(),
4516                 google_ie,
4517                 GoogleSearchIE(google_ie),
4518                 PhotobucketIE(),
4519                 yahoo_ie,
4520                 YahooSearchIE(yahoo_ie),
4521                 DepositFilesIE(),
4522                 FacebookIE(),
4523                 BlipTVIE(),
4524                 VimeoIE(),
4525                 MyVideoIE(),
4526                 ComedyCentralIE(),
4527                 EscapistIE(),
4528                 CollegeHumorIE(),
4529                 XVideosIE(),
4530                 SoundcloudIE(),
4531                 InfoQIE(),
4532                 MixcloudIE(),
4533                 StanfordOpenClassroomIE(),
4534                 MTVIE(),
4535
4536                 GenericIE()
4537         ]
4538
4539 def _real_main():
4540         parser, opts, args = parseOpts()
4541
4542         # Open appropriate CookieJar
4543         if opts.cookiefile is None:
4544                 jar = cookielib.CookieJar()
4545         else:
4546                 try:
4547                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4548                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4549                                 jar.load()
4550                 except (IOError, OSError), err:
4551                         sys.exit(u'ERROR: unable to open cookie file')
4552
4553         # Dump user agent
4554         if opts.dump_user_agent:
4555                 print std_headers['User-Agent']
4556                 sys.exit(0)
4557
4558         # Batch file verification
4559         batchurls = []
4560         if opts.batchfile is not None:
4561                 try:
4562                         if opts.batchfile == '-':
4563                                 batchfd = sys.stdin
4564                         else:
4565                                 batchfd = open(opts.batchfile, 'r')
4566                         batchurls = batchfd.readlines()
4567                         batchurls = [x.strip() for x in batchurls]
4568                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4569                 except IOError:
4570                         sys.exit(u'ERROR: batch file could not be read')
4571         all_urls = batchurls + args
4572         all_urls = map(lambda url: url.strip(), all_urls)
4573
4574         # General configuration
4575         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4576         proxy_handler = urllib2.ProxyHandler()
4577         opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4578         urllib2.install_opener(opener)
4579         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4580
4581         if opts.verbose:
4582                 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4583
4584         extractors = gen_extractors()
4585
4586         if opts.list_extractors:
4587                 for ie in extractors:
4588                         print(ie.IE_NAME)
4589                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4590                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4591                         for mu in matchedUrls:
4592                                 print(u'  ' + mu)
4593                 sys.exit(0)
4594
4595         # Conflicting, missing and erroneous options
4596         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4597                 parser.error(u'using .netrc conflicts with giving username/password')
4598         if opts.password is not None and opts.username is None:
4599                 parser.error(u'account username missing')
4600         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4601                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4602         if opts.usetitle and opts.useliteral:
4603                 parser.error(u'using title conflicts with using literal title')
4604         if opts.username is not None and opts.password is None:
4605                 opts.password = getpass.getpass(u'Type account password and press return:')
4606         if opts.ratelimit is not None:
4607                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4608                 if numeric_limit is None:
4609                         parser.error(u'invalid rate limit specified')
4610                 opts.ratelimit = numeric_limit
4611         if opts.retries is not None:
4612                 try:
4613                         opts.retries = long(opts.retries)
4614                 except (TypeError, ValueError), err:
4615                         parser.error(u'invalid retry count specified')
4616         try:
4617                 opts.playliststart = int(opts.playliststart)
4618                 if opts.playliststart <= 0:
4619                         raise ValueError(u'Playlist start must be positive')
4620         except (TypeError, ValueError), err:
4621                 parser.error(u'invalid playlist start number specified')
4622         try:
4623                 opts.playlistend = int(opts.playlistend)
4624                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4625                         raise ValueError(u'Playlist end must be greater than playlist start')
4626         except (TypeError, ValueError), err:
4627                 parser.error(u'invalid playlist end number specified')
4628         if opts.extractaudio:
4629                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4630                         parser.error(u'invalid audio format specified')
4631
4632         # File downloader
4633         fd = FileDownloader({
4634                 'usenetrc': opts.usenetrc,
4635                 'username': opts.username,
4636                 'password': opts.password,
4637                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4638                 'forceurl': opts.geturl,
4639                 'forcetitle': opts.gettitle,
4640                 'forcethumbnail': opts.getthumbnail,
4641                 'forcedescription': opts.getdescription,
4642                 'forcefilename': opts.getfilename,
4643                 'forceformat': opts.getformat,
4644                 'simulate': opts.simulate,
4645                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4646                 'format': opts.format,
4647                 'format_limit': opts.format_limit,
4648                 'listformats': opts.listformats,
4649                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4650                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4651                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4652                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4653                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4654                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4655                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4656                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4657                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4658                         or u'%(id)s.%(ext)s'),
4659                 'ignoreerrors': opts.ignoreerrors,
4660                 'ratelimit': opts.ratelimit,
4661                 'nooverwrites': opts.nooverwrites,
4662                 'retries': opts.retries,
4663                 'continuedl': opts.continue_dl,
4664                 'noprogress': opts.noprogress,
4665                 'playliststart': opts.playliststart,
4666                 'playlistend': opts.playlistend,
4667                 'logtostderr': opts.outtmpl == '-',
4668                 'consoletitle': opts.consoletitle,
4669                 'nopart': opts.nopart,
4670                 'updatetime': opts.updatetime,
4671                 'writedescription': opts.writedescription,
4672                 'writeinfojson': opts.writeinfojson,
4673                 'writesubtitles': opts.writesubtitles,
4674                 'subtitleslang': opts.subtitleslang,
4675                 'matchtitle': opts.matchtitle,
4676                 'rejecttitle': opts.rejecttitle,
4677                 'max_downloads': opts.max_downloads,
4678                 'prefer_free_formats': opts.prefer_free_formats,
4679                 'verbose': opts.verbose,
4680                 })
4681         for extractor in extractors:
4682                 fd.add_info_extractor(extractor)
4683
4684         # PostProcessors
4685         if opts.extractaudio:
4686                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4687
4688         # Update version
4689         if opts.update_self:
4690                 updateSelf(fd, sys.argv[0])
4691
4692         # Maybe do nothing
4693         if len(all_urls) < 1:
4694                 if not opts.update_self:
4695                         parser.error(u'you must provide at least one URL')
4696                 else:
4697                         sys.exit()
4698
4699         try:
4700                 retcode = fd.download(all_urls)
4701         except MaxDownloadsReached:
4702                 fd.to_screen(u'--max-download limit reached, aborting.')
4703                 retcode = 101
4704
4705         # Dump cookie jar if requested
4706         if opts.cookiefile is not None:
4707                 try:
4708                         jar.save()
4709                 except (IOError, OSError), err:
4710                         sys.exit(u'ERROR: unable to save cookie jar')
4711
4712         sys.exit(retcode)
4713
4714 def main():
4715         try:
4716                 _real_main()
4717         except DownloadError:
4718                 sys.exit(1)
4719         except SameFileError:
4720                 sys.exit(u'ERROR: fixed output name but more than one file to download')
4721         except KeyboardInterrupt:
4722                 sys.exit(u'\nERROR: Interrupted by user')
4723
4724 if __name__ == '__main__':
4725         main()
4726
4727 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: