_ Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 __authors__  = (
   5         'Ricardo Garcia Gonzalez',
   6         'Danny Colligan',
   7         'Benjamin Johnson',
   8         'Vasyl\' Vavrychuk',
   9         'Witold Baryluk',
  10         'Paweł Paprota',
  11         'Gergely Imreh',
  12         'Rogério Brito',
  13         'Philipp Hagemeister',
  14         'Sören Schulze',
  15         'Kevin Ngo',
  16         'Ori Avtalion',
  17         'shizeeg',
  18         )
  19
  20 __license__ = 'Public Domain'
  21 __version__ = '2012.02.27'
  22
  23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
  24
  25
  26 import cookielib
  27 import datetime
  28 import getpass
  29 import gzip
  30 import htmlentitydefs
  31 import HTMLParser
  32 import httplib
  33 import locale
  34 import math
  35 import netrc
  36 import optparse
  37 import os
  38 import os.path
  39 import re
  40 import shlex
  41 import socket
  42 import string
  43 import subprocess
  44 import sys
  45 import time
  46 import urllib
  47 import urllib2
  48 import warnings
  49 import zlib
  50
  51 if os.name == 'nt':
  52         import ctypes
  53
  54 try:
  55         import email.utils
  56 except ImportError: # Python 2.4
  57         import email.Utils
  58 try:
  59         import cStringIO as StringIO
  60 except ImportError:
  61         import StringIO
  62
  63 # parse_qs was moved from the cgi module to the urlparse module recently.
  64 try:
  65         from urlparse import parse_qs
  66 except ImportError:
  67         from cgi import parse_qs
  68
  69 try:
  70         import lxml.etree
  71 except ImportError:
  72         pass # Handled below
  73
  74 try:
  75         import xml.etree.ElementTree
  76 except ImportError: # Python<2.5: Not officially supported, but let it slip
  77         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
  78
  79 std_headers = {
  80         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
  81         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  82         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  83         'Accept-Encoding': 'gzip, deflate',
  84         'Accept-Language': 'en-us,en;q=0.5',
  85 }
  86
  87 try:
  88         import json
  89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
  90         import re
  91         class json(object):
  92                 @staticmethod
  93                 def loads(s):
  94                         s = s.decode('UTF-8')
  95                         def raiseError(msg, i):
  96                                 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
  97                         def skipSpace(i, expectMore=True):
  98                                 while i < len(s) and s[i] in ' \t\r\n':
  99                                         i += 1
 100                                 if expectMore:
 101                                         if i >= len(s):
 102                                                 raiseError('Premature end', i)
 103                                 return i
 104                         def decodeEscape(match):
 105                                 esc = match.group(1)
 106                                 _STATIC = {
 107                                         '"': '"',
 108                                         '\\': '\\',
 109                                         '/': '/',
 110                                         'b': unichr(0x8),
 111                                         'f': unichr(0xc),
 112                                         'n': '\n',
 113                                         'r': '\r',
 114                                         't': '\t',
 115                                 }
 116                                 if esc in _STATIC:
 117                                         return _STATIC[esc]
 118                                 if esc[0] == 'u':
 119                                         if len(esc) == 1+4:
 120                                                 return unichr(int(esc[1:5], 16))
 121                                         if len(esc) == 5+6 and esc[5:7] == '\\u':
 122                                                 hi = int(esc[1:5], 16)
 123                                                 low = int(esc[7:11], 16)
 124                                                 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
 125                                 raise ValueError('Unknown escape ' + str(esc))
 126                         def parseString(i):
 127                                 i += 1
 128                                 e = i
 129                                 while True:
 130                                         e = s.index('"', e)
 131                                         bslashes = 0
 132                                         while s[e-bslashes-1] == '\\':
 133                                                 bslashes += 1
 134                                         if bslashes % 2 == 1:
 135                                                 e += 1
 136                                                 continue
 137                                         break
 138                                 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
 139                                 stri = rexp.sub(decodeEscape, s[i:e])
 140                                 return (e+1,stri)
 141                         def parseObj(i):
 142                                 i += 1
 143                                 res = {}
 144                                 i = skipSpace(i)
 145                                 if s[i] == '}': # Empty dictionary
 146                                         return (i+1,res)
 147                                 while True:
 148                                         if s[i] != '"':
 149                                                 raiseError('Expected a string object key', i)
 150                                         i,key = parseString(i)
 151                                         i = skipSpace(i)
 152                                         if i >= len(s) or s[i] != ':':
 153                                                 raiseError('Expected a colon', i)
 154                                         i,val = parse(i+1)
 155                                         res[key] = val
 156                                         i = skipSpace(i)
 157                                         if s[i] == '}':
 158                                                 return (i+1, res)
 159                                         if s[i] != ',':
 160                                                 raiseError('Expected comma or closing curly brace', i)
 161                                         i = skipSpace(i+1)
 162                         def parseArray(i):
 163                                 res = []
 164                                 i = skipSpace(i+1)
 165                                 if s[i] == ']': # Empty array
 166                                         return (i+1,res)
 167                                 while True:
 168                                         i,val = parse(i)
 169                                         res.append(val)
 170                                         i = skipSpace(i) # Raise exception if premature end
 171                                         if s[i] == ']':
 172                                                 return (i+1, res)
 173                                         if s[i] != ',':
 174                                                 raiseError('Expected a comma or closing bracket', i)
 175                                         i = skipSpace(i+1)
 176                         def parseDiscrete(i):
 177                                 for k,v in {'true': True, 'false': False, 'null': None}.items():
 178                                         if s.startswith(k, i):
 179                                                 return (i+len(k), v)
 180                                 raiseError('Not a boolean (or null)', i)
 181                         def parseNumber(i):
 182                                 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
 183                                 if mobj is None:
 184                                         raiseError('Not a number', i)
 185                                 nums = mobj.group(1)
 186                                 if '.' in nums or 'e' in nums or 'E' in nums:
 187                                         return (i+len(nums), float(nums))
 188                                 return (i+len(nums), int(nums))
 189                         CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
 190                         def parse(i):
 191                                 i = skipSpace(i)
 192                                 i,res = CHARMAP.get(s[i], parseNumber)(i)
 193                                 i = skipSpace(i, False)
 194                                 return (i,res)
 195                         i,res = parse(0)
 196                         if i < len(s):
 197                                 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
 198                         return res
 199
 200 def preferredencoding():
 201         """Get preferred encoding.
 202
 203         Returns the best encoding scheme for the system, based on
 204         locale.getpreferredencoding() and some further tweaks.
 205         """
 206         def yield_preferredencoding():
 207                 try:
 208                         pref = locale.getpreferredencoding()
 209                         u'TEST'.encode(pref)
 210                 except:
 211                         pref = 'UTF-8'
 212                 while True:
 213                         yield pref
 214         return yield_preferredencoding().next()
 215
 216
 217 def htmlentity_transform(matchobj):
 218         """Transforms an HTML entity to a Unicode character.
 219
 220         This function receives a match object and is intended to be used with
 221         the re.sub() function.
 222         """
 223         entity = matchobj.group(1)
 224
 225         # Known non-numeric HTML entity
 226         if entity in htmlentitydefs.name2codepoint:
 227                 return unichr(htmlentitydefs.name2codepoint[entity])
 228
 229         # Unicode character
 230         mobj = re.match(ur'(?u)#(x?\d+)', entity)
 231         if mobj is not None:
 232                 numstr = mobj.group(1)
 233                 if numstr.startswith(u'x'):
 234                         base = 16
 235                         numstr = u'0%s' % numstr
 236                 else:
 237                         base = 10
 238                 return unichr(long(numstr, base))
 239
 240         # Unknown entity in name, return its literal representation
 241         return (u'&%s;' % entity)
 242
 243
 244 def sanitize_title(utitle):
 245         """Sanitizes a video title so it could be used as part of a filename."""
 246         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 247         return utitle.replace(unicode(os.sep), u'%')
 248
 249
 250 def sanitize_open(filename, open_mode):
 251         """Try to open the given filename, and slightly tweak it if this fails.
 252
 253         Attempts to open the given filename. If this fails, it tries to change
 254         the filename slightly, step by step, until it's either able to open it
 255         or it fails and raises a final exception, like the standard open()
 256         function.
 257
 258         It returns the tuple (stream, definitive_file_name).
 259         """
 260         try:
 261                 if filename == u'-':
 262                         if sys.platform == 'win32':
 263                                 import msvcrt
 264                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 265                         return (sys.stdout, filename)
 266                 stream = open(_encodeFilename(filename), open_mode)
 267                 return (stream, filename)
 268         except (IOError, OSError), err:
 269                 # In case of error, try to remove win32 forbidden chars
 270                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 271
 272                 # An exception here should be caught in the caller
 273                 stream = open(_encodeFilename(filename), open_mode)
 274                 return (stream, filename)
 275
 276
 277 def timeconvert(timestr):
 278         """Convert RFC 2822 defined time string into system timestamp"""
 279         timestamp = None
 280         timetuple = email.utils.parsedate_tz(timestr)
 281         if timetuple is not None:
 282                 timestamp = email.utils.mktime_tz(timetuple)
 283         return timestamp
 284
 285 def _simplify_title(title):
 286         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
 287         return expr.sub(u'_', title).strip(u'_')
 288
 289 def _orderedSet(iterable):
 290         """ Remove all duplicates from the input iterable """
 291         res = []
 292         for el in iterable:
 293                 if el not in res:
 294                         res.append(el)
 295         return res
 296
 297 def _unescapeHTML(s):
 298         """
 299         @param s a string (of type unicode)
 300         """
 301         assert type(s) == type(u'')
 302
 303         htmlParser = HTMLParser.HTMLParser()
 304         return htmlParser.unescape(s)
 305
 306 def _encodeFilename(s):
 307         """
 308         @param s The name of the file (of type unicode)
 309         """
 310
 311         assert type(s) == type(u'')
 312
 313         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
 314                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 315                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 316                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 317                 return s
 318         else:
 319                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 320
 321 class DownloadError(Exception):
 322         """Download Error exception.
 323
 324         This exception may be thrown by FileDownloader objects if they are not
 325         configured to continue on errors. They will contain the appropriate
 326         error message.
 327         """
 328         pass
 329
 330
 331 class SameFileError(Exception):
 332         """Same File exception.
 333
 334         This exception will be thrown by FileDownloader objects if they detect
 335         multiple files would have to be downloaded to the same file on disk.
 336         """
 337         pass
 338
 339
 340 class PostProcessingError(Exception):
 341         """Post Processing exception.
 342
 343         This exception may be raised by PostProcessor's .run() method to
 344         indicate an error in the postprocessing task.
 345         """
 346         pass
 347
 348 class MaxDownloadsReached(Exception):
 349         """ --max-downloads limit has been reached. """
 350         pass
 351
 352
 353 class UnavailableVideoError(Exception):
 354         """Unavailable Format exception.
 355
 356         This exception will be thrown when a video is requested
 357         in a format that is not available for that video.
 358         """
 359         pass
 360
 361
 362 class ContentTooShortError(Exception):
 363         """Content Too Short exception.
 364
 365         This exception may be raised by FileDownloader objects when a file they
 366         download is too small for what the server announced first, indicating
 367         the connection was probably interrupted.
 368         """
 369         # Both in bytes
 370         downloaded = None
 371         expected = None
 372
 373         def __init__(self, downloaded, expected):
 374                 self.downloaded = downloaded
 375                 self.expected = expected
 376
 377
 378 class YoutubeDLHandler(urllib2.HTTPHandler):
 379         """Handler for HTTP requests and responses.
 380
 381         This class, when installed with an OpenerDirector, automatically adds
 382         the standard headers to every HTTP request and handles gzipped and
 383         deflated responses from web servers. If compression is to be avoided in
 384         a particular request, the original request in the program code only has
 385         to include the HTTP header "Youtubedl-No-Compression", which will be
 386         removed before making the real request.
 387
 388         Part of this code was copied from:
 389
 390         http://techknack.net/python-urllib2-handlers/
 391
 392         Andrew Rowls, the author of that code, agreed to release it to the
 393         public domain.
 394         """
 395
 396         @staticmethod
 397         def deflate(data):
 398                 try:
 399                         return zlib.decompress(data, -zlib.MAX_WBITS)
 400                 except zlib.error:
 401                         return zlib.decompress(data)
 402
 403         @staticmethod
 404         def addinfourl_wrapper(stream, headers, url, code):
 405                 if hasattr(urllib2.addinfourl, 'getcode'):
 406                         return urllib2.addinfourl(stream, headers, url, code)
 407                 ret = urllib2.addinfourl(stream, headers, url)
 408                 ret.code = code
 409                 return ret
 410
 411         def http_request(self, req):
 412                 for h in std_headers:
 413                         if h in req.headers:
 414                                 del req.headers[h]
 415                         req.add_header(h, std_headers[h])
 416                 if 'Youtubedl-no-compression' in req.headers:
 417                         if 'Accept-encoding' in req.headers:
 418                                 del req.headers['Accept-encoding']
 419                         del req.headers['Youtubedl-no-compression']
 420                 return req
 421
 422         def http_response(self, req, resp):
 423                 old_resp = resp
 424                 # gzip
 425                 if resp.headers.get('Content-encoding', '') == 'gzip':
 426                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 427                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 428                         resp.msg = old_resp.msg
 429                 # deflate
 430                 if resp.headers.get('Content-encoding', '') == 'deflate':
 431                         gz = StringIO.StringIO(self.deflate(resp.read()))
 432                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 433                         resp.msg = old_resp.msg
 434                 return resp
 435
 436
 437 class FileDownloader(object):
 438         """File Downloader class.
 439
 440         File downloader objects are the ones responsible of downloading the
 441         actual video file and writing it to disk if the user has requested
 442         it, among some other tasks. In most cases there should be one per
 443         program. As, given a video URL, the downloader doesn't know how to
 444         extract all the needed information, task that InfoExtractors do, it
 445         has to pass the URL to one of them.
 446
 447         For this, file downloader objects have a method that allows
 448         InfoExtractors to be registered in a given order. When it is passed
 449         a URL, the file downloader handles it to the first InfoExtractor it
 450         finds that reports being able to handle it. The InfoExtractor extracts
 451         all the information about the video or videos the URL refers to, and
 452         asks the FileDownloader to process the video information, possibly
 453         downloading the video.
 454
 455         File downloaders accept a lot of parameters. In order not to saturate
 456         the object constructor with arguments, it receives a dictionary of
 457         options instead. These options are available through the params
 458         attribute for the InfoExtractors to use. The FileDownloader also
 459         registers itself as the downloader in charge for the InfoExtractors
 460         that are added to it, so this is a "mutual registration".
 461
 462         Available options:
 463
 464         username:         Username for authentication purposes.
 465         password:         Password for authentication purposes.
 466         usenetrc:         Use netrc for authentication instead.
 467         quiet:            Do not print messages to stdout.
 468         forceurl:         Force printing final URL.
 469         forcetitle:       Force printing title.
 470         forcethumbnail:   Force printing thumbnail URL.
 471         forcedescription: Force printing description.
 472         forcefilename:    Force printing final filename.
 473         simulate:         Do not download the video files.
 474         format:           Video format code.
 475         format_limit:     Highest quality format to try.
 476         outtmpl:          Template for output names.
 477         ignoreerrors:     Do not stop on download errors.
 478         ratelimit:        Download speed limit, in bytes/sec.
 479         nooverwrites:     Prevent overwriting files.
 480         retries:          Number of times to retry for HTTP error 5xx
 481         continuedl:       Try to continue downloads if possible.
 482         noprogress:       Do not print the progress bar.
 483         playliststart:    Playlist item to start at.
 484         playlistend:      Playlist item to end at.
 485         matchtitle:       Download only matching titles.
 486         rejecttitle:      Reject downloads for matching titles.
 487         logtostderr:      Log messages to stderr instead of stdout.
 488         consoletitle:     Display progress in console window's titlebar.
 489         nopart:           Do not use temporary .part files.
 490         updatetime:       Use the Last-modified header to set output file timestamps.
 491         writedescription: Write the video description to a .description file
 492         writeinfojson:    Write the video description to a .info.json file
 493         writesubtitles:   Write the video subtitles to a .srt file
 494         subtitleslang:    Language of the subtitles to download
 495         """
 496
 497         params = None
 498         _ies = []
 499         _pps = []
 500         _download_retcode = None
 501         _num_downloads = None
 502         _screen_file = None
 503
 504         def __init__(self, params):
 505                 """Create a FileDownloader object with the given options."""
 506                 self._ies = []
 507                 self._pps = []
 508                 self._download_retcode = 0
 509                 self._num_downloads = 0
 510                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 511                 self.params = params
 512
 513         @staticmethod
 514         def format_bytes(bytes):
 515                 if bytes is None:
 516                         return 'N/A'
 517                 if type(bytes) is str:
 518                         bytes = float(bytes)
 519                 if bytes == 0.0:
 520                         exponent = 0
 521                 else:
 522                         exponent = long(math.log(bytes, 1024.0))
 523                 suffix = 'bkMGTPEZY'[exponent]
 524                 converted = float(bytes) / float(1024 ** exponent)
 525                 return '%.2f%s' % (converted, suffix)
 526
 527         @staticmethod
 528         def calc_percent(byte_counter, data_len):
 529                 if data_len is None:
 530                         return '---.-%'
 531                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 532
 533         @staticmethod
 534         def calc_eta(start, now, total, current):
 535                 if total is None:
 536                         return '--:--'
 537                 dif = now - start
 538                 if current == 0 or dif < 0.001: # One millisecond
 539                         return '--:--'
 540                 rate = float(current) / dif
 541                 eta = long((float(total) - float(current)) / rate)
 542                 (eta_mins, eta_secs) = divmod(eta, 60)
 543                 if eta_mins > 99:
 544                         return '--:--'
 545                 return '%02d:%02d' % (eta_mins, eta_secs)
 546
 547         @staticmethod
 548         def calc_speed(start, now, bytes):
 549                 dif = now - start
 550                 if bytes == 0 or dif < 0.001: # One millisecond
 551                         return '%10s' % '---b/s'
 552                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 553
 554         @staticmethod
 555         def best_block_size(elapsed_time, bytes):
 556                 new_min = max(bytes / 2.0, 1.0)
 557                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 558                 if elapsed_time < 0.001:
 559                         return long(new_max)
 560                 rate = bytes / elapsed_time
 561                 if rate > new_max:
 562                         return long(new_max)
 563                 if rate < new_min:
 564                         return long(new_min)
 565                 return long(rate)
 566
 567         @staticmethod
 568         def parse_bytes(bytestr):
 569                 """Parse a string indicating a byte quantity into a long integer."""
 570                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 571                 if matchobj is None:
 572                         return None
 573                 number = float(matchobj.group(1))
 574                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 575                 return long(round(number * multiplier))
 576
 577         def add_info_extractor(self, ie):
 578                 """Add an InfoExtractor object to the end of the list."""
 579                 self._ies.append(ie)
 580                 ie.set_downloader(self)
 581
 582         def add_post_processor(self, pp):
 583                 """Add a PostProcessor object to the end of the chain."""
 584                 self._pps.append(pp)
 585                 pp.set_downloader(self)
 586
 587         def to_screen(self, message, skip_eol=False):
 588                 """Print message to stdout if not in quiet mode."""
 589                 assert type(message) == type(u'')
 590                 if not self.params.get('quiet', False):
 591                         terminator = [u'\n', u''][skip_eol]
 592                         output = message + terminator
 593
 594                         if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
 595                                 output = output.encode(preferredencoding(), 'ignore')
 596                         self._screen_file.write(output)
 597                         self._screen_file.flush()
 598
 599         def to_stderr(self, message):
 600                 """Print message to stderr."""
 601                 print >>sys.stderr, message.encode(preferredencoding())
 602
 603         def to_cons_title(self, message):
 604                 """Set console/terminal window title to message."""
 605                 if not self.params.get('consoletitle', False):
 606                         return
 607                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 608                         # c_wchar_p() might not be necessary if `message` is
 609                         # already of type unicode()
 610                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 611                 elif 'TERM' in os.environ:
 612                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 613
 614         def fixed_template(self):
 615                 """Checks if the output template is fixed."""
 616                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 617
 618         def trouble(self, message=None):
 619                 """Determine action to take when a download problem appears.
 620
 621                 Depending on if the downloader has been configured to ignore
 622                 download errors or not, this method may throw an exception or
 623                 not when errors are found, after printing the message.
 624                 """
 625                 if message is not None:
 626                         self.to_stderr(message)
 627                 if not self.params.get('ignoreerrors', False):
 628                         raise DownloadError(message)
 629                 self._download_retcode = 1
 630
 631         def slow_down(self, start_time, byte_counter):
 632                 """Sleep if the download speed is over the rate limit."""
 633                 rate_limit = self.params.get('ratelimit', None)
 634                 if rate_limit is None or byte_counter == 0:
 635                         return
 636                 now = time.time()
 637                 elapsed = now - start_time
 638                 if elapsed <= 0.0:
 639                         return
 640                 speed = float(byte_counter) / elapsed
 641                 if speed > rate_limit:
 642                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 643
 644         def temp_name(self, filename):
 645                 """Returns a temporary filename for the given filename."""
 646                 if self.params.get('nopart', False) or filename == u'-' or \
 647                                 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
 648                         return filename
 649                 return filename + u'.part'
 650
 651         def undo_temp_name(self, filename):
 652                 if filename.endswith(u'.part'):
 653                         return filename[:-len(u'.part')]
 654                 return filename
 655
 656         def try_rename(self, old_filename, new_filename):
 657                 try:
 658                         if old_filename == new_filename:
 659                                 return
 660                         os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
 661                 except (IOError, OSError), err:
 662                         self.trouble(u'ERROR: unable to rename file')
 663
 664         def try_utime(self, filename, last_modified_hdr):
 665                 """Try to set the last-modified time of the given file."""
 666                 if last_modified_hdr is None:
 667                         return
 668                 if not os.path.isfile(_encodeFilename(filename)):
 669                         return
 670                 timestr = last_modified_hdr
 671                 if timestr is None:
 672                         return
 673                 filetime = timeconvert(timestr)
 674                 if filetime is None:
 675                         return filetime
 676                 try:
 677                         os.utime(filename, (time.time(), filetime))
 678                 except:
 679                         pass
 680                 return filetime
 681
 682         def report_writedescription(self, descfn):
 683                 """ Report that the description file is being written """
 684                 self.to_screen(u'[info] Writing video description to: ' + descfn)
 685
 686         def report_writesubtitles(self, srtfn):
 687                 """ Report that the subtitles file is being written """
 688                 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
 689
 690         def report_writeinfojson(self, infofn):
 691                 """ Report that the metadata file has been written """
 692                 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
 693
 694         def report_destination(self, filename):
 695                 """Report destination filename."""
 696                 self.to_screen(u'[download] Destination: ' + filename)
 697
 698         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 699                 """Report download progress."""
 700                 if self.params.get('noprogress', False):
 701                         return
 702                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 703                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 704                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 705                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 706
 707         def report_resuming_byte(self, resume_len):
 708                 """Report attempt to resume at given byte."""
 709                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 710
 711         def report_retry(self, count, retries):
 712                 """Report retry in case of HTTP error 5xx"""
 713                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 714
 715         def report_file_already_downloaded(self, file_name):
 716                 """Report file has already been fully downloaded."""
 717                 try:
 718                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 719                 except (UnicodeEncodeError), err:
 720                         self.to_screen(u'[download] The file has already been downloaded')
 721
 722         def report_unable_to_resume(self):
 723                 """Report it was impossible to resume download."""
 724                 self.to_screen(u'[download] Unable to resume')
 725
 726         def report_finish(self):
 727                 """Report download finished."""
 728                 if self.params.get('noprogress', False):
 729                         self.to_screen(u'[download] Download completed')
 730                 else:
 731                         self.to_screen(u'')
 732
 733         def increment_downloads(self):
 734                 """Increment the ordinal that assigns a number to each file."""
 735                 self._num_downloads += 1
 736
 737         def prepare_filename(self, info_dict):
 738                 """Generate the output filename."""
 739                 try:
 740                         template_dict = dict(info_dict)
 741                         template_dict['epoch'] = unicode(long(time.time()))
 742                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 743                         filename = self.params['outtmpl'] % template_dict
 744                         return filename
 745                 except (ValueError, KeyError), err:
 746                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 747                         return None
 748
 749         def _match_entry(self, info_dict):
 750                 """ Returns None iff the file should be downloaded """
 751
 752                 title = info_dict['title']
 753                 matchtitle = self.params.get('matchtitle', False)
 754                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
 755                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
 756                 rejecttitle = self.params.get('rejecttitle', False)
 757                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
 758                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 759                 return None
 760
 761         def process_info(self, info_dict):
 762                 """Process a single dictionary returned by an InfoExtractor."""
 763
 764                 reason = self._match_entry(info_dict)
 765                 if reason is not None:
 766                         self.to_screen(u'[download] ' + reason)
 767                         return
 768
 769                 max_downloads = self.params.get('max_downloads')
 770                 if max_downloads is not None:
 771                         if self._num_downloads > int(max_downloads):
 772                                 raise MaxDownloadsReached()
 773
 774                 filename = self.prepare_filename(info_dict)
 775
 776                 # Forced printings
 777                 if self.params.get('forcetitle', False):
 778                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 779                 if self.params.get('forceurl', False):
 780                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 781                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 782                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 783                 if self.params.get('forcedescription', False) and 'description' in info_dict:
 784                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 785                 if self.params.get('forcefilename', False) and filename is not None:
 786                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 787                 if self.params.get('forceformat', False):
 788                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
 789
 790                 # Do nothing else if in simulate mode
 791                 if self.params.get('simulate', False):
 792                         return
 793
 794                 if filename is None:
 795                         return
 796
 797                 try:
 798                         dn = os.path.dirname(_encodeFilename(filename))
 799                         if dn != '' and not os.path.exists(dn): # dn is already encoded
 800                                 os.makedirs(dn)
 801                 except (OSError, IOError), err:
 802                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
 803                         return
 804
 805                 if self.params.get('writedescription', False):
 806                         try:
 807                                 descfn = filename + u'.description'
 808                                 self.report_writedescription(descfn)
 809                                 descfile = open(_encodeFilename(descfn), 'wb')
 810                                 try:
 811                                         descfile.write(info_dict['description'].encode('utf-8'))
 812                                 finally:
 813                                         descfile.close()
 814                         except (OSError, IOError):
 815                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
 816                                 return
 817
 818                 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
 819                         # subtitles download errors are already managed as troubles in relevant IE
 820                         # that way it will silently go on when used with unsupporting IE
 821                         try:
 822                                 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
 823                                 self.report_writesubtitles(srtfn)
 824                                 srtfile = open(_encodeFilename(srtfn), 'wb')
 825                                 try:
 826                                         srtfile.write(info_dict['subtitles'].encode('utf-8'))
 827                                 finally:
 828                                         srtfile.close()
 829                         except (OSError, IOError):
 830                                 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
 831                                 return
 832
 833                 if self.params.get('writeinfojson', False):
 834                         infofn = filename + u'.info.json'
 835                         self.report_writeinfojson(infofn)
 836                         try:
 837                                 json.dump
 838                         except (NameError,AttributeError):
 839                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
 840                                 return
 841                         try:
 842                                 infof = open(_encodeFilename(infofn), 'wb')
 843                                 try:
 844                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
 845                                         json.dump(json_info_dict, infof)
 846                                 finally:
 847                                         infof.close()
 848                         except (OSError, IOError):
 849                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
 850                                 return
 851
 852                 if not self.params.get('skip_download', False):
 853                         if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
 854                                 success = True
 855                         else:
 856                                 try:
 857                                         success = self._do_download(filename, info_dict)
 858                                 except (OSError, IOError), err:
 859                                         raise UnavailableVideoError
 860                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 861                                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 862                                         return
 863                                 except (ContentTooShortError, ), err:
 864                                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 865                                         return
 866
 867                         if success:
 868                                 try:
 869                                         self.post_process(filename, info_dict)
 870                                 except (PostProcessingError), err:
 871                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
 872                                         return
 873
 874         def download(self, url_list):
 875                 """Download a given list of URLs."""
 876                 if len(url_list) > 1 and self.fixed_template():
 877                         raise SameFileError(self.params['outtmpl'])
 878
 879                 for url in url_list:
 880                         suitable_found = False
 881                         for ie in self._ies:
 882                                 # Go to next InfoExtractor if not suitable
 883                                 if not ie.suitable(url):
 884                                         continue
 885
 886                                 # Suitable InfoExtractor found
 887                                 suitable_found = True
 888
 889                                 # Extract information from URL and process it
 890                                 ie.extract(url)
 891
 892                                 # Suitable InfoExtractor had been found; go to next URL
 893                                 break
 894
 895                         if not suitable_found:
 896                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 897
 898                 return self._download_retcode
 899
 900         def post_process(self, filename, ie_info):
 901                 """Run the postprocessing chain on the given file."""
 902                 info = dict(ie_info)
 903                 info['filepath'] = filename
 904                 for pp in self._pps:
 905                         info = pp.run(info)
 906                         if info is None:
 907                                 break
 908
 909         def _download_with_rtmpdump(self, filename, url, player_url):
 910                 self.report_destination(filename)
 911                 tmpfilename = self.temp_name(filename)
 912
 913                 # Check for rtmpdump first
 914                 try:
 915                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 916                 except (OSError, IOError):
 917                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 918                         return False
 919
 920                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 921                 # the connection was interrumpted and resuming appears to be
 922                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 923                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 924                 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
 925                 if self.params.get('verbose', False):
 926                         try:
 927                                 import pipes
 928                                 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
 929                         except ImportError:
 930                                 shell_quote = repr
 931                         self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
 932                 retval = subprocess.call(args)
 933                 while retval == 2 or retval == 1:
 934                         prevsize = os.path.getsize(_encodeFilename(tmpfilename))
 935                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 936                         time.sleep(5.0) # This seems to be needed
 937                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 938                         cursize = os.path.getsize(_encodeFilename(tmpfilename))
 939                         if prevsize == cursize and retval == 1:
 940                                 break
 941                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
 942                         if prevsize == cursize and retval == 2 and cursize > 1024:
 943                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
 944                                 retval = 0
 945                                 break
 946                 if retval == 0:
 947                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
 948                         self.try_rename(tmpfilename, filename)
 949                         return True
 950                 else:
 951                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 952                         return False
 953
 954         def _do_download(self, filename, info_dict):
 955                 url = info_dict['url']
 956                 player_url = info_dict.get('player_url', None)
 957
 958                 # Check file already present
 959                 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
 960                         self.report_file_already_downloaded(filename)
 961                         return True
 962
 963                 # Attempt to download using rtmpdump
 964                 if url.startswith('rtmp'):
 965                         return self._download_with_rtmpdump(filename, url, player_url)
 966
 967                 tmpfilename = self.temp_name(filename)
 968                 stream = None
 969
 970                 # Do not include the Accept-Encoding header
 971                 headers = {'Youtubedl-no-compression': 'True'}
 972                 basic_request = urllib2.Request(url, None, headers)
 973                 request = urllib2.Request(url, None, headers)
 974
 975                 # Establish possible resume length
 976                 if os.path.isfile(_encodeFilename(tmpfilename)):
 977                         resume_len = os.path.getsize(_encodeFilename(tmpfilename))
 978                 else:
 979                         resume_len = 0
 980
 981                 open_mode = 'wb'
 982                 if resume_len != 0:
 983                         if self.params.get('continuedl', False):
 984                                 self.report_resuming_byte(resume_len)
 985                                 request.add_header('Range','bytes=%d-' % resume_len)
 986                                 open_mode = 'ab'
 987                         else:
 988                                 resume_len = 0
 989
 990                 count = 0
 991                 retries = self.params.get('retries', 0)
 992                 while count <= retries:
 993                         # Establish connection
 994                         try:
 995                                 if count == 0 and 'urlhandle' in info_dict:
 996                                         data = info_dict['urlhandle']
 997                                 data = urllib2.urlopen(request)
 998                                 break
 999                         except (urllib2.HTTPError, ), err:
1000                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
1001                                         # Unexpected HTTP error
1002                                         raise
1003                                 elif err.code == 416:
1004                                         # Unable to resume (requested range not satisfiable)
1005                                         try:
1006                                                 # Open the connection again without the range header
1007                                                 data = urllib2.urlopen(basic_request)
1008                                                 content_length = data.info()['Content-Length']
1009                                         except (urllib2.HTTPError, ), err:
1010                                                 if err.code < 500 or err.code >= 600:
1011                                                         raise
1012                                         else:
1013                                                 # Examine the reported length
1014                                                 if (content_length is not None and
1015                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
1016                                                         # The file had already been fully downloaded.
1017                                                         # Explanation to the above condition: in issue #175 it was revealed that
1018                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
1019                                                         # changing the file size slightly and causing problems for some users. So
1020                                                         # I decided to implement a suggested change and consider the file
1021                                                         # completely downloaded if the file size differs less than 100 bytes from
1022                                                         # the one in the hard drive.
1023                                                         self.report_file_already_downloaded(filename)
1024                                                         self.try_rename(tmpfilename, filename)
1025                                                         return True
1026                                                 else:
1027                                                         # The length does not match, we start the download over
1028                                                         self.report_unable_to_resume()
1029                                                         open_mode = 'wb'
1030                                                         break
1031                         # Retry
1032                         count += 1
1033                         if count <= retries:
1034                                 self.report_retry(count, retries)
1035
1036                 if count > retries:
1037                         self.trouble(u'ERROR: giving up after %s retries' % retries)
1038                         return False
1039
1040                 data_len = data.info().get('Content-length', None)
1041                 if data_len is not None:
1042                         data_len = long(data_len) + resume_len
1043                 data_len_str = self.format_bytes(data_len)
1044                 byte_counter = 0 + resume_len
1045                 block_size = 1024
1046                 start = time.time()
1047                 while True:
1048                         # Download and write
1049                         before = time.time()
1050                         data_block = data.read(block_size)
1051                         after = time.time()
1052                         if len(data_block) == 0:
1053                                 break
1054                         byte_counter += len(data_block)
1055
1056                         # Open file just in time
1057                         if stream is None:
1058                                 try:
1059                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1060                                         assert stream is not None
1061                                         filename = self.undo_temp_name(tmpfilename)
1062                                         self.report_destination(filename)
1063                                 except (OSError, IOError), err:
1064                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1065                                         return False
1066                         try:
1067                                 stream.write(data_block)
1068                         except (IOError, OSError), err:
1069                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1070                                 return False
1071                         block_size = self.best_block_size(after - before, len(data_block))
1072
1073                         # Progress message
1074                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1075                         if data_len is None:
1076                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1077                         else:
1078                                 percent_str = self.calc_percent(byte_counter, data_len)
1079                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1080                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1081
1082                         # Apply rate limit
1083                         self.slow_down(start, byte_counter - resume_len)
1084
1085                 if stream is None:
1086                         self.trouble(u'\nERROR: Did not get any data blocks')
1087                         return False
1088                 stream.close()
1089                 self.report_finish()
1090                 if data_len is not None and byte_counter != data_len:
1091                         raise ContentTooShortError(byte_counter, long(data_len))
1092                 self.try_rename(tmpfilename, filename)
1093
1094                 # Update file modification time
1095                 if self.params.get('updatetime', True):
1096                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1097
1098                 return True
1099
1100
1101 class InfoExtractor(object):
1102         """Information Extractor class.
1103
1104         Information extractors are the classes that, given a URL, extract
1105         information from the video (or videos) the URL refers to. This
1106         information includes the real video URL, the video title and simplified
1107         title, author and others. The information is stored in a dictionary
1108         which is then passed to the FileDownloader. The FileDownloader
1109         processes this information possibly downloading the video to the file
1110         system, among other possible outcomes. The dictionaries must include
1111         the following fields:
1112
1113         id:             Video identifier.
1114         url:            Final video URL.
1115         uploader:       Nickname of the video uploader.
1116         title:          Literal title.
1117         stitle:         Simplified title.
1118         ext:            Video filename extension.
1119         format:         Video format.
1120         player_url:     SWF Player URL (may be None).
1121
1122         The following fields are optional. Their primary purpose is to allow
1123         youtube-dl to serve as the backend for a video search function, such
1124         as the one in youtube2mp3.  They are only used when their respective
1125         forced printing functions are called:
1126
1127         thumbnail:      Full URL to a video thumbnail image.
1128         description:    One-line video description.
1129
1130         Subclasses of this one should re-define the _real_initialize() and
1131         _real_extract() methods and define a _VALID_URL regexp.
1132         Probably, they should also be added to the list of extractors.
1133         """
1134
1135         _ready = False
1136         _downloader = None
1137
1138         def __init__(self, downloader=None):
1139                 """Constructor. Receives an optional downloader."""
1140                 self._ready = False
1141                 self.set_downloader(downloader)
1142
1143         def suitable(self, url):
1144                 """Receives a URL and returns True if suitable for this IE."""
1145                 return re.match(self._VALID_URL, url) is not None
1146
1147         def initialize(self):
1148                 """Initializes an instance (authentication, etc)."""
1149                 if not self._ready:
1150                         self._real_initialize()
1151                         self._ready = True
1152
1153         def extract(self, url):
1154                 """Extracts URL information and returns it in list of dicts."""
1155                 self.initialize()
1156                 return self._real_extract(url)
1157
1158         def set_downloader(self, downloader):
1159                 """Sets the downloader for this IE."""
1160                 self._downloader = downloader
1161
1162         def _real_initialize(self):
1163                 """Real initialization process. Redefine in subclasses."""
1164                 pass
1165
1166         def _real_extract(self, url):
1167                 """Real extraction process. Redefine in subclasses."""
1168                 pass
1169
1170
1171 class YoutubeIE(InfoExtractor):
1172         """Information extractor for youtube.com."""
1173
1174         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1175         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1176         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1177         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1178         _NETRC_MACHINE = 'youtube'
1179         # Listed in order of quality
1180         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1181         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1182         _video_extensions = {
1183                 '13': '3gp',
1184                 '17': 'mp4',
1185                 '18': 'mp4',
1186                 '22': 'mp4',
1187                 '37': 'mp4',
1188                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1189                 '43': 'webm',
1190                 '44': 'webm',
1191                 '45': 'webm',
1192                 '46': 'webm',
1193         }
1194         _video_dimensions = {
1195                 '5': '240x400',
1196                 '6': '???',
1197                 '13': '???',
1198                 '17': '144x176',
1199                 '18': '360x640',
1200                 '22': '720x1280',
1201                 '34': '360x640',
1202                 '35': '480x854',
1203                 '37': '1080x1920',
1204                 '38': '3072x4096',
1205                 '43': '360x640',
1206                 '44': '480x854',
1207                 '45': '720x1280',
1208                 '46': '1080x1920',
1209         }
1210         IE_NAME = u'youtube'
1211
1212         def report_lang(self):
1213                 """Report attempt to set language."""
1214                 self._downloader.to_screen(u'[youtube] Setting language')
1215
1216         def report_login(self):
1217                 """Report attempt to log in."""
1218                 self._downloader.to_screen(u'[youtube] Logging in')
1219
1220         def report_age_confirmation(self):
1221                 """Report attempt to confirm age."""
1222                 self._downloader.to_screen(u'[youtube] Confirming age')
1223
1224         def report_video_webpage_download(self, video_id):
1225                 """Report attempt to download video webpage."""
1226                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1227
1228         def report_video_info_webpage_download(self, video_id):
1229                 """Report attempt to download video info webpage."""
1230                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1231
1232         def report_video_subtitles_download(self, video_id):
1233                 """Report attempt to download video info webpage."""
1234                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1235
1236         def report_information_extraction(self, video_id):
1237                 """Report attempt to extract video information."""
1238                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1239
1240         def report_unavailable_format(self, video_id, format):
1241                 """Report extracted video URL."""
1242                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1243
1244         def report_rtmp_download(self):
1245                 """Indicate the download will use the RTMP protocol."""
1246                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1247
1248         def _closed_captions_xml_to_srt(self, xml_string):
1249                 srt = ''
1250                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1251                 # TODO parse xml instead of regex
1252                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1253                         if not dur: dur = '4'
1254                         start = float(start)
1255                         end = start + float(dur)
1256                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1257                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1258                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1259                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1260                         srt += str(n) + '\n'
1261                         srt += start + ' --> ' + end + '\n'
1262                         srt += caption + '\n\n'
1263                 return srt
1264
1265         def _print_formats(self, formats):
1266                 print 'Available formats:'
1267                 for x in formats:
1268                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1269
1270         def _real_initialize(self):
1271                 if self._downloader is None:
1272                         return
1273
1274                 username = None
1275                 password = None
1276                 downloader_params = self._downloader.params
1277
1278                 # Attempt to use provided username and password or .netrc data
1279                 if downloader_params.get('username', None) is not None:
1280                         username = downloader_params['username']
1281                         password = downloader_params['password']
1282                 elif downloader_params.get('usenetrc', False):
1283                         try:
1284                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1285                                 if info is not None:
1286                                         username = info[0]
1287                                         password = info[2]
1288                                 else:
1289                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1290                         except (IOError, netrc.NetrcParseError), err:
1291                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1292                                 return
1293
1294                 # Set language
1295                 request = urllib2.Request(self._LANG_URL)
1296                 try:
1297                         self.report_lang()
1298                         urllib2.urlopen(request).read()
1299                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1300                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1301                         return
1302
1303                 # No authentication to be performed
1304                 if username is None:
1305                         return
1306
1307                 # Log in
1308                 login_form = {
1309                                 'current_form': 'loginForm',
1310                                 'next':         '/',
1311                                 'action_login': 'Log In',
1312                                 'username':     username,
1313                                 'password':     password,
1314                                 }
1315                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1316                 try:
1317                         self.report_login()
1318                         login_results = urllib2.urlopen(request).read()
1319                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1320                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1321                                 return
1322                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1323                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1324                         return
1325
1326                 # Confirm age
1327                 age_form = {
1328                                 'next_url':             '/',
1329                                 'action_confirm':       'Confirm',
1330                                 }
1331                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1332                 try:
1333                         self.report_age_confirmation()
1334                         age_results = urllib2.urlopen(request).read()
1335                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1336                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1337                         return
1338
1339         def _real_extract(self, url):
1340                 # Extract video id from URL
1341                 mobj = re.match(self._VALID_URL, url)
1342                 if mobj is None:
1343                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1344                         return
1345                 video_id = mobj.group(2)
1346
1347                 # Get video webpage
1348                 self.report_video_webpage_download(video_id)
1349                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1350                 try:
1351                         video_webpage = urllib2.urlopen(request).read()
1352                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1353                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1354                         return
1355
1356                 # Attempt to extract SWF player URL
1357                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1358                 if mobj is not None:
1359                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1360                 else:
1361                         player_url = None
1362
1363                 # Get video info
1364                 self.report_video_info_webpage_download(video_id)
1365                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1366                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1367                                         % (video_id, el_type))
1368                         request = urllib2.Request(video_info_url)
1369                         try:
1370                                 video_info_webpage = urllib2.urlopen(request).read()
1371                                 video_info = parse_qs(video_info_webpage)
1372                                 if 'token' in video_info:
1373                                         break
1374                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1375                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1376                                 return
1377                 if 'token' not in video_info:
1378                         if 'reason' in video_info:
1379                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1380                         else:
1381                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1382                         return
1383
1384                 # Start extracting information
1385                 self.report_information_extraction(video_id)
1386
1387                 # uploader
1388                 if 'author' not in video_info:
1389                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1390                         return
1391                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1392
1393                 # title
1394                 if 'title' not in video_info:
1395                         self._downloader.trouble(u'ERROR: unable to extract video title')
1396                         return
1397                 video_title = urllib.unquote_plus(video_info['title'][0])
1398                 video_title = video_title.decode('utf-8')
1399                 video_title = sanitize_title(video_title)
1400
1401                 # simplified title
1402                 simple_title = _simplify_title(video_title)
1403
1404                 # thumbnail image
1405                 if 'thumbnail_url' not in video_info:
1406                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1407                         video_thumbnail = ''
1408                 else:   # don't panic if we can't find it
1409                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1410
1411                 # upload date
1412                 upload_date = u'NA'
1413                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1414                 if mobj is not None:
1415                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1416                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1417                         for expression in format_expressions:
1418                                 try:
1419                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1420                                 except:
1421                                         pass
1422
1423                 # description
1424                 try:
1425                         lxml.etree
1426                 except NameError:
1427                         video_description = u'No description available.'
1428                         mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1429                         if mobj is not None:
1430                                 video_description = mobj.group(1).decode('utf-8')
1431                 else:
1432                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1433                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1434                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1435                         # TODO use another parser
1436
1437                 # closed captions
1438                 video_subtitles = None
1439                 if self._downloader.params.get('writesubtitles', False):
1440                         self.report_video_subtitles_download(video_id)
1441                         request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1442                         try:
1443                                 srt_list = urllib2.urlopen(request).read()
1444                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1445                                 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1446                         else:
1447                                 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1448                                 if srt_lang_list:
1449                                         if self._downloader.params.get('subtitleslang', False):
1450                                                 srt_lang = self._downloader.params.get('subtitleslang')
1451                                         elif 'en' in srt_lang_list:
1452                                                 srt_lang = 'en'
1453                                         else:
1454                                                 srt_lang = srt_lang_list[0]
1455                                         if not srt_lang in srt_lang_list:
1456                                                 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1457                                         else:
1458                                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1459                                                 try:
1460                                                         srt_xml = urllib2.urlopen(request).read()
1461                                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1462                                                         self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1463                                                 else:
1464                                                         video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1465                                 else:
1466                                         self._downloader.trouble(u'WARNING: video has no closed captions')
1467
1468                 # token
1469                 video_token = urllib.unquote_plus(video_info['token'][0])
1470
1471                 # Decide which formats to download
1472                 req_format = self._downloader.params.get('format', None)
1473
1474                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1475                         self.report_rtmp_download()
1476                         video_url_list = [(None, video_info['conn'][0])]
1477                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1478                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1479                         url_data = [parse_qs(uds) for uds in url_data_strs]
1480                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1481                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1482
1483                         format_limit = self._downloader.params.get('format_limit', None)
1484                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1485                         if format_limit is not None and format_limit in available_formats:
1486                                 format_list = available_formats[available_formats.index(format_limit):]
1487                         else:
1488                                 format_list = available_formats
1489                         existing_formats = [x for x in format_list if x in url_map]
1490                         if len(existing_formats) == 0:
1491                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1492                                 return
1493                         if self._downloader.params.get('listformats', None):
1494                                 self._print_formats(existing_formats)
1495                                 return
1496                         if req_format is None or req_format == 'best':
1497                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1498                         elif req_format == 'worst':
1499                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1500                         elif req_format in ('-1', 'all'):
1501                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1502                         else:
1503                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1504                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1505                                 req_formats = req_format.split('/')
1506                                 video_url_list = None
1507                                 for rf in req_formats:
1508                                         if rf in url_map:
1509                                                 video_url_list = [(rf, url_map[rf])]
1510                                                 break
1511                                 if video_url_list is None:
1512                                         self._downloader.trouble(u'ERROR: requested format not available')
1513                                         return
1514                 else:
1515                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1516                         return
1517
1518                 for format_param, video_real_url in video_url_list:
1519                         # At this point we have a new video
1520                         self._downloader.increment_downloads()
1521
1522                         # Extension
1523                         video_extension = self._video_extensions.get(format_param, 'flv')
1524
1525                         try:
1526                                 # Process video information
1527                                 self._downloader.process_info({
1528                                         'id':           video_id.decode('utf-8'),
1529                                         'url':          video_real_url.decode('utf-8'),
1530                                         'uploader':     video_uploader.decode('utf-8'),
1531                                         'upload_date':  upload_date,
1532                                         'title':        video_title,
1533                                         'stitle':       simple_title,
1534                                         'ext':          video_extension.decode('utf-8'),
1535                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1536                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1537                                         'description':  video_description,
1538                                         'player_url':   player_url,
1539                                         'subtitles':    video_subtitles
1540                                 })
1541                         except UnavailableVideoError, err:
1542                                 self._downloader.trouble(u'\nERROR: unable to download video')
1543
1544
1545 class MetacafeIE(InfoExtractor):
1546         """Information Extractor for metacafe.com."""
1547
1548         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1549         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1550         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1551         _youtube_ie = None
1552         IE_NAME = u'metacafe'
1553
1554         def __init__(self, youtube_ie, downloader=None):
1555                 InfoExtractor.__init__(self, downloader)
1556                 self._youtube_ie = youtube_ie
1557
1558         def report_disclaimer(self):
1559                 """Report disclaimer retrieval."""
1560                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1561
1562         def report_age_confirmation(self):
1563                 """Report attempt to confirm age."""
1564                 self._downloader.to_screen(u'[metacafe] Confirming age')
1565
1566         def report_download_webpage(self, video_id):
1567                 """Report webpage download."""
1568                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1569
1570         def report_extraction(self, video_id):
1571                 """Report information extraction."""
1572                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1573
1574         def _real_initialize(self):
1575                 # Retrieve disclaimer
1576                 request = urllib2.Request(self._DISCLAIMER)
1577                 try:
1578                         self.report_disclaimer()
1579                         disclaimer = urllib2.urlopen(request).read()
1580                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1581                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1582                         return
1583
1584                 # Confirm age
1585                 disclaimer_form = {
1586                         'filters': '0',
1587                         'submit': "Continue - I'm over 18",
1588                         }
1589                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1590                 try:
1591                         self.report_age_confirmation()
1592                         disclaimer = urllib2.urlopen(request).read()
1593                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1594                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1595                         return
1596
1597         def _real_extract(self, url):
1598                 # Extract id and simplified title from URL
1599                 mobj = re.match(self._VALID_URL, url)
1600                 if mobj is None:
1601                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1602                         return
1603
1604                 video_id = mobj.group(1)
1605
1606                 # Check if video comes from YouTube
1607                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1608                 if mobj2 is not None:
1609                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1610                         return
1611
1612                 # At this point we have a new video
1613                 self._downloader.increment_downloads()
1614
1615                 simple_title = mobj.group(2).decode('utf-8')
1616
1617                 # Retrieve video webpage to extract further information
1618                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1619                 try:
1620                         self.report_download_webpage(video_id)
1621                         webpage = urllib2.urlopen(request).read()
1622                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1623                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1624                         return
1625
1626                 # Extract URL, uploader and title from webpage
1627                 self.report_extraction(video_id)
1628                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1629                 if mobj is not None:
1630                         mediaURL = urllib.unquote(mobj.group(1))
1631                         video_extension = mediaURL[-3:]
1632
1633                         # Extract gdaKey if available
1634                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1635                         if mobj is None:
1636                                 video_url = mediaURL
1637                         else:
1638                                 gdaKey = mobj.group(1)
1639                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1640                 else:
1641                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1642                         if mobj is None:
1643                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1644                                 return
1645                         vardict = parse_qs(mobj.group(1))
1646                         if 'mediaData' not in vardict:
1647                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1648                                 return
1649                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1650                         if mobj is None:
1651                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1652                                 return
1653                         mediaURL = mobj.group(1).replace('\\/', '/')
1654                         video_extension = mediaURL[-3:]
1655                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1656
1657                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1658                 if mobj is None:
1659                         self._downloader.trouble(u'ERROR: unable to extract title')
1660                         return
1661                 video_title = mobj.group(1).decode('utf-8')
1662                 video_title = sanitize_title(video_title)
1663
1664                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1665                 if mobj is None:
1666                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1667                         return
1668                 video_uploader = mobj.group(1)
1669
1670                 try:
1671                         # Process video information
1672                         self._downloader.process_info({
1673                                 'id':           video_id.decode('utf-8'),
1674                                 'url':          video_url.decode('utf-8'),
1675                                 'uploader':     video_uploader.decode('utf-8'),
1676                                 'upload_date':  u'NA',
1677                                 'title':        video_title,
1678                                 'stitle':       simple_title,
1679                                 'ext':          video_extension.decode('utf-8'),
1680                                 'format':       u'NA',
1681                                 'player_url':   None,
1682                         })
1683                 except UnavailableVideoError:
1684                         self._downloader.trouble(u'\nERROR: unable to download video')
1685
1686
1687 class DailymotionIE(InfoExtractor):
1688         """Information Extractor for Dailymotion"""
1689
1690         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1691         IE_NAME = u'dailymotion'
1692
1693         def __init__(self, downloader=None):
1694                 InfoExtractor.__init__(self, downloader)
1695
1696         def report_download_webpage(self, video_id):
1697                 """Report webpage download."""
1698                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1699
1700         def report_extraction(self, video_id):
1701                 """Report information extraction."""
1702                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1703
1704         def _real_extract(self, url):
1705                 # Extract id and simplified title from URL
1706                 mobj = re.match(self._VALID_URL, url)
1707                 if mobj is None:
1708                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1709                         return
1710
1711                 # At this point we have a new video
1712                 self._downloader.increment_downloads()
1713                 video_id = mobj.group(1)
1714
1715                 video_extension = 'flv'
1716
1717                 # Retrieve video webpage to extract further information
1718                 request = urllib2.Request(url)
1719                 request.add_header('Cookie', 'family_filter=off')
1720                 try:
1721                         self.report_download_webpage(video_id)
1722                         webpage = urllib2.urlopen(request).read()
1723                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1724                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1725                         return
1726
1727                 # Extract URL, uploader and title from webpage
1728                 self.report_extraction(video_id)
1729                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1730                 if mobj is None:
1731                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1732                         return
1733                 sequence = urllib.unquote(mobj.group(1))
1734                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1735                 if mobj is None:
1736                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1737                         return
1738                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1739
1740                 # if needed add http://www.dailymotion.com/ if relative URL
1741
1742                 video_url = mediaURL
1743
1744                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1745                 if mobj is None:
1746                         self._downloader.trouble(u'ERROR: unable to extract title')
1747                         return
1748                 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1749                 video_title = sanitize_title(video_title)
1750                 simple_title = _simplify_title(video_title)
1751
1752                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1753                 if mobj is None:
1754                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1755                         return
1756                 video_uploader = mobj.group(1)
1757
1758                 try:
1759                         # Process video information
1760                         self._downloader.process_info({
1761                                 'id':           video_id.decode('utf-8'),
1762                                 'url':          video_url.decode('utf-8'),
1763                                 'uploader':     video_uploader.decode('utf-8'),
1764                                 'upload_date':  u'NA',
1765                                 'title':        video_title,
1766                                 'stitle':       simple_title,
1767                                 'ext':          video_extension.decode('utf-8'),
1768                                 'format':       u'NA',
1769                                 'player_url':   None,
1770                         })
1771                 except UnavailableVideoError:
1772                         self._downloader.trouble(u'\nERROR: unable to download video')
1773
1774
1775 class GoogleIE(InfoExtractor):
1776         """Information extractor for video.google.com."""
1777
1778         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1779         IE_NAME = u'video.google'
1780
1781         def __init__(self, downloader=None):
1782                 InfoExtractor.__init__(self, downloader)
1783
1784         def report_download_webpage(self, video_id):
1785                 """Report webpage download."""
1786                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1787
1788         def report_extraction(self, video_id):
1789                 """Report information extraction."""
1790                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1791
1792         def _real_extract(self, url):
1793                 # Extract id from URL
1794                 mobj = re.match(self._VALID_URL, url)
1795                 if mobj is None:
1796                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1797                         return
1798
1799                 # At this point we have a new video
1800                 self._downloader.increment_downloads()
1801                 video_id = mobj.group(1)
1802
1803                 video_extension = 'mp4'
1804
1805                 # Retrieve video webpage to extract further information
1806                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1807                 try:
1808                         self.report_download_webpage(video_id)
1809                         webpage = urllib2.urlopen(request).read()
1810                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1811                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1812                         return
1813
1814                 # Extract URL, uploader, and title from webpage
1815                 self.report_extraction(video_id)
1816                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1817                 if mobj is None:
1818                         video_extension = 'flv'
1819                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1820                 if mobj is None:
1821                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1822                         return
1823                 mediaURL = urllib.unquote(mobj.group(1))
1824                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1825                 mediaURL = mediaURL.replace('\\x26', '\x26')
1826
1827                 video_url = mediaURL
1828
1829                 mobj = re.search(r'<title>(.*)</title>', webpage)
1830                 if mobj is None:
1831                         self._downloader.trouble(u'ERROR: unable to extract title')
1832                         return
1833                 video_title = mobj.group(1).decode('utf-8')
1834                 video_title = sanitize_title(video_title)
1835                 simple_title = _simplify_title(video_title)
1836
1837                 # Extract video description
1838                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1839                 if mobj is None:
1840                         self._downloader.trouble(u'ERROR: unable to extract video description')
1841                         return
1842                 video_description = mobj.group(1).decode('utf-8')
1843                 if not video_description:
1844                         video_description = 'No description available.'
1845
1846                 # Extract video thumbnail
1847                 if self._downloader.params.get('forcethumbnail', False):
1848                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1849                         try:
1850                                 webpage = urllib2.urlopen(request).read()
1851                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1852                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1853                                 return
1854                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1855                         if mobj is None:
1856                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1857                                 return
1858                         video_thumbnail = mobj.group(1)
1859                 else:   # we need something to pass to process_info
1860                         video_thumbnail = ''
1861
1862                 try:
1863                         # Process video information
1864                         self._downloader.process_info({
1865                                 'id':           video_id.decode('utf-8'),
1866                                 'url':          video_url.decode('utf-8'),
1867                                 'uploader':     u'NA',
1868                                 'upload_date':  u'NA',
1869                                 'title':        video_title,
1870                                 'stitle':       simple_title,
1871                                 'ext':          video_extension.decode('utf-8'),
1872                                 'format':       u'NA',
1873                                 'player_url':   None,
1874                         })
1875                 except UnavailableVideoError:
1876                         self._downloader.trouble(u'\nERROR: unable to download video')
1877
1878
1879 class PhotobucketIE(InfoExtractor):
1880         """Information extractor for photobucket.com."""
1881
1882         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1883         IE_NAME = u'photobucket'
1884
1885         def __init__(self, downloader=None):
1886                 InfoExtractor.__init__(self, downloader)
1887
1888         def report_download_webpage(self, video_id):
1889                 """Report webpage download."""
1890                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1891
1892         def report_extraction(self, video_id):
1893                 """Report information extraction."""
1894                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1895
1896         def _real_extract(self, url):
1897                 # Extract id from URL
1898                 mobj = re.match(self._VALID_URL, url)
1899                 if mobj is None:
1900                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1901                         return
1902
1903                 # At this point we have a new video
1904                 self._downloader.increment_downloads()
1905                 video_id = mobj.group(1)
1906
1907                 video_extension = 'flv'
1908
1909                 # Retrieve video webpage to extract further information
1910                 request = urllib2.Request(url)
1911                 try:
1912                         self.report_download_webpage(video_id)
1913                         webpage = urllib2.urlopen(request).read()
1914                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1915                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1916                         return
1917
1918                 # Extract URL, uploader, and title from webpage
1919                 self.report_extraction(video_id)
1920                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1921                 if mobj is None:
1922                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1923                         return
1924                 mediaURL = urllib.unquote(mobj.group(1))
1925
1926                 video_url = mediaURL
1927
1928                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1929                 if mobj is None:
1930                         self._downloader.trouble(u'ERROR: unable to extract title')
1931                         return
1932                 video_title = mobj.group(1).decode('utf-8')
1933                 video_title = sanitize_title(video_title)
1934                 simple_title = _simplify_title(vide_title)
1935
1936                 video_uploader = mobj.group(2).decode('utf-8')
1937
1938                 try:
1939                         # Process video information
1940                         self._downloader.process_info({
1941                                 'id':           video_id.decode('utf-8'),
1942                                 'url':          video_url.decode('utf-8'),
1943                                 'uploader':     video_uploader,
1944                                 'upload_date':  u'NA',
1945                                 'title':        video_title,
1946                                 'stitle':       simple_title,
1947                                 'ext':          video_extension.decode('utf-8'),
1948                                 'format':       u'NA',
1949                                 'player_url':   None,
1950                         })
1951                 except UnavailableVideoError:
1952                         self._downloader.trouble(u'\nERROR: unable to download video')
1953
1954
1955 class YahooIE(InfoExtractor):
1956         """Information extractor for video.yahoo.com."""
1957
1958         # _VALID_URL matches all Yahoo! Video URLs
1959         # _VPAGE_URL matches only the extractable '/watch/' URLs
1960         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1961         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1962         IE_NAME = u'video.yahoo'
1963
1964         def __init__(self, downloader=None):
1965                 InfoExtractor.__init__(self, downloader)
1966
1967         def report_download_webpage(self, video_id):
1968                 """Report webpage download."""
1969                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1970
1971         def report_extraction(self, video_id):
1972                 """Report information extraction."""
1973                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1974
1975         def _real_extract(self, url, new_video=True):
1976                 # Extract ID from URL
1977                 mobj = re.match(self._VALID_URL, url)
1978                 if mobj is None:
1979                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1980                         return
1981
1982                 # At this point we have a new video
1983                 self._downloader.increment_downloads()
1984                 video_id = mobj.group(2)
1985                 video_extension = 'flv'
1986
1987                 # Rewrite valid but non-extractable URLs as
1988                 # extractable English language /watch/ URLs
1989                 if re.match(self._VPAGE_URL, url) is None:
1990                         request = urllib2.Request(url)
1991                         try:
1992                                 webpage = urllib2.urlopen(request).read()
1993                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1994                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1995                                 return
1996
1997                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1998                         if mobj is None:
1999                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
2000                                 return
2001                         yahoo_id = mobj.group(1)
2002
2003                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2004                         if mobj is None:
2005                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2006                                 return
2007                         yahoo_vid = mobj.group(1)
2008
2009                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2010                         return self._real_extract(url, new_video=False)
2011
2012                 # Retrieve video webpage to extract further information
2013                 request = urllib2.Request(url)
2014                 try:
2015                         self.report_download_webpage(video_id)
2016                         webpage = urllib2.urlopen(request).read()
2017                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2018                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2019                         return
2020
2021                 # Extract uploader and title from webpage
2022                 self.report_extraction(video_id)
2023                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2024                 if mobj is None:
2025                         self._downloader.trouble(u'ERROR: unable to extract video title')
2026                         return
2027                 video_title = mobj.group(1).decode('utf-8')
2028                 simple_title = _simplify_title(video_title)
2029
2030                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2031                 if mobj is None:
2032                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2033                         return
2034                 video_uploader = mobj.group(1).decode('utf-8')
2035
2036                 # Extract video thumbnail
2037                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2038                 if mobj is None:
2039                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2040                         return
2041                 video_thumbnail = mobj.group(1).decode('utf-8')
2042
2043                 # Extract video description
2044                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2045                 if mobj is None:
2046                         self._downloader.trouble(u'ERROR: unable to extract video description')
2047                         return
2048                 video_description = mobj.group(1).decode('utf-8')
2049                 if not video_description:
2050                         video_description = 'No description available.'
2051
2052                 # Extract video height and width
2053                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2054                 if mobj is None:
2055                         self._downloader.trouble(u'ERROR: unable to extract video height')
2056                         return
2057                 yv_video_height = mobj.group(1)
2058
2059                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2060                 if mobj is None:
2061                         self._downloader.trouble(u'ERROR: unable to extract video width')
2062                         return
2063                 yv_video_width = mobj.group(1)
2064
2065                 # Retrieve video playlist to extract media URL
2066                 # I'm not completely sure what all these options are, but we
2067                 # seem to need most of them, otherwise the server sends a 401.
2068                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
2069                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
2070                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2071                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2072                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2073                 try:
2074                         self.report_download_webpage(video_id)
2075                         webpage = urllib2.urlopen(request).read()
2076                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2077                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2078                         return
2079
2080                 # Extract media URL from playlist XML
2081                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2082                 if mobj is None:
2083                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
2084                         return
2085                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2086                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2087
2088                 try:
2089                         # Process video information
2090                         self._downloader.process_info({
2091                                 'id':           video_id.decode('utf-8'),
2092                                 'url':          video_url,
2093                                 'uploader':     video_uploader,
2094                                 'upload_date':  u'NA',
2095                                 'title':        video_title,
2096                                 'stitle':       simple_title,
2097                                 'ext':          video_extension.decode('utf-8'),
2098                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2099                                 'description':  video_description,
2100                                 'thumbnail':    video_thumbnail,
2101                                 'player_url':   None,
2102                         })
2103                 except UnavailableVideoError:
2104                         self._downloader.trouble(u'\nERROR: unable to download video')
2105
2106
2107 class VimeoIE(InfoExtractor):
2108         """Information extractor for vimeo.com."""
2109
2110         # _VALID_URL matches Vimeo URLs
2111         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2112         IE_NAME = u'vimeo'
2113
2114         def __init__(self, downloader=None):
2115                 InfoExtractor.__init__(self, downloader)
2116
2117         def report_download_webpage(self, video_id):
2118                 """Report webpage download."""
2119                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2120
2121         def report_extraction(self, video_id):
2122                 """Report information extraction."""
2123                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2124
2125         def _real_extract(self, url, new_video=True):
2126                 # Extract ID from URL
2127                 mobj = re.match(self._VALID_URL, url)
2128                 if mobj is None:
2129                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2130                         return
2131
2132                 # At this point we have a new video
2133                 self._downloader.increment_downloads()
2134                 video_id = mobj.group(1)
2135
2136                 # Retrieve video webpage to extract further information
2137                 request = urllib2.Request(url, None, std_headers)
2138                 try:
2139                         self.report_download_webpage(video_id)
2140                         webpage = urllib2.urlopen(request).read()
2141                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2142                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2143                         return
2144
2145                 # Now we begin extracting as much information as we can from what we
2146                 # retrieved. First we extract the information common to all extractors,
2147                 # and latter we extract those that are Vimeo specific.
2148                 self.report_extraction(video_id)
2149
2150                 # Extract the config JSON
2151                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2152                 try:
2153                         config = json.loads(config)
2154                 except:
2155                         self._downloader.trouble(u'ERROR: unable to extract info section')
2156                         return
2157
2158                 # Extract title
2159                 video_title = config["video"]["title"]
2160                 simple_title = _simplify_title(video_title)
2161
2162                 # Extract uploader
2163                 video_uploader = config["video"]["owner"]["name"]
2164
2165                 # Extract video thumbnail
2166                 video_thumbnail = config["video"]["thumbnail"]
2167
2168                 # Extract video description
2169                 try:
2170                         lxml.etree
2171                 except NameError:
2172                         video_description = u'No description available.'
2173                         mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2174                         if mobj is not None:
2175                                 video_description = mobj.group(1)
2176                 else:
2177                         html_parser = lxml.etree.HTMLParser()
2178                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2179                         video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2180                         # TODO use another parser
2181
2182                 # Extract upload date
2183                 video_upload_date = u'NA'
2184                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2185                 if mobj is not None:
2186                         video_upload_date = mobj.group(1)
2187
2188                 # Vimeo specific: extract request signature and timestamp
2189                 sig = config['request']['signature']
2190                 timestamp = config['request']['timestamp']
2191
2192                 # Vimeo specific: extract video codec and quality information
2193                 # TODO bind to format param
2194                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2195                 for codec in codecs:
2196                         if codec[0] in config["video"]["files"]:
2197                                 video_codec = codec[0]
2198                                 video_extension = codec[1]
2199                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2200                                 else: quality = 'sd'
2201                                 break
2202                 else:
2203                         self._downloader.trouble(u'ERROR: no known codec found')
2204                         return
2205
2206                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2207                                         %(video_id, sig, timestamp, quality, video_codec.upper())
2208
2209                 try:
2210                         # Process video information
2211                         self._downloader.process_info({
2212                                 'id':           video_id,
2213                                 'url':          video_url,
2214                                 'uploader':     video_uploader,
2215                                 'upload_date':  video_upload_date,
2216                                 'title':        video_title,
2217                                 'stitle':       simple_title,
2218                                 'ext':          video_extension,
2219                                 'thumbnail':    video_thumbnail,
2220                                 'description':  video_description,
2221                                 'player_url':   None,
2222                         })
2223                 except UnavailableVideoError:
2224                         self._downloader.trouble(u'ERROR: unable to download video')
2225
2226
2227 class GenericIE(InfoExtractor):
2228         """Generic last-resort information extractor."""
2229
2230         _VALID_URL = r'.*'
2231         IE_NAME = u'generic'
2232
2233         def __init__(self, downloader=None):
2234                 InfoExtractor.__init__(self, downloader)
2235
2236         def report_download_webpage(self, video_id):
2237                 """Report webpage download."""
2238                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2239                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2240
2241         def report_extraction(self, video_id):
2242                 """Report information extraction."""
2243                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2244
2245         def _real_extract(self, url):
2246                 # At this point we have a new video
2247                 self._downloader.increment_downloads()
2248
2249                 video_id = url.split('/')[-1]
2250                 request = urllib2.Request(url)
2251                 try:
2252                         self.report_download_webpage(video_id)
2253                         webpage = urllib2.urlopen(request).read()
2254                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2255                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2256                         return
2257                 except ValueError, err:
2258                         # since this is the last-resort InfoExtractor, if
2259                         # this error is thrown, it'll be thrown here
2260                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2261                         return
2262
2263                 self.report_extraction(video_id)
2264                 # Start with something easy: JW Player in SWFObject
2265                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2266                 if mobj is None:
2267                         # Broaden the search a little bit
2268                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2269                 if mobj is None:
2270                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2271                         return
2272
2273                 # It's possible that one of the regexes
2274                 # matched, but returned an empty group:
2275                 if mobj.group(1) is None:
2276                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2277                         return
2278
2279                 video_url = urllib.unquote(mobj.group(1))
2280                 video_id = os.path.basename(video_url)
2281
2282                 # here's a fun little line of code for you:
2283                 video_extension = os.path.splitext(video_id)[1][1:]
2284                 video_id = os.path.splitext(video_id)[0]
2285
2286                 # it's tempting to parse this further, but you would
2287                 # have to take into account all the variations like
2288                 #   Video Title - Site Name
2289                 #   Site Name | Video Title
2290                 #   Video Title - Tagline | Site Name
2291                 # and so on and so forth; it's just not practical
2292                 mobj = re.search(r'<title>(.*)</title>', webpage)
2293                 if mobj is None:
2294                         self._downloader.trouble(u'ERROR: unable to extract title')
2295                         return
2296                 video_title = mobj.group(1).decode('utf-8')
2297                 video_title = sanitize_title(video_title)
2298                 simple_title = _simplify_title(video_title)
2299
2300                 # video uploader is domain name
2301                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2302                 if mobj is None:
2303                         self._downloader.trouble(u'ERROR: unable to extract title')
2304                         return
2305                 video_uploader = mobj.group(1).decode('utf-8')
2306
2307                 try:
2308                         # Process video information
2309                         self._downloader.process_info({
2310                                 'id':           video_id.decode('utf-8'),
2311                                 'url':          video_url.decode('utf-8'),
2312                                 'uploader':     video_uploader,
2313                                 'upload_date':  u'NA',
2314                                 'title':        video_title,
2315                                 'stitle':       simple_title,
2316                                 'ext':          video_extension.decode('utf-8'),
2317                                 'format':       u'NA',
2318                                 'player_url':   None,
2319                         })
2320                 except UnavailableVideoError, err:
2321                         self._downloader.trouble(u'\nERROR: unable to download video')
2322
2323
2324 class YoutubeSearchIE(InfoExtractor):
2325         """Information Extractor for YouTube search queries."""
2326         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2327         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2328         _youtube_ie = None
2329         _max_youtube_results = 1000
2330         IE_NAME = u'youtube:search'
2331
2332         def __init__(self, youtube_ie, downloader=None):
2333                 InfoExtractor.__init__(self, downloader)
2334                 self._youtube_ie = youtube_ie
2335
2336         def report_download_page(self, query, pagenum):
2337                 """Report attempt to download playlist page with given number."""
2338                 query = query.decode(preferredencoding())
2339                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2340
2341         def _real_initialize(self):
2342                 self._youtube_ie.initialize()
2343
2344         def _real_extract(self, query):
2345                 mobj = re.match(self._VALID_URL, query)
2346                 if mobj is None:
2347                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2348                         return
2349
2350                 prefix, query = query.split(':')
2351                 prefix = prefix[8:]
2352                 query = query.encode('utf-8')
2353                 if prefix == '':
2354                         self._download_n_results(query, 1)
2355                         return
2356                 elif prefix == 'all':
2357                         self._download_n_results(query, self._max_youtube_results)
2358                         return
2359                 else:
2360                         try:
2361                                 n = long(prefix)
2362                                 if n <= 0:
2363                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2364                                         return
2365                                 elif n > self._max_youtube_results:
2366                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2367                                         n = self._max_youtube_results
2368                                 self._download_n_results(query, n)
2369                                 return
2370                         except ValueError: # parsing prefix as integer fails
2371                                 self._download_n_results(query, 1)
2372                                 return
2373
2374         def _download_n_results(self, query, n):
2375                 """Downloads a specified number of results for a query"""
2376
2377                 video_ids = []
2378                 pagenum = 0
2379                 limit = n
2380
2381                 while (50 * pagenum) < limit:
2382                         self.report_download_page(query, pagenum+1)
2383                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2384                         request = urllib2.Request(result_url)
2385                         try:
2386                                 data = urllib2.urlopen(request).read()
2387                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2388                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2389                                 return
2390                         api_response = json.loads(data)['data']
2391
2392                         new_ids = list(video['id'] for video in api_response['items'])
2393                         video_ids += new_ids
2394
2395                         limit = min(n, api_response['totalItems'])
2396                         pagenum += 1
2397
2398                 if len(video_ids) > n:
2399                         video_ids = video_ids[:n]
2400                 for id in video_ids:
2401                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2402                 return
2403
2404
2405 class GoogleSearchIE(InfoExtractor):
2406         """Information Extractor for Google Video search queries."""
2407         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2408         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2409         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2410         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2411         _google_ie = None
2412         _max_google_results = 1000
2413         IE_NAME = u'video.google:search'
2414
2415         def __init__(self, google_ie, downloader=None):
2416                 InfoExtractor.__init__(self, downloader)
2417                 self._google_ie = google_ie
2418
2419         def report_download_page(self, query, pagenum):
2420                 """Report attempt to download playlist page with given number."""
2421                 query = query.decode(preferredencoding())
2422                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2423
2424         def _real_initialize(self):
2425                 self._google_ie.initialize()
2426
2427         def _real_extract(self, query):
2428                 mobj = re.match(self._VALID_URL, query)
2429                 if mobj is None:
2430                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2431                         return
2432
2433                 prefix, query = query.split(':')
2434                 prefix = prefix[8:]
2435                 query = query.encode('utf-8')
2436                 if prefix == '':
2437                         self._download_n_results(query, 1)
2438                         return
2439                 elif prefix == 'all':
2440                         self._download_n_results(query, self._max_google_results)
2441                         return
2442                 else:
2443                         try:
2444                                 n = long(prefix)
2445                                 if n <= 0:
2446                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2447                                         return
2448                                 elif n > self._max_google_results:
2449                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2450                                         n = self._max_google_results
2451                                 self._download_n_results(query, n)
2452                                 return
2453                         except ValueError: # parsing prefix as integer fails
2454                                 self._download_n_results(query, 1)
2455                                 return
2456
2457         def _download_n_results(self, query, n):
2458                 """Downloads a specified number of results for a query"""
2459
2460                 video_ids = []
2461                 pagenum = 0
2462
2463                 while True:
2464                         self.report_download_page(query, pagenum)
2465                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2466                         request = urllib2.Request(result_url)
2467                         try:
2468                                 page = urllib2.urlopen(request).read()
2469                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2470                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2471                                 return
2472
2473                         # Extract video identifiers
2474                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2475                                 video_id = mobj.group(1)
2476                                 if video_id not in video_ids:
2477                                         video_ids.append(video_id)
2478                                         if len(video_ids) == n:
2479                                                 # Specified n videos reached
2480                                                 for id in video_ids:
2481                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2482                                                 return
2483
2484                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2485                                 for id in video_ids:
2486                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2487                                 return
2488
2489                         pagenum = pagenum + 1
2490
2491
2492 class YahooSearchIE(InfoExtractor):
2493         """Information Extractor for Yahoo! Video search queries."""
2494         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2495         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2496         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2497         _MORE_PAGES_INDICATOR = r'\s*Next'
2498         _yahoo_ie = None
2499         _max_yahoo_results = 1000
2500         IE_NAME = u'video.yahoo:search'
2501
2502         def __init__(self, yahoo_ie, downloader=None):
2503                 InfoExtractor.__init__(self, downloader)
2504                 self._yahoo_ie = yahoo_ie
2505
2506         def report_download_page(self, query, pagenum):
2507                 """Report attempt to download playlist page with given number."""
2508                 query = query.decode(preferredencoding())
2509                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2510
2511         def _real_initialize(self):
2512                 self._yahoo_ie.initialize()
2513
2514         def _real_extract(self, query):
2515                 mobj = re.match(self._VALID_URL, query)
2516                 if mobj is None:
2517                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2518                         return
2519
2520                 prefix, query = query.split(':')
2521                 prefix = prefix[8:]
2522                 query = query.encode('utf-8')
2523                 if prefix == '':
2524                         self._download_n_results(query, 1)
2525                         return
2526                 elif prefix == 'all':
2527                         self._download_n_results(query, self._max_yahoo_results)
2528                         return
2529                 else:
2530                         try:
2531                                 n = long(prefix)
2532                                 if n <= 0:
2533                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2534                                         return
2535                                 elif n > self._max_yahoo_results:
2536                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2537                                         n = self._max_yahoo_results
2538                                 self._download_n_results(query, n)
2539                                 return
2540                         except ValueError: # parsing prefix as integer fails
2541                                 self._download_n_results(query, 1)
2542                                 return
2543
2544         def _download_n_results(self, query, n):
2545                 """Downloads a specified number of results for a query"""
2546
2547                 video_ids = []
2548                 already_seen = set()
2549                 pagenum = 1
2550
2551                 while True:
2552                         self.report_download_page(query, pagenum)
2553                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2554                         request = urllib2.Request(result_url)
2555                         try:
2556                                 page = urllib2.urlopen(request).read()
2557                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2558                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2559                                 return
2560
2561                         # Extract video identifiers
2562                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2563                                 video_id = mobj.group(1)
2564                                 if video_id not in already_seen:
2565                                         video_ids.append(video_id)
2566                                         already_seen.add(video_id)
2567                                         if len(video_ids) == n:
2568                                                 # Specified n videos reached
2569                                                 for id in video_ids:
2570                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2571                                                 return
2572
2573                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2574                                 for id in video_ids:
2575                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2576                                 return
2577
2578                         pagenum = pagenum + 1
2579
2580
2581 class YoutubePlaylistIE(InfoExtractor):
2582         """Information Extractor for YouTube playlists."""
2583
2584         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2585         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2586         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
2587         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2588         _youtube_ie = None
2589         IE_NAME = u'youtube:playlist'
2590
2591         def __init__(self, youtube_ie, downloader=None):
2592                 InfoExtractor.__init__(self, downloader)
2593                 self._youtube_ie = youtube_ie
2594
2595         def report_download_page(self, playlist_id, pagenum):
2596                 """Report attempt to download playlist page with given number."""
2597                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2598
2599         def _real_initialize(self):
2600                 self._youtube_ie.initialize()
2601
2602         def _real_extract(self, url):
2603                 # Extract playlist id
2604                 mobj = re.match(self._VALID_URL, url)
2605                 if mobj is None:
2606                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2607                         return
2608
2609                 # Single video case
2610                 if mobj.group(3) is not None:
2611                         self._youtube_ie.extract(mobj.group(3))
2612                         return
2613
2614                 # Download playlist pages
2615                 # prefix is 'p' as default for playlists but there are other types that need extra care
2616                 playlist_prefix = mobj.group(1)
2617                 if playlist_prefix == 'a':
2618                         playlist_access = 'artist'
2619                 else:
2620                         playlist_prefix = 'p'
2621                         playlist_access = 'view_play_list'
2622                 playlist_id = mobj.group(2)
2623                 video_ids = []
2624                 pagenum = 1
2625
2626                 while True:
2627                         self.report_download_page(playlist_id, pagenum)
2628                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2629                         request = urllib2.Request(url)
2630                         try:
2631                                 page = urllib2.urlopen(request).read()
2632                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2633                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2634                                 return
2635
2636                         # Extract video identifiers
2637                         ids_in_page = []
2638                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2639                                 if mobj.group(1) not in ids_in_page:
2640                                         ids_in_page.append(mobj.group(1))
2641                         video_ids.extend(ids_in_page)
2642
2643                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2644                                 break
2645                         pagenum = pagenum + 1
2646
2647                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2648                 playlistend = self._downloader.params.get('playlistend', -1)
2649                 if playlistend == -1:
2650                         video_ids = video_ids[playliststart:]
2651                 else:
2652                         video_ids = video_ids[playliststart:playlistend]
2653
2654                 for id in video_ids:
2655                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2656                 return
2657
2658
2659 class YoutubeUserIE(InfoExtractor):
2660         """Information Extractor for YouTube users."""
2661
2662         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2663         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2664         _GDATA_PAGE_SIZE = 50
2665         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2666         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2667         _youtube_ie = None
2668         IE_NAME = u'youtube:user'
2669
2670         def __init__(self, youtube_ie, downloader=None):
2671                 InfoExtractor.__init__(self, downloader)
2672                 self._youtube_ie = youtube_ie
2673
2674         def report_download_page(self, username, start_index):
2675                 """Report attempt to download user page."""
2676                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2677                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2678
2679         def _real_initialize(self):
2680                 self._youtube_ie.initialize()
2681
2682         def _real_extract(self, url):
2683                 # Extract username
2684                 mobj = re.match(self._VALID_URL, url)
2685                 if mobj is None:
2686                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2687                         return
2688
2689                 username = mobj.group(1)
2690
2691                 # Download video ids using YouTube Data API. Result size per
2692                 # query is limited (currently to 50 videos) so we need to query
2693                 # page by page until there are no video ids - it means we got
2694                 # all of them.
2695
2696                 video_ids = []
2697                 pagenum = 0
2698
2699                 while True:
2700                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2701                         self.report_download_page(username, start_index)
2702
2703                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2704
2705                         try:
2706                                 page = urllib2.urlopen(request).read()
2707                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2708                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2709                                 return
2710
2711                         # Extract video identifiers
2712                         ids_in_page = []
2713
2714                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2715                                 if mobj.group(1) not in ids_in_page:
2716                                         ids_in_page.append(mobj.group(1))
2717
2718                         video_ids.extend(ids_in_page)
2719
2720                         # A little optimization - if current page is not
2721                         # "full", ie. does not contain PAGE_SIZE video ids then
2722                         # we can assume that this page is the last one - there
2723                         # are no more ids on further pages - no need to query
2724                         # again.
2725
2726                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2727                                 break
2728
2729                         pagenum += 1
2730
2731                 all_ids_count = len(video_ids)
2732                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2733                 playlistend = self._downloader.params.get('playlistend', -1)
2734
2735                 if playlistend == -1:
2736                         video_ids = video_ids[playliststart:]
2737                 else:
2738                         video_ids = video_ids[playliststart:playlistend]
2739
2740                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2741                                 (username, all_ids_count, len(video_ids)))
2742
2743                 for video_id in video_ids:
2744                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2745
2746
2747 class DepositFilesIE(InfoExtractor):
2748         """Information extractor for depositfiles.com"""
2749
2750         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2751         IE_NAME = u'DepositFiles'
2752
2753         def __init__(self, downloader=None):
2754                 InfoExtractor.__init__(self, downloader)
2755
2756         def report_download_webpage(self, file_id):
2757                 """Report webpage download."""
2758                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2759
2760         def report_extraction(self, file_id):
2761                 """Report information extraction."""
2762                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2763
2764         def _real_extract(self, url):
2765                 # At this point we have a new file
2766                 self._downloader.increment_downloads()
2767
2768                 file_id = url.split('/')[-1]
2769                 # Rebuild url in english locale
2770                 url = 'http://depositfiles.com/en/files/' + file_id
2771
2772                 # Retrieve file webpage with 'Free download' button pressed
2773                 free_download_indication = { 'gateway_result' : '1' }
2774                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2775                 try:
2776                         self.report_download_webpage(file_id)
2777                         webpage = urllib2.urlopen(request).read()
2778                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2779                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2780                         return
2781
2782                 # Search for the real file URL
2783                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2784                 if (mobj is None) or (mobj.group(1) is None):
2785                         # Try to figure out reason of the error.
2786                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2787                         if (mobj is not None) and (mobj.group(1) is not None):
2788                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2789                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2790                         else:
2791                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2792                         return
2793
2794                 file_url = mobj.group(1)
2795                 file_extension = os.path.splitext(file_url)[1][1:]
2796
2797                 # Search for file title
2798                 mobj = re.search(r'<b title="(.*?)">', webpage)
2799                 if mobj is None:
2800                         self._downloader.trouble(u'ERROR: unable to extract title')
2801                         return
2802                 file_title = mobj.group(1).decode('utf-8')
2803
2804                 try:
2805                         # Process file information
2806                         self._downloader.process_info({
2807                                 'id':           file_id.decode('utf-8'),
2808                                 'url':          file_url.decode('utf-8'),
2809                                 'uploader':     u'NA',
2810                                 'upload_date':  u'NA',
2811                                 'title':        file_title,
2812                                 'stitle':       file_title,
2813                                 'ext':          file_extension.decode('utf-8'),
2814                                 'format':       u'NA',
2815                                 'player_url':   None,
2816                         })
2817                 except UnavailableVideoError, err:
2818                         self._downloader.trouble(u'ERROR: unable to download file')
2819
2820
2821 class FacebookIE(InfoExtractor):
2822         """Information Extractor for Facebook"""
2823
2824         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2825         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2826         _NETRC_MACHINE = 'facebook'
2827         _available_formats = ['video', 'highqual', 'lowqual']
2828         _video_extensions = {
2829                 'video': 'mp4',
2830                 'highqual': 'mp4',
2831                 'lowqual': 'mp4',
2832         }
2833         IE_NAME = u'facebook'
2834
2835         def __init__(self, downloader=None):
2836                 InfoExtractor.__init__(self, downloader)
2837
2838         def _reporter(self, message):
2839                 """Add header and report message."""
2840                 self._downloader.to_screen(u'[facebook] %s' % message)
2841
2842         def report_login(self):
2843                 """Report attempt to log in."""
2844                 self._reporter(u'Logging in')
2845
2846         def report_video_webpage_download(self, video_id):
2847                 """Report attempt to download video webpage."""
2848                 self._reporter(u'%s: Downloading video webpage' % video_id)
2849
2850         def report_information_extraction(self, video_id):
2851                 """Report attempt to extract video information."""
2852                 self._reporter(u'%s: Extracting video information' % video_id)
2853
2854         def _parse_page(self, video_webpage):
2855                 """Extract video information from page"""
2856                 # General data
2857                 data = {'title': r'\("video_title", "(.*?)"\)',
2858                         'description': r'<div class="datawrap">(.*?)</div>',
2859                         'owner': r'\("video_owner_name", "(.*?)"\)',
2860                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2861                         }
2862                 video_info = {}
2863                 for piece in data.keys():
2864                         mobj = re.search(data[piece], video_webpage)
2865                         if mobj is not None:
2866                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2867
2868                 # Video urls
2869                 video_urls = {}
2870                 for fmt in self._available_formats:
2871                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2872                         if mobj is not None:
2873                                 # URL is in a Javascript segment inside an escaped Unicode format within
2874                                 # the generally utf-8 page
2875                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2876                 video_info['video_urls'] = video_urls
2877
2878                 return video_info
2879
2880         def _real_initialize(self):
2881                 if self._downloader is None:
2882                         return
2883
2884                 useremail = None
2885                 password = None
2886                 downloader_params = self._downloader.params
2887
2888                 # Attempt to use provided username and password or .netrc data
2889                 if downloader_params.get('username', None) is not None:
2890                         useremail = downloader_params['username']
2891                         password = downloader_params['password']
2892                 elif downloader_params.get('usenetrc', False):
2893                         try:
2894                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2895                                 if info is not None:
2896                                         useremail = info[0]
2897                                         password = info[2]
2898                                 else:
2899                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2900                         except (IOError, netrc.NetrcParseError), err:
2901                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2902                                 return
2903
2904                 if useremail is None:
2905                         return
2906
2907                 # Log in
2908                 login_form = {
2909                         'email': useremail,
2910                         'pass': password,
2911                         'login': 'Log+In'
2912                         }
2913                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2914                 try:
2915                         self.report_login()
2916                         login_results = urllib2.urlopen(request).read()
2917                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2918                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2919                                 return
2920                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2921                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2922                         return
2923
2924         def _real_extract(self, url):
2925                 mobj = re.match(self._VALID_URL, url)
2926                 if mobj is None:
2927                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2928                         return
2929                 video_id = mobj.group('ID')
2930
2931                 # Get video webpage
2932                 self.report_video_webpage_download(video_id)
2933                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2934                 try:
2935                         page = urllib2.urlopen(request)
2936                         video_webpage = page.read()
2937                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2938                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2939                         return
2940
2941                 # Start extracting information
2942                 self.report_information_extraction(video_id)
2943
2944                 # Extract information
2945                 video_info = self._parse_page(video_webpage)
2946
2947                 # uploader
2948                 if 'owner' not in video_info:
2949                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2950                         return
2951                 video_uploader = video_info['owner']
2952
2953                 # title
2954                 if 'title' not in video_info:
2955                         self._downloader.trouble(u'ERROR: unable to extract video title')
2956                         return
2957                 video_title = video_info['title']
2958                 video_title = video_title.decode('utf-8')
2959                 video_title = sanitize_title(video_title)
2960
2961                 simple_title = _simplify_title(video_title)
2962
2963                 # thumbnail image
2964                 if 'thumbnail' not in video_info:
2965                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2966                         video_thumbnail = ''
2967                 else:
2968                         video_thumbnail = video_info['thumbnail']
2969
2970                 # upload date
2971                 upload_date = u'NA'
2972                 if 'upload_date' in video_info:
2973                         upload_time = video_info['upload_date']
2974                         timetuple = email.utils.parsedate_tz(upload_time)
2975                         if timetuple is not None:
2976                                 try:
2977                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2978                                 except:
2979                                         pass
2980
2981                 # description
2982                 video_description = video_info.get('description', 'No description available.')
2983
2984                 url_map = video_info['video_urls']
2985                 if len(url_map.keys()) > 0:
2986                         # Decide which formats to download
2987                         req_format = self._downloader.params.get('format', None)
2988                         format_limit = self._downloader.params.get('format_limit', None)
2989
2990                         if format_limit is not None and format_limit in self._available_formats:
2991                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2992                         else:
2993                                 format_list = self._available_formats
2994                         existing_formats = [x for x in format_list if x in url_map]
2995                         if len(existing_formats) == 0:
2996                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2997                                 return
2998                         if req_format is None:
2999                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3000                         elif req_format == 'worst':
3001                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3002                         elif req_format == '-1':
3003                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3004                         else:
3005                                 # Specific format
3006                                 if req_format not in url_map:
3007                                         self._downloader.trouble(u'ERROR: requested format not available')
3008                                         return
3009                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
3010
3011                 for format_param, video_real_url in video_url_list:
3012
3013                         # At this point we have a new video
3014                         self._downloader.increment_downloads()
3015
3016                         # Extension
3017                         video_extension = self._video_extensions.get(format_param, 'mp4')
3018
3019                         try:
3020                                 # Process video information
3021                                 self._downloader.process_info({
3022                                         'id':           video_id.decode('utf-8'),
3023                                         'url':          video_real_url.decode('utf-8'),
3024                                         'uploader':     video_uploader.decode('utf-8'),
3025                                         'upload_date':  upload_date,
3026                                         'title':        video_title,
3027                                         'stitle':       simple_title,
3028                                         'ext':          video_extension.decode('utf-8'),
3029                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3030                                         'thumbnail':    video_thumbnail.decode('utf-8'),
3031                                         'description':  video_description.decode('utf-8'),
3032                                         'player_url':   None,
3033                                 })
3034                         except UnavailableVideoError, err:
3035                                 self._downloader.trouble(u'\nERROR: unable to download video')
3036
3037 class BlipTVIE(InfoExtractor):
3038         """Information extractor for blip.tv"""
3039
3040         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
3041         _URL_EXT = r'^.*\.([a-z0-9]+)$'
3042         IE_NAME = u'blip.tv'
3043
3044         def report_extraction(self, file_id):
3045                 """Report information extraction."""
3046                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3047
3048         def report_direct_download(self, title):
3049                 """Report information extraction."""
3050                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3051
3052         def _real_extract(self, url):
3053                 mobj = re.match(self._VALID_URL, url)
3054                 if mobj is None:
3055                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3056                         return
3057
3058                 if '?' in url:
3059                         cchar = '&'
3060                 else:
3061                         cchar = '?'
3062                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3063                 request = urllib2.Request(json_url)
3064                 self.report_extraction(mobj.group(1))
3065                 info = None
3066                 try:
3067                         urlh = urllib2.urlopen(request)
3068                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3069                                 basename = url.split('/')[-1]
3070                                 title,ext = os.path.splitext(basename)
3071                                 title = title.decode('UTF-8')
3072                                 ext = ext.replace('.', '')
3073                                 self.report_direct_download(title)
3074                                 info = {
3075                                         'id': title,
3076                                         'url': url,
3077                                         'title': title,
3078                                         'stitle': _simplify_title(title),
3079                                         'ext': ext,
3080                                         'urlhandle': urlh
3081                                 }
3082                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3083                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3084                         return
3085                 if info is None: # Regular URL
3086                         try:
3087                                 json_code = urlh.read()
3088                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3089                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3090                                 return
3091
3092                         try:
3093                                 json_data = json.loads(json_code)
3094                                 if 'Post' in json_data:
3095                                         data = json_data['Post']
3096                                 else:
3097                                         data = json_data
3098
3099                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3100                                 video_url = data['media']['url']
3101                                 umobj = re.match(self._URL_EXT, video_url)
3102                                 if umobj is None:
3103                                         raise ValueError('Can not determine filename extension')
3104                                 ext = umobj.group(1)
3105
3106                                 info = {
3107                                         'id': data['item_id'],
3108                                         'url': video_url,
3109                                         'uploader': data['display_name'],
3110                                         'upload_date': upload_date,
3111                                         'title': data['title'],
3112                                         'stitle': _simplify_title(data['title']),
3113                                         'ext': ext,
3114                                         'format': data['media']['mimeType'],
3115                                         'thumbnail': data['thumbnailUrl'],
3116                                         'description': data['description'],
3117                                         'player_url': data['embedUrl']
3118                                 }
3119                         except (ValueError,KeyError), err:
3120                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3121                                 return
3122
3123                 self._downloader.increment_downloads()
3124
3125                 try:
3126                         self._downloader.process_info(info)
3127                 except UnavailableVideoError, err:
3128                         self._downloader.trouble(u'\nERROR: unable to download video')
3129
3130
3131 class MyVideoIE(InfoExtractor):
3132         """Information Extractor for myvideo.de."""
3133
3134         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3135         IE_NAME = u'myvideo'
3136
3137         def __init__(self, downloader=None):
3138                 InfoExtractor.__init__(self, downloader)
3139
3140         def report_download_webpage(self, video_id):
3141                 """Report webpage download."""
3142                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3143
3144         def report_extraction(self, video_id):
3145                 """Report information extraction."""
3146                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3147
3148         def _real_extract(self,url):
3149                 mobj = re.match(self._VALID_URL, url)
3150                 if mobj is None:
3151                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3152                         return
3153
3154                 video_id = mobj.group(1)
3155
3156                 # Get video webpage
3157                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3158                 try:
3159                         self.report_download_webpage(video_id)
3160                         webpage = urllib2.urlopen(request).read()
3161                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3162                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3163                         return
3164
3165                 self.report_extraction(video_id)
3166                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3167                                  webpage)
3168                 if mobj is None:
3169                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3170                         return
3171                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3172
3173                 mobj = re.search('<title>([^<]+)</title>', webpage)
3174                 if mobj is None:
3175                         self._downloader.trouble(u'ERROR: unable to extract title')
3176                         return
3177
3178                 video_title = mobj.group(1)
3179                 video_title = sanitize_title(video_title)
3180
3181                 simple_title = _simplify_title(video_title)
3182
3183                 try:
3184                         self._downloader.process_info({
3185                                 'id':           video_id,
3186                                 'url':          video_url,
3187                                 'uploader':     u'NA',
3188                                 'upload_date':  u'NA',
3189                                 'title':        video_title,
3190                                 'stitle':       simple_title,
3191                                 'ext':          u'flv',
3192                                 'format':       u'NA',
3193                                 'player_url':   None,
3194                         })
3195                 except UnavailableVideoError:
3196                         self._downloader.trouble(u'\nERROR: Unable to download video')
3197
3198 class ComedyCentralIE(InfoExtractor):
3199         """Information extractor for The Daily Show and Colbert Report """
3200
3201         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3202         IE_NAME = u'comedycentral'
3203
3204         def report_extraction(self, episode_id):
3205                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3206
3207         def report_config_download(self, episode_id):
3208                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3209
3210         def report_index_download(self, episode_id):
3211                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3212
3213         def report_player_url(self, episode_id):
3214                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3215
3216         def _real_extract(self, url):
3217                 mobj = re.match(self._VALID_URL, url)
3218                 if mobj is None:
3219                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3220                         return
3221
3222                 if mobj.group('shortname'):
3223                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
3224                                 url = u'http://www.thedailyshow.com/full-episodes/'
3225                         else:
3226                                 url = u'http://www.colbertnation.com/full-episodes/'
3227                         mobj = re.match(self._VALID_URL, url)
3228                         assert mobj is not None
3229
3230                 dlNewest = not mobj.group('episode')
3231                 if dlNewest:
3232                         epTitle = mobj.group('showname')
3233                 else:
3234                         epTitle = mobj.group('episode')
3235
3236                 req = urllib2.Request(url)
3237                 self.report_extraction(epTitle)
3238                 try:
3239                         htmlHandle = urllib2.urlopen(req)
3240                         html = htmlHandle.read()
3241                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3242                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3243                         return
3244                 if dlNewest:
3245                         url = htmlHandle.geturl()
3246                         mobj = re.match(self._VALID_URL, url)
3247                         if mobj is None:
3248                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3249                                 return
3250                         if mobj.group('episode') == '':
3251                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3252                                 return
3253                         epTitle = mobj.group('episode')
3254
3255                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3256                 if len(mMovieParams) == 0:
3257                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3258                         return
3259
3260                 playerUrl_raw = mMovieParams[0][0]
3261                 self.report_player_url(epTitle)
3262                 try:
3263                         urlHandle = urllib2.urlopen(playerUrl_raw)
3264                         playerUrl = urlHandle.geturl()
3265                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3266                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3267                         return
3268
3269                 uri = mMovieParams[0][1]
3270                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3271                 self.report_index_download(epTitle)
3272                 try:
3273                         indexXml = urllib2.urlopen(indexUrl).read()
3274                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3275                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3276                         return
3277
3278                 idoc = xml.etree.ElementTree.fromstring(indexXml)
3279                 itemEls = idoc.findall('.//item')
3280                 for itemEl in itemEls:
3281                         mediaId = itemEl.findall('./guid')[0].text
3282                         shortMediaId = mediaId.split(':')[-1]
3283                         showId = mediaId.split(':')[-2].replace('.com', '')
3284                         officialTitle = itemEl.findall('./title')[0].text
3285                         officialDate = itemEl.findall('./pubDate')[0].text
3286
3287                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3288                                                 urllib.urlencode({'uri': mediaId}))
3289                         configReq = urllib2.Request(configUrl)
3290                         self.report_config_download(epTitle)
3291                         try:
3292                                 configXml = urllib2.urlopen(configReq).read()
3293                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3294                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3295                                 return
3296
3297                         cdoc = xml.etree.ElementTree.fromstring(configXml)
3298                         turls = []
3299                         for rendition in cdoc.findall('.//rendition'):
3300                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3301                                 turls.append(finfo)
3302
3303                         if len(turls) == 0:
3304                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3305                                 continue
3306
3307                         # For now, just pick the highest bitrate
3308                         format,video_url = turls[-1]
3309
3310                         self._downloader.increment_downloads()
3311
3312                         effTitle = showId + u'-' + epTitle
3313                         info = {
3314                                 'id': shortMediaId,
3315                                 'url': video_url,
3316                                 'uploader': showId,
3317                                 'upload_date': officialDate,
3318                                 'title': effTitle,
3319                                 'stitle': _simplify_title(effTitle),
3320                                 'ext': 'mp4',
3321                                 'format': format,
3322                                 'thumbnail': None,
3323                                 'description': officialTitle,
3324                                 'player_url': playerUrl
3325                         }
3326
3327                         try:
3328                                 self._downloader.process_info(info)
3329                         except UnavailableVideoError, err:
3330                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3331                                 continue
3332
3333
3334 class EscapistIE(InfoExtractor):
3335         """Information extractor for The Escapist """
3336
3337         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3338         IE_NAME = u'escapist'
3339
3340         def report_extraction(self, showName):
3341                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3342
3343         def report_config_download(self, showName):
3344                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3345
3346         def _real_extract(self, url):
3347                 htmlParser = HTMLParser.HTMLParser()
3348
3349                 mobj = re.match(self._VALID_URL, url)
3350                 if mobj is None:
3351                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3352                         return
3353                 showName = mobj.group('showname')
3354                 videoId = mobj.group('episode')
3355
3356                 self.report_extraction(showName)
3357                 try:
3358                         webPage = urllib2.urlopen(url).read()
3359                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3360                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3361                         return
3362
3363                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3364                 description = htmlParser.unescape(descMatch.group(1))
3365                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3366                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3367                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3368                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3369                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3370                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3371
3372                 self.report_config_download(showName)
3373                 try:
3374                         configJSON = urllib2.urlopen(configUrl).read()
3375                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3376                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3377                         return
3378
3379                 # Technically, it's JavaScript, not JSON
3380                 configJSON = configJSON.replace("'", '"')
3381
3382                 try:
3383                         config = json.loads(configJSON)
3384                 except (ValueError,), err:
3385                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3386                         return
3387
3388                 playlist = config['playlist']
3389                 videoUrl = playlist[1]['url']
3390
3391                 self._downloader.increment_downloads()
3392                 info = {
3393                         'id': videoId,
3394                         'url': videoUrl,
3395                         'uploader': showName,
3396                         'upload_date': None,
3397                         'title': showName,
3398                         'stitle': _simplify_title(showName),
3399                         'ext': 'flv',
3400                         'format': 'flv',
3401                         'thumbnail': imgUrl,
3402                         'description': description,
3403                         'player_url': playerUrl,
3404                 }
3405
3406                 try:
3407                         self._downloader.process_info(info)
3408                 except UnavailableVideoError, err:
3409                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3410
3411
3412 class CollegeHumorIE(InfoExtractor):
3413         """Information extractor for collegehumor.com"""
3414
3415         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3416         IE_NAME = u'collegehumor'
3417
3418         def report_webpage(self, video_id):
3419                 """Report information extraction."""
3420                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3421
3422         def report_extraction(self, video_id):
3423                 """Report information extraction."""
3424                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3425
3426         def _real_extract(self, url):
3427                 htmlParser = HTMLParser.HTMLParser()
3428
3429                 mobj = re.match(self._VALID_URL, url)
3430                 if mobj is None:
3431                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3432                         return
3433                 video_id = mobj.group('videoid')
3434
3435                 self.report_webpage(video_id)
3436                 request = urllib2.Request(url)
3437                 try:
3438                         webpage = urllib2.urlopen(request).read()
3439                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3440                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3441                         return
3442
3443                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3444                 if m is None:
3445                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3446                         return
3447                 internal_video_id = m.group('internalvideoid')
3448
3449                 info = {
3450                         'id': video_id,
3451                         'internal_id': internal_video_id,
3452                 }
3453
3454                 self.report_extraction(video_id)
3455                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3456                 try:
3457                         metaXml = urllib2.urlopen(xmlUrl).read()
3458                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3459                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3460                         return
3461
3462                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3463                 try:
3464                         videoNode = mdoc.findall('./video')[0]
3465                         info['description'] = videoNode.findall('./description')[0].text
3466                         info['title'] = videoNode.findall('./caption')[0].text
3467                         info['stitle'] = _simplify_title(info['title'])
3468                         info['url'] = videoNode.findall('./file')[0].text
3469                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3470                         info['ext'] = info['url'].rpartition('.')[2]
3471                         info['format'] = info['ext']
3472                 except IndexError:
3473                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3474                         return
3475
3476                 self._downloader.increment_downloads()
3477
3478                 try:
3479                         self._downloader.process_info(info)
3480                 except UnavailableVideoError, err:
3481                         self._downloader.trouble(u'\nERROR: unable to download video')
3482
3483
3484 class XVideosIE(InfoExtractor):
3485         """Information extractor for xvideos.com"""
3486
3487         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3488         IE_NAME = u'xvideos'
3489
3490         def report_webpage(self, video_id):
3491                 """Report information extraction."""
3492                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3493
3494         def report_extraction(self, video_id):
3495                 """Report information extraction."""
3496                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3497
3498         def _real_extract(self, url):
3499                 htmlParser = HTMLParser.HTMLParser()
3500
3501                 mobj = re.match(self._VALID_URL, url)
3502                 if mobj is None:
3503                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3504                         return
3505                 video_id = mobj.group(1).decode('utf-8')
3506
3507                 self.report_webpage(video_id)
3508
3509                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3510                 try:
3511                         webpage = urllib2.urlopen(request).read()
3512                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3513                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3514                         return
3515
3516                 self.report_extraction(video_id)
3517
3518
3519                 # Extract video URL
3520                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3521                 if mobj is None:
3522                         self._downloader.trouble(u'ERROR: unable to extract video url')
3523                         return
3524                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3525
3526
3527                 # Extract title
3528                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3529                 if mobj is None:
3530                         self._downloader.trouble(u'ERROR: unable to extract video title')
3531                         return
3532                 video_title = mobj.group(1).decode('utf-8')
3533
3534
3535                 # Extract video thumbnail
3536                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3537                 if mobj is None:
3538                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3539                         return
3540                 video_thumbnail = mobj.group(1).decode('utf-8')
3541
3542
3543
3544                 self._downloader.increment_downloads()
3545                 info = {
3546                         'id': video_id,
3547                         'url': video_url,
3548                         'uploader': None,
3549                         'upload_date': None,
3550                         'title': video_title,
3551                         'stitle': _simplify_title(video_title),
3552                         'ext': 'flv',
3553                         'format': 'flv',
3554                         'thumbnail': video_thumbnail,
3555                         'description': None,
3556                         'player_url': None,
3557                 }
3558
3559                 try:
3560                         self._downloader.process_info(info)
3561                 except UnavailableVideoError, err:
3562                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3563
3564
3565 class SoundcloudIE(InfoExtractor):
3566         """Information extractor for soundcloud.com
3567            To access the media, the uid of the song and a stream token
3568            must be extracted from the page source and the script must make
3569            a request to media.soundcloud.com/crossdomain.xml. Then
3570            the media can be grabbed by requesting from an url composed
3571            of the stream token and uid
3572          """
3573
3574         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3575         IE_NAME = u'soundcloud'
3576
3577         def __init__(self, downloader=None):
3578                 InfoExtractor.__init__(self, downloader)
3579
3580         def report_webpage(self, video_id):
3581                 """Report information extraction."""
3582                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3583
3584         def report_extraction(self, video_id):
3585                 """Report information extraction."""
3586                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3587
3588         def _real_extract(self, url):
3589                 htmlParser = HTMLParser.HTMLParser()
3590
3591                 mobj = re.match(self._VALID_URL, url)
3592                 if mobj is None:
3593                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3594                         return
3595
3596                 # extract uploader (which is in the url)
3597                 uploader = mobj.group(1).decode('utf-8')
3598                 # extract simple title (uploader + slug of song title)
3599                 slug_title =  mobj.group(2).decode('utf-8')
3600                 simple_title = uploader + '-' + slug_title
3601
3602                 self.report_webpage('%s/%s' % (uploader, slug_title))
3603
3604                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3605                 try:
3606                         webpage = urllib2.urlopen(request).read()
3607                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3608                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3609                         return
3610
3611                 self.report_extraction('%s/%s' % (uploader, slug_title))
3612
3613                 # extract uid and stream token that soundcloud hands out for access
3614                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3615                 if mobj:
3616                         video_id = mobj.group(1)
3617                         stream_token = mobj.group(2)
3618
3619                 # extract unsimplified title
3620                 mobj = re.search('"title":"(.*?)",', webpage)
3621                 if mobj:
3622                         title = mobj.group(1)
3623
3624                 # construct media url (with uid/token)
3625                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3626                 mediaURL = mediaURL % (video_id, stream_token)
3627
3628                 # description
3629                 description = u'No description available'
3630                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3631                 if mobj:
3632                         description = mobj.group(1)
3633
3634                 # upload date
3635                 upload_date = None
3636                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3637                 if mobj:
3638                         try:
3639                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3640                         except Exception, e:
3641                                 print str(e)
3642
3643                 # for soundcloud, a request to a cross domain is required for cookies
3644                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3645
3646                 try:
3647                         self._downloader.process_info({
3648                                 'id':           video_id.decode('utf-8'),
3649                                 'url':          mediaURL,
3650                                 'uploader':     uploader.decode('utf-8'),
3651                                 'upload_date':  upload_date,
3652                                 'title':        simple_title.decode('utf-8'),
3653                                 'stitle':       simple_title.decode('utf-8'),
3654                                 'ext':          u'mp3',
3655                                 'format':       u'NA',
3656                                 'player_url':   None,
3657                                 'description': description.decode('utf-8')
3658                         })
3659                 except UnavailableVideoError:
3660                         self._downloader.trouble(u'\nERROR: unable to download video')
3661
3662
3663 class InfoQIE(InfoExtractor):
3664         """Information extractor for infoq.com"""
3665
3666         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3667         IE_NAME = u'infoq'
3668
3669         def report_webpage(self, video_id):
3670                 """Report information extraction."""
3671                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3672
3673         def report_extraction(self, video_id):
3674                 """Report information extraction."""
3675                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3676
3677         def _real_extract(self, url):
3678                 htmlParser = HTMLParser.HTMLParser()
3679
3680                 mobj = re.match(self._VALID_URL, url)
3681                 if mobj is None:
3682                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3683                         return
3684
3685                 self.report_webpage(url)
3686
3687                 request = urllib2.Request(url)
3688                 try:
3689                         webpage = urllib2.urlopen(request).read()
3690                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3691                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3692                         return
3693
3694                 self.report_extraction(url)
3695
3696
3697                 # Extract video URL
3698                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3699                 if mobj is None:
3700                         self._downloader.trouble(u'ERROR: unable to extract video url')
3701                         return
3702                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3703
3704
3705                 # Extract title
3706                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3707                 if mobj is None:
3708                         self._downloader.trouble(u'ERROR: unable to extract video title')
3709                         return
3710                 video_title = mobj.group(1).decode('utf-8')
3711
3712                 # Extract description
3713                 video_description = u'No description available.'
3714                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3715                 if mobj is not None:
3716                         video_description = mobj.group(1).decode('utf-8')
3717
3718                 video_filename = video_url.split('/')[-1]
3719                 video_id, extension = video_filename.split('.')
3720
3721                 self._downloader.increment_downloads()
3722                 info = {
3723                         'id': video_id,
3724                         'url': video_url,
3725                         'uploader': None,
3726                         'upload_date': None,
3727                         'title': video_title,
3728                         'stitle': _simplify_title(video_title),
3729                         'ext': extension,
3730                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3731                         'thumbnail': None,
3732                         'description': video_description,
3733                         'player_url': None,
3734                 }
3735
3736                 try:
3737                         self._downloader.process_info(info)
3738                 except UnavailableVideoError, err:
3739                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3740
3741 class MixcloudIE(InfoExtractor):
3742         """Information extractor for www.mixcloud.com"""
3743         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3744         IE_NAME = u'mixcloud'
3745
3746         def __init__(self, downloader=None):
3747                 InfoExtractor.__init__(self, downloader)
3748
3749         def report_download_json(self, file_id):
3750                 """Report JSON download."""
3751                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3752
3753         def report_extraction(self, file_id):
3754                 """Report information extraction."""
3755                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3756
3757         def get_urls(self, jsonData, fmt, bitrate='best'):
3758                 """Get urls from 'audio_formats' section in json"""
3759                 file_url = None
3760                 try:
3761                         bitrate_list = jsonData[fmt]
3762                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3763                                 bitrate = max(bitrate_list) # select highest
3764
3765                         url_list = jsonData[fmt][bitrate]
3766                 except TypeError: # we have no bitrate info.
3767                         url_list = jsonData[fmt]
3768
3769                 return url_list
3770
3771         def check_urls(self, url_list):
3772                 """Returns 1st active url from list"""
3773                 for url in url_list:
3774                         try:
3775                                 urllib2.urlopen(url)
3776                                 return url
3777                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3778                                 url = None
3779
3780                 return None
3781
3782         def _print_formats(self, formats):
3783                 print 'Available formats:'
3784                 for fmt in formats.keys():
3785                         for b in formats[fmt]:
3786                                 try:
3787                                         ext = formats[fmt][b][0]
3788                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3789                                 except TypeError: # we have no bitrate info
3790                                         ext = formats[fmt][0]
3791                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3792                                         break
3793
3794         def _real_extract(self, url):
3795                 mobj = re.match(self._VALID_URL, url)
3796                 if mobj is None:
3797                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3798                         return
3799                 # extract uploader & filename from url
3800                 uploader = mobj.group(1).decode('utf-8')
3801                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3802
3803                 # construct API request
3804                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3805                 # retrieve .json file with links to files
3806                 request = urllib2.Request(file_url)
3807                 try:
3808                         self.report_download_json(file_url)
3809                         jsonData = urllib2.urlopen(request).read()
3810                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3811                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3812                         return
3813
3814                 # parse JSON
3815                 json_data = json.loads(jsonData)
3816                 player_url = json_data['player_swf_url']
3817                 formats = dict(json_data['audio_formats'])
3818
3819                 req_format = self._downloader.params.get('format', None)
3820                 bitrate = None
3821
3822                 if self._downloader.params.get('listformats', None):
3823                         self._print_formats(formats)
3824                         return
3825
3826                 if req_format is None or req_format == 'best':
3827                         for format_param in formats.keys():
3828                                 url_list = self.get_urls(formats, format_param)
3829                                 # check urls
3830                                 file_url = self.check_urls(url_list)
3831                                 if file_url is not None:
3832                                         break # got it!
3833                 else:
3834                         if req_format not in formats.keys():
3835                                 self._downloader.trouble(u'ERROR: format is not available')
3836                                 return
3837
3838                         url_list = self.get_urls(formats, req_format)
3839                         file_url = self.check_urls(url_list)
3840                         format_param = req_format
3841
3842                 # We have audio
3843                 self._downloader.increment_downloads()
3844                 try:
3845                         # Process file information
3846                         self._downloader.process_info({
3847                                 'id': file_id.decode('utf-8'),
3848                                 'url': file_url.decode('utf-8'),
3849                                 'uploader':     uploader.decode('utf-8'),
3850                                 'upload_date': u'NA',
3851                                 'title': json_data['name'],
3852                                 'stitle': _simplify_title(json_data['name']),
3853                                 'ext': file_url.split('.')[-1].decode('utf-8'),
3854                                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3855                                 'thumbnail': json_data['thumbnail_url'],
3856                                 'description': json_data['description'],
3857                                 'player_url': player_url.decode('utf-8'),
3858                         })
3859                 except UnavailableVideoError, err:
3860                         self._downloader.trouble(u'ERROR: unable to download file')
3861
3862 class StanfordOpenClassroomIE(InfoExtractor):
3863         """Information extractor for Stanford's Open ClassRoom"""
3864
3865         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3866         IE_NAME = u'stanfordoc'
3867
3868         def report_download_webpage(self, objid):
3869                 """Report information extraction."""
3870                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3871
3872         def report_extraction(self, video_id):
3873                 """Report information extraction."""
3874                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3875
3876         def _real_extract(self, url):
3877                 mobj = re.match(self._VALID_URL, url)
3878                 if mobj is None:
3879                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3880                         return
3881
3882                 if mobj.group('course') and mobj.group('video'): # A specific video
3883                         course = mobj.group('course')
3884                         video = mobj.group('video')
3885                         info = {
3886                                 'id': _simplify_title(course + '_' + video),
3887                         }
3888
3889                         self.report_extraction(info['id'])
3890                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3891                         xmlUrl = baseUrl + video + '.xml'
3892                         try:
3893                                 metaXml = urllib2.urlopen(xmlUrl).read()
3894                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3895                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3896                                 return
3897                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
3898                         try:
3899                                 info['title'] = mdoc.findall('./title')[0].text
3900                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3901                         except IndexError:
3902                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3903                                 return
3904                         info['stitle'] = _simplify_title(info['title'])
3905                         info['ext'] = info['url'].rpartition('.')[2]
3906                         info['format'] = info['ext']
3907                         self._downloader.increment_downloads()
3908                         try:
3909                                 self._downloader.process_info(info)
3910                         except UnavailableVideoError, err:
3911                                 self._downloader.trouble(u'\nERROR: unable to download video')
3912                 elif mobj.group('course'): # A course page
3913                         unescapeHTML = HTMLParser.HTMLParser().unescape
3914
3915                         course = mobj.group('course')
3916                         info = {
3917                                 'id': _simplify_title(course),
3918                                 'type': 'playlist',
3919                         }
3920
3921                         self.report_download_webpage(info['id'])
3922                         try:
3923                                 coursepage = urllib2.urlopen(url).read()
3924                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3925                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3926                                 return
3927
3928                         m = re.search('<h1>([^<]+)</h1>', coursepage)
3929                         if m:
3930                                 info['title'] = unescapeHTML(m.group(1))
3931                         else:
3932                                 info['title'] = info['id']
3933                         info['stitle'] = _simplify_title(info['title'])
3934
3935                         m = re.search('<description>([^<]+)</description>', coursepage)
3936                         if m:
3937                                 info['description'] = unescapeHTML(m.group(1))
3938
3939                         links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3940                         info['list'] = [
3941                                 {
3942                                         'type': 'reference',
3943                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3944                                 }
3945                                         for vpage in links]
3946
3947                         for entry in info['list']:
3948                                 assert entry['type'] == 'reference'
3949                                 self.extract(entry['url'])
3950                 else: # Root page
3951                         unescapeHTML = HTMLParser.HTMLParser().unescape
3952
3953                         info = {
3954                                 'id': 'Stanford OpenClassroom',
3955                                 'type': 'playlist',
3956                         }
3957
3958                         self.report_download_webpage(info['id'])
3959                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3960                         try:
3961                                 rootpage = urllib2.urlopen(rootURL).read()
3962                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3963                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3964                                 return
3965
3966                         info['title'] = info['id']
3967                         info['stitle'] = _simplify_title(info['title'])
3968
3969                         links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3970                         info['list'] = [
3971                                 {
3972                                         'type': 'reference',
3973                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3974                                 }
3975                                         for cpage in links]
3976
3977                         for entry in info['list']:
3978                                 assert entry['type'] == 'reference'
3979                                 self.extract(entry['url'])
3980
3981 class MTVIE(InfoExtractor):
3982         """Information extractor for MTV.com"""
3983
3984         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3985         IE_NAME = u'mtv'
3986
3987         def report_webpage(self, video_id):
3988                 """Report information extraction."""
3989                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3990
3991         def report_extraction(self, video_id):
3992                 """Report information extraction."""
3993                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3994
3995         def _real_extract(self, url):
3996                 mobj = re.match(self._VALID_URL, url)
3997                 if mobj is None:
3998                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3999                         return
4000                 if not mobj.group('proto'):
4001                         url = 'http://' + url
4002                 video_id = mobj.group('videoid')
4003                 self.report_webpage(video_id)
4004
4005                 request = urllib2.Request(url)
4006                 try:
4007                         webpage = urllib2.urlopen(request).read()
4008                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4009                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4010                         return
4011
4012                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4013                 if mobj is None:
4014                         self._downloader.trouble(u'ERROR: unable to extract song name')
4015                         return
4016                 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4017                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4018                 if mobj is None:
4019                         self._downloader.trouble(u'ERROR: unable to extract performer')
4020                         return
4021                 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4022                 video_title = performer + ' - ' + song_name
4023
4024                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4025                 if mobj is None:
4026                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4027                         return
4028                 mtvn_uri = mobj.group(1)
4029
4030                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4031                 if mobj is None:
4032                         self._downloader.trouble(u'ERROR: unable to extract content id')
4033                         return
4034                 content_id = mobj.group(1)
4035
4036                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4037                 self.report_extraction(video_id)
4038                 request = urllib2.Request(videogen_url)
4039                 try:
4040                         metadataXml = urllib2.urlopen(request).read()
4041                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4042                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4043                         return
4044
4045                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4046                 renditions = mdoc.findall('.//rendition')
4047
4048                 # For now, always pick the highest quality.
4049                 rendition = renditions[-1]
4050
4051                 try:
4052                         _,_,ext = rendition.attrib['type'].partition('/')
4053                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4054                         video_url = rendition.find('./src').text
4055                 except KeyError:
4056                         self._downloader.trouble('Invalid rendition field.')
4057                         return
4058
4059                 self._downloader.increment_downloads()
4060                 info = {
4061                         'id': video_id,
4062                         'url': video_url,
4063                         'uploader': performer,
4064                         'title': video_title,
4065                         'stitle': _simplify_title(video_title),
4066                         'ext': ext,
4067                         'format': format,
4068                 }
4069
4070                 try:
4071                         self._downloader.process_info(info)
4072                 except UnavailableVideoError, err:
4073                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4074
4075
4076 class PostProcessor(object):
4077         """Post Processor class.
4078
4079         PostProcessor objects can be added to downloaders with their
4080         add_post_processor() method. When the downloader has finished a
4081         successful download, it will take its internal chain of PostProcessors
4082         and start calling the run() method on each one of them, first with
4083         an initial argument and then with the returned value of the previous
4084         PostProcessor.
4085
4086         The chain will be stopped if one of them ever returns None or the end
4087         of the chain is reached.
4088
4089         PostProcessor objects follow a "mutual registration" process similar
4090         to InfoExtractor objects.
4091         """
4092
4093         _downloader = None
4094
4095         def __init__(self, downloader=None):
4096                 self._downloader = downloader
4097
4098         def set_downloader(self, downloader):
4099                 """Sets the downloader for this PP."""
4100                 self._downloader = downloader
4101
4102         def run(self, information):
4103                 """Run the PostProcessor.
4104
4105                 The "information" argument is a dictionary like the ones
4106                 composed by InfoExtractors. The only difference is that this
4107                 one has an extra field called "filepath" that points to the
4108                 downloaded file.
4109
4110                 When this method returns None, the postprocessing chain is
4111                 stopped. However, this method may return an information
4112                 dictionary that will be passed to the next postprocessing
4113                 object in the chain. It can be the one it received after
4114                 changing some fields.
4115
4116                 In addition, this method may raise a PostProcessingError
4117                 exception that will be taken into account by the downloader
4118                 it was called from.
4119                 """
4120                 return information # by default, do nothing
4121
4122 class AudioConversionError(BaseException):
4123         def __init__(self, message):
4124                 self.message = message
4125
4126 class FFmpegExtractAudioPP(PostProcessor):
4127
4128         def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4129                 PostProcessor.__init__(self, downloader)
4130                 if preferredcodec is None:
4131                         preferredcodec = 'best'
4132                 self._preferredcodec = preferredcodec
4133                 self._preferredquality = preferredquality
4134                 self._keepvideo = keepvideo
4135
4136         @staticmethod
4137         def get_audio_codec(path):
4138                 try:
4139                         cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4140                         handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4141                         output = handle.communicate()[0]
4142                         if handle.wait() != 0:
4143                                 return None
4144                 except (IOError, OSError):
4145                         return None
4146                 audio_codec = None
4147                 for line in output.split('\n'):
4148                         if line.startswith('codec_name='):
4149                                 audio_codec = line.split('=')[1].strip()
4150                         elif line.strip() == 'codec_type=audio' and audio_codec is not None:
4151                                 return audio_codec
4152                 return None
4153
4154         @staticmethod
4155         def run_ffmpeg(path, out_path, codec, more_opts):
4156                 if codec is None:
4157                         acodec_opts = []
4158                 else:
4159                         acodec_opts = ['-acodec', codec]
4160                 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4161                 try:
4162                         p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4163                         stdout,stderr = p.communicate()
4164                 except (IOError, OSError):
4165                         e = sys.exc_info()[1]
4166                         if isinstance(e, OSError) and e.errno == 2:
4167                                 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
4168                         else:
4169                                 raise e
4170                 if p.returncode != 0:
4171                         msg = stderr.strip().split('\n')[-1]
4172                         raise AudioConversionError(msg)
4173
4174         def run(self, information):
4175                 path = information['filepath']
4176
4177                 filecodec = self.get_audio_codec(path)
4178                 if filecodec is None:
4179                         self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
4180                         return None
4181
4182                 more_opts = []
4183                 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4184                         if self._preferredcodec == 'm4a' and filecodec == 'aac':
4185                                 # Lossless, but in another container
4186                                 acodec = 'copy'
4187                                 extension = self._preferredcodec
4188                                 more_opts = ['-absf', 'aac_adtstoasc']
4189                         elif filecodec in ['aac', 'mp3', 'vorbis']:
4190                                 # Lossless if possible
4191                                 acodec = 'copy'
4192                                 extension = filecodec
4193                                 if filecodec == 'aac':
4194                                         more_opts = ['-f', 'adts']
4195                                 if filecodec == 'vorbis':
4196                                         extension = 'ogg'
4197                         else:
4198                                 # MP3 otherwise.
4199                                 acodec = 'libmp3lame'
4200                                 extension = 'mp3'
4201                                 more_opts = []
4202                                 if self._preferredquality is not None:
4203                                         more_opts += ['-ab', self._preferredquality]
4204                 else:
4205                         # We convert the audio (lossy)
4206                         acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4207                         extension = self._preferredcodec
4208                         more_opts = []
4209                         if self._preferredquality is not None:
4210                                 more_opts += ['-ab', self._preferredquality]
4211                         if self._preferredcodec == 'aac':
4212                                 more_opts += ['-f', 'adts']
4213                         if self._preferredcodec == 'm4a':
4214                                 more_opts += ['-absf', 'aac_adtstoasc']
4215                         if self._preferredcodec == 'vorbis':
4216                                 extension = 'ogg'
4217                         if self._preferredcodec == 'wav':
4218                                 extension = 'wav'
4219                                 more_opts += ['-f', 'wav']
4220
4221                 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4222                 new_path = prefix + sep + extension
4223                 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4224                 try:
4225                         self.run_ffmpeg(path, new_path, acodec, more_opts)
4226                 except:
4227                         etype,e,tb = sys.exc_info()
4228                         if isinstance(e, AudioConversionError):
4229                                 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4230                         else:
4231                                 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4232                         return None
4233
4234                 # Try to update the date time for extracted audio file.
4235                 if information.get('filetime') is not None:
4236                         try:
4237                                 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4238                         except:
4239                                 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4240
4241                 if not self._keepvideo:
4242                         try:
4243                                 os.remove(_encodeFilename(path))
4244                         except (IOError, OSError):
4245                                 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
4246                                 return None
4247
4248                 information['filepath'] = new_path
4249                 return information
4250
4251
4252 def updateSelf(downloader, filename):
4253         ''' Update the program file with the latest version from the repository '''
4254         # Note: downloader only used for options
4255         if not os.access(filename, os.W_OK):
4256                 sys.exit('ERROR: no write permissions on %s' % filename)
4257
4258         downloader.to_screen(u'Updating to latest version...')
4259
4260         try:
4261                 try:
4262                         urlh = urllib.urlopen(UPDATE_URL)
4263                         newcontent = urlh.read()
4264
4265                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
4266                         if vmatch is not None and vmatch.group(1) == __version__:
4267                                 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4268                                 return
4269                 finally:
4270                         urlh.close()
4271         except (IOError, OSError), err:
4272                 sys.exit('ERROR: unable to download latest version')
4273
4274         try:
4275                 outf = open(filename, 'wb')
4276                 try:
4277                         outf.write(newcontent)
4278                 finally:
4279                         outf.close()
4280         except (IOError, OSError), err:
4281                 sys.exit('ERROR: unable to overwrite current version')
4282
4283         downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4284
4285 def parseOpts():
4286         def _readOptions(filename_bytes):
4287                 try:
4288                         optionf = open(filename_bytes)
4289                 except IOError:
4290                         return [] # silently skip if file is not present
4291                 try:
4292                         res = []
4293                         for l in optionf:
4294                                 res += shlex.split(l, comments=True)
4295                 finally:
4296                         optionf.close()
4297                 return res
4298
4299         def _format_option_string(option):
4300                 ''' ('-o', '--option') -> -o, --format METAVAR'''
4301
4302                 opts = []
4303
4304                 if option._short_opts: opts.append(option._short_opts[0])
4305                 if option._long_opts: opts.append(option._long_opts[0])
4306                 if len(opts) > 1: opts.insert(1, ', ')
4307
4308                 if option.takes_value(): opts.append(' %s' % option.metavar)
4309
4310                 return "".join(opts)
4311
4312         def _find_term_columns():
4313                 columns = os.environ.get('COLUMNS', None)
4314                 if columns:
4315                         return int(columns)
4316
4317                 try:
4318                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4319                         out,err = sp.communicate()
4320                         return int(out.split()[1])
4321                 except:
4322                         pass
4323                 return None
4324
4325         max_width = 80
4326         max_help_position = 80
4327
4328         # No need to wrap help messages if we're on a wide console
4329         columns = _find_term_columns()
4330         if columns: max_width = columns
4331
4332         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4333         fmt.format_option_strings = _format_option_string
4334
4335         kw = {
4336                 'version'   : __version__,
4337                 'formatter' : fmt,
4338                 'usage' : '%prog [options] url [url...]',
4339                 'conflict_handler' : 'resolve',
4340         }
4341
4342         parser = optparse.OptionParser(**kw)
4343
4344         # option groups
4345         general        = optparse.OptionGroup(parser, 'General Options')
4346         selection      = optparse.OptionGroup(parser, 'Video Selection')
4347         authentication = optparse.OptionGroup(parser, 'Authentication Options')
4348         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
4349         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
4350         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
4351         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4352
4353         general.add_option('-h', '--help',
4354                         action='help', help='print this help text and exit')
4355         general.add_option('-v', '--version',
4356                         action='version', help='print program version and exit')
4357         general.add_option('-U', '--update',
4358                         action='store_true', dest='update_self', help='update this program to latest version')
4359         general.add_option('-i', '--ignore-errors',
4360                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4361         general.add_option('-r', '--rate-limit',
4362                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4363         general.add_option('-R', '--retries',
4364                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4365         general.add_option('--dump-user-agent',
4366                         action='store_true', dest='dump_user_agent',
4367                         help='display the current browser identification', default=False)
4368         general.add_option('--list-extractors',
4369                         action='store_true', dest='list_extractors',
4370                         help='List all supported extractors and the URLs they would handle', default=False)
4371
4372         selection.add_option('--playlist-start',
4373                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4374         selection.add_option('--playlist-end',
4375                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4376         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4377         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4378         selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4379
4380         authentication.add_option('-u', '--username',
4381                         dest='username', metavar='USERNAME', help='account username')
4382         authentication.add_option('-p', '--password',
4383                         dest='password', metavar='PASSWORD', help='account password')
4384         authentication.add_option('-n', '--netrc',
4385                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4386
4387
4388         video_format.add_option('-f', '--format',
4389                         action='store', dest='format', metavar='FORMAT', help='video format code')
4390         video_format.add_option('--all-formats',
4391                         action='store_const', dest='format', help='download all available video formats', const='all')
4392         video_format.add_option('--prefer-free-formats',
4393                         action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4394         video_format.add_option('--max-quality',
4395                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4396         video_format.add_option('-F', '--list-formats',
4397                         action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4398         video_format.add_option('--write-srt',
4399                         action='store_true', dest='writesubtitles',
4400                         help='write video closed captions to a .srt file (currently youtube only)', default=False)
4401         video_format.add_option('--srt-lang',
4402                         action='store', dest='subtitleslang', metavar='LANG',
4403                         help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4404
4405
4406         verbosity.add_option('-q', '--quiet',
4407                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
4408         verbosity.add_option('-s', '--simulate',
4409                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4410         verbosity.add_option('--skip-download',
4411                         action='store_true', dest='skip_download', help='do not download the video', default=False)
4412         verbosity.add_option('-g', '--get-url',
4413                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4414         verbosity.add_option('-e', '--get-title',
4415                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4416         verbosity.add_option('--get-thumbnail',
4417                         action='store_true', dest='getthumbnail',
4418                         help='simulate, quiet but print thumbnail URL', default=False)
4419         verbosity.add_option('--get-description',
4420                         action='store_true', dest='getdescription',
4421                         help='simulate, quiet but print video description', default=False)
4422         verbosity.add_option('--get-filename',
4423                         action='store_true', dest='getfilename',
4424                         help='simulate, quiet but print output filename', default=False)
4425         verbosity.add_option('--get-format',
4426                         action='store_true', dest='getformat',
4427                         help='simulate, quiet but print output format', default=False)
4428         verbosity.add_option('--no-progress',
4429                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4430         verbosity.add_option('--console-title',
4431                         action='store_true', dest='consoletitle',
4432                         help='display progress in console titlebar', default=False)
4433         verbosity.add_option('-v', '--verbose',
4434                         action='store_true', dest='verbose', help='print various debugging information', default=False)
4435
4436
4437         filesystem.add_option('-t', '--title',
4438                         action='store_true', dest='usetitle', help='use title in file name', default=False)
4439         filesystem.add_option('-l', '--literal',
4440                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4441         filesystem.add_option('-A', '--auto-number',
4442                         action='store_true', dest='autonumber',
4443                         help='number downloaded files starting from 00000', default=False)
4444         filesystem.add_option('-o', '--output',
4445                         dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4446         filesystem.add_option('-a', '--batch-file',
4447                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4448         filesystem.add_option('-w', '--no-overwrites',
4449                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4450         filesystem.add_option('-c', '--continue',
4451                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4452         filesystem.add_option('--no-continue',
4453                         action='store_false', dest='continue_dl',
4454                         help='do not resume partially downloaded files (restart from beginning)')
4455         filesystem.add_option('--cookies',
4456                         dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4457         filesystem.add_option('--no-part',
4458                         action='store_true', dest='nopart', help='do not use .part files', default=False)
4459         filesystem.add_option('--no-mtime',
4460                         action='store_false', dest='updatetime',
4461                         help='do not use the Last-modified header to set the file modification time', default=True)
4462         filesystem.add_option('--write-description',
4463                         action='store_true', dest='writedescription',
4464                         help='write video description to a .description file', default=False)
4465         filesystem.add_option('--write-info-json',
4466                         action='store_true', dest='writeinfojson',
4467                         help='write video metadata to a .info.json file', default=False)
4468
4469
4470         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4471                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4472         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4473                         help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4474         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4475                         help='ffmpeg audio bitrate specification, 128k by default')
4476         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4477                         help='keeps the video file on disk after the post-processing; the video is erased by default')
4478
4479
4480         parser.add_option_group(general)
4481         parser.add_option_group(selection)
4482         parser.add_option_group(filesystem)
4483         parser.add_option_group(verbosity)
4484         parser.add_option_group(video_format)
4485         parser.add_option_group(authentication)
4486         parser.add_option_group(postproc)
4487
4488         xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4489         if xdg_config_home:
4490                 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4491         else:
4492                 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4493         argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4494         opts, args = parser.parse_args(argv)
4495
4496         return parser, opts, args
4497
4498 def gen_extractors():
4499         """ Return a list of an instance of every supported extractor.
4500         The order does matter; the first extractor matched is the one handling the URL.
4501         """
4502         youtube_ie = YoutubeIE()
4503         google_ie = GoogleIE()
4504         yahoo_ie = YahooIE()
4505         return [
4506                 YoutubePlaylistIE(youtube_ie),
4507                 YoutubeUserIE(youtube_ie),
4508                 YoutubeSearchIE(youtube_ie),
4509                 youtube_ie,
4510                 MetacafeIE(youtube_ie),
4511                 DailymotionIE(),
4512                 google_ie,
4513                 GoogleSearchIE(google_ie),
4514                 PhotobucketIE(),
4515                 yahoo_ie,
4516                 YahooSearchIE(yahoo_ie),
4517                 DepositFilesIE(),
4518                 FacebookIE(),
4519                 BlipTVIE(),
4520                 VimeoIE(),
4521                 MyVideoIE(),
4522                 ComedyCentralIE(),
4523                 EscapistIE(),
4524                 CollegeHumorIE(),
4525                 XVideosIE(),
4526                 SoundcloudIE(),
4527                 InfoQIE(),
4528                 MixcloudIE(),
4529                 StanfordOpenClassroomIE(),
4530                 MTVIE(),
4531
4532                 GenericIE()
4533         ]
4534
4535 def _real_main():
4536         parser, opts, args = parseOpts()
4537
4538         # Open appropriate CookieJar
4539         if opts.cookiefile is None:
4540                 jar = cookielib.CookieJar()
4541         else:
4542                 try:
4543                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4544                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4545                                 jar.load()
4546                 except (IOError, OSError), err:
4547                         sys.exit(u'ERROR: unable to open cookie file')
4548
4549         # Dump user agent
4550         if opts.dump_user_agent:
4551                 print std_headers['User-Agent']
4552                 sys.exit(0)
4553
4554         # Batch file verification
4555         batchurls = []
4556         if opts.batchfile is not None:
4557                 try:
4558                         if opts.batchfile == '-':
4559                                 batchfd = sys.stdin
4560                         else:
4561                                 batchfd = open(opts.batchfile, 'r')
4562                         batchurls = batchfd.readlines()
4563                         batchurls = [x.strip() for x in batchurls]
4564                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4565                 except IOError:
4566                         sys.exit(u'ERROR: batch file could not be read')
4567         all_urls = batchurls + args
4568
4569         # General configuration
4570         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4571         proxy_handler = urllib2.ProxyHandler()
4572         opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4573         urllib2.install_opener(opener)
4574         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4575
4576         if opts.verbose:
4577                 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4578
4579         extractors = gen_extractors()
4580
4581         if opts.list_extractors:
4582                 for ie in extractors:
4583                         print(ie.IE_NAME)
4584                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4585                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4586                         for mu in matchedUrls:
4587                                 print(u'  ' + mu)
4588                 sys.exit(0)
4589
4590         # Conflicting, missing and erroneous options
4591         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4592                 parser.error(u'using .netrc conflicts with giving username/password')
4593         if opts.password is not None and opts.username is None:
4594                 parser.error(u'account username missing')
4595         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4596                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4597         if opts.usetitle and opts.useliteral:
4598                 parser.error(u'using title conflicts with using literal title')
4599         if opts.username is not None and opts.password is None:
4600                 opts.password = getpass.getpass(u'Type account password and press return:')
4601         if opts.ratelimit is not None:
4602                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4603                 if numeric_limit is None:
4604                         parser.error(u'invalid rate limit specified')
4605                 opts.ratelimit = numeric_limit
4606         if opts.retries is not None:
4607                 try:
4608                         opts.retries = long(opts.retries)
4609                 except (TypeError, ValueError), err:
4610                         parser.error(u'invalid retry count specified')
4611         try:
4612                 opts.playliststart = int(opts.playliststart)
4613                 if opts.playliststart <= 0:
4614                         raise ValueError(u'Playlist start must be positive')
4615         except (TypeError, ValueError), err:
4616                 parser.error(u'invalid playlist start number specified')
4617         try:
4618                 opts.playlistend = int(opts.playlistend)
4619                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4620                         raise ValueError(u'Playlist end must be greater than playlist start')
4621         except (TypeError, ValueError), err:
4622                 parser.error(u'invalid playlist end number specified')
4623         if opts.extractaudio:
4624                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4625                         parser.error(u'invalid audio format specified')
4626
4627         # File downloader
4628         fd = FileDownloader({
4629                 'usenetrc': opts.usenetrc,
4630                 'username': opts.username,
4631                 'password': opts.password,
4632                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4633                 'forceurl': opts.geturl,
4634                 'forcetitle': opts.gettitle,
4635                 'forcethumbnail': opts.getthumbnail,
4636                 'forcedescription': opts.getdescription,
4637                 'forcefilename': opts.getfilename,
4638                 'forceformat': opts.getformat,
4639                 'simulate': opts.simulate,
4640                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4641                 'format': opts.format,
4642                 'format_limit': opts.format_limit,
4643                 'listformats': opts.listformats,
4644                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4645                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4646                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4647                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4648                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4649                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4650                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4651                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4652                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4653                         or u'%(id)s.%(ext)s'),
4654                 'ignoreerrors': opts.ignoreerrors,
4655                 'ratelimit': opts.ratelimit,
4656                 'nooverwrites': opts.nooverwrites,
4657                 'retries': opts.retries,
4658                 'continuedl': opts.continue_dl,
4659                 'noprogress': opts.noprogress,
4660                 'playliststart': opts.playliststart,
4661                 'playlistend': opts.playlistend,
4662                 'logtostderr': opts.outtmpl == '-',
4663                 'consoletitle': opts.consoletitle,
4664                 'nopart': opts.nopart,
4665                 'updatetime': opts.updatetime,
4666                 'writedescription': opts.writedescription,
4667                 'writeinfojson': opts.writeinfojson,
4668                 'writesubtitles': opts.writesubtitles,
4669                 'subtitleslang': opts.subtitleslang,
4670                 'matchtitle': opts.matchtitle,
4671                 'rejecttitle': opts.rejecttitle,
4672                 'max_downloads': opts.max_downloads,
4673                 'prefer_free_formats': opts.prefer_free_formats,
4674                 'verbose': opts.verbose,
4675                 })
4676         for extractor in extractors:
4677                 fd.add_info_extractor(extractor)
4678
4679         # PostProcessors
4680         if opts.extractaudio:
4681                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4682
4683         # Update version
4684         if opts.update_self:
4685                 updateSelf(fd, sys.argv[0])
4686
4687         # Maybe do nothing
4688         if len(all_urls) < 1:
4689                 if not opts.update_self:
4690                         parser.error(u'you must provide at least one URL')
4691                 else:
4692                         sys.exit()
4693
4694         try:
4695                 retcode = fd.download(all_urls)
4696         except MaxDownloadsReached:
4697                 fd.to_screen(u'--max-download limit reached, aborting.')
4698                 retcode = 101
4699
4700         # Dump cookie jar if requested
4701         if opts.cookiefile is not None:
4702                 try:
4703                         jar.save()
4704                 except (IOError, OSError), err:
4705                         sys.exit(u'ERROR: unable to save cookie jar')
4706
4707         sys.exit(retcode)
4708
4709 def main():
4710         try:
4711                 _real_main()
4712         except DownloadError:
4713                 sys.exit(1)
4714         except SameFileError:
4715                 sys.exit(u'ERROR: fixed output name but more than one file to download')
4716         except KeyboardInterrupt:
4717                 sys.exit(u'\nERROR: Interrupted by user')
4718
4719 if __name__ == '__main__':
4720         main()
4721
4722 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: