_ Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 __authors__  = (
   5         'Ricardo Garcia Gonzalez',
   6         'Danny Colligan',
   7         'Benjamin Johnson',
   8         'Vasyl\' Vavrychuk',
   9         'Witold Baryluk',
  10         'Paweł Paprota',
  11         'Gergely Imreh',
  12         'Rogério Brito',
  13         'Philipp Hagemeister',
  14         'Sören Schulze',
  15         'Kevin Ngo',
  16         'Ori Avtalion',
  17         'shizeeg',
  18         )
  19
  20 __license__ = 'Public Domain'
  21 __version__ = '2012.02.27'
  22
  23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
  24
  25
  26 import cookielib
  27 import datetime
  28 import getpass
  29 import gzip
  30 import htmlentitydefs
  31 import HTMLParser
  32 import httplib
  33 import locale
  34 import math
  35 import netrc
  36 import optparse
  37 import os
  38 import os.path
  39 import re
  40 import shlex
  41 import socket
  42 import string
  43 import subprocess
  44 import sys
  45 import time
  46 import urllib
  47 import urllib2
  48 import warnings
  49 import zlib
  50
  51 if os.name == 'nt':
  52         import ctypes
  53
  54 try:
  55         import email.utils
  56 except ImportError: # Python 2.4
  57         import email.Utils
  58 try:
  59         import cStringIO as StringIO
  60 except ImportError:
  61         import StringIO
  62
  63 # parse_qs was moved from the cgi module to the urlparse module recently.
  64 try:
  65         from urlparse import parse_qs
  66 except ImportError:
  67         from cgi import parse_qs
  68
  69 try:
  70         import lxml.etree
  71 except ImportError:
  72         pass # Handled below
  73
  74 try:
  75         import xml.etree.ElementTree
  76 except ImportError: # Python<2.5: Not officially supported, but let it slip
  77         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
  78
  79 std_headers = {
  80         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
  81         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  82         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  83         'Accept-Encoding': 'gzip, deflate',
  84         'Accept-Language': 'en-us,en;q=0.5',
  85 }
  86
  87 try:
  88         import json
  89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
  90         import re
  91         class json(object):
  92                 @staticmethod
  93                 def loads(s):
  94                         s = s.decode('UTF-8')
  95                         def raiseError(msg, i):
  96                                 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
  97                         def skipSpace(i, expectMore=True):
  98                                 while i < len(s) and s[i] in ' \t\r\n':
  99                                         i += 1
 100                                 if expectMore:
 101                                         if i >= len(s):
 102                                                 raiseError('Premature end', i)
 103                                 return i
 104                         def decodeEscape(match):
 105                                 esc = match.group(1)
 106                                 _STATIC = {
 107                                         '"': '"',
 108                                         '\\': '\\',
 109                                         '/': '/',
 110                                         'b': unichr(0x8),
 111                                         'f': unichr(0xc),
 112                                         'n': '\n',
 113                                         'r': '\r',
 114                                         't': '\t',
 115                                 }
 116                                 if esc in _STATIC:
 117                                         return _STATIC[esc]
 118                                 if esc[0] == 'u':
 119                                         if len(esc) == 1+4:
 120                                                 return unichr(int(esc[1:5], 16))
 121                                         if len(esc) == 5+6 and esc[5:7] == '\\u':
 122                                                 hi = int(esc[1:5], 16)
 123                                                 low = int(esc[7:11], 16)
 124                                                 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
 125                                 raise ValueError('Unknown escape ' + str(esc))
 126                         def parseString(i):
 127                                 i += 1
 128                                 e = i
 129                                 while True:
 130                                         e = s.index('"', e)
 131                                         bslashes = 0
 132                                         while s[e-bslashes-1] == '\\':
 133                                                 bslashes += 1
 134                                         if bslashes % 2 == 1:
 135                                                 e += 1
 136                                                 continue
 137                                         break
 138                                 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
 139                                 stri = rexp.sub(decodeEscape, s[i:e])
 140                                 return (e+1,stri)
 141                         def parseObj(i):
 142                                 i += 1
 143                                 res = {}
 144                                 i = skipSpace(i)
 145                                 if s[i] == '}': # Empty dictionary
 146                                         return (i+1,res)
 147                                 while True:
 148                                         if s[i] != '"':
 149                                                 raiseError('Expected a string object key', i)
 150                                         i,key = parseString(i)
 151                                         i = skipSpace(i)
 152                                         if i >= len(s) or s[i] != ':':
 153                                                 raiseError('Expected a colon', i)
 154                                         i,val = parse(i+1)
 155                                         res[key] = val
 156                                         i = skipSpace(i)
 157                                         if s[i] == '}':
 158                                                 return (i+1, res)
 159                                         if s[i] != ',':
 160                                                 raiseError('Expected comma or closing curly brace', i)
 161                                         i = skipSpace(i+1)
 162                         def parseArray(i):
 163                                 res = []
 164                                 i = skipSpace(i+1)
 165                                 if s[i] == ']': # Empty array
 166                                         return (i+1,res)
 167                                 while True:
 168                                         i,val = parse(i)
 169                                         res.append(val)
 170                                         i = skipSpace(i) # Raise exception if premature end
 171                                         if s[i] == ']':
 172                                                 return (i+1, res)
 173                                         if s[i] != ',':
 174                                                 raiseError('Expected a comma or closing bracket', i)
 175                                         i = skipSpace(i+1)
 176                         def parseDiscrete(i):
 177                                 for k,v in {'true': True, 'false': False, 'null': None}.items():
 178                                         if s.startswith(k, i):
 179                                                 return (i+len(k), v)
 180                                 raiseError('Not a boolean (or null)', i)
 181                         def parseNumber(i):
 182                                 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
 183                                 if mobj is None:
 184                                         raiseError('Not a number', i)
 185                                 nums = mobj.group(1)
 186                                 if '.' in nums or 'e' in nums or 'E' in nums:
 187                                         return (i+len(nums), float(nums))
 188                                 return (i+len(nums), int(nums))
 189                         CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
 190                         def parse(i):
 191                                 i = skipSpace(i)
 192                                 i,res = CHARMAP.get(s[i], parseNumber)(i)
 193                                 i = skipSpace(i, False)
 194                                 return (i,res)
 195                         i,res = parse(0)
 196                         if i < len(s):
 197                                 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
 198                         return res
 199
 200 def preferredencoding():
 201         """Get preferred encoding.
 202
 203         Returns the best encoding scheme for the system, based on
 204         locale.getpreferredencoding() and some further tweaks.
 205         """
 206         def yield_preferredencoding():
 207                 try:
 208                         pref = locale.getpreferredencoding()
 209                         u'TEST'.encode(pref)
 210                 except:
 211                         pref = 'UTF-8'
 212                 while True:
 213                         yield pref
 214         return yield_preferredencoding().next()
 215
 216
 217 def htmlentity_transform(matchobj):
 218         """Transforms an HTML entity to a Unicode character.
 219
 220         This function receives a match object and is intended to be used with
 221         the re.sub() function.
 222         """
 223         entity = matchobj.group(1)
 224
 225         # Known non-numeric HTML entity
 226         if entity in htmlentitydefs.name2codepoint:
 227                 return unichr(htmlentitydefs.name2codepoint[entity])
 228
 229         # Unicode character
 230         mobj = re.match(ur'(?u)#(x?\d+)', entity)
 231         if mobj is not None:
 232                 numstr = mobj.group(1)
 233                 if numstr.startswith(u'x'):
 234                         base = 16
 235                         numstr = u'0%s' % numstr
 236                 else:
 237                         base = 10
 238                 return unichr(long(numstr, base))
 239
 240         # Unknown entity in name, return its literal representation
 241         return (u'&%s;' % entity)
 242
 243
 244 def sanitize_title(utitle):
 245         """Sanitizes a video title so it could be used as part of a filename."""
 246         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 247         return utitle.replace(unicode(os.sep), u'%')
 248
 249
 250 def sanitize_open(filename, open_mode):
 251         """Try to open the given filename, and slightly tweak it if this fails.
 252
 253         Attempts to open the given filename. If this fails, it tries to change
 254         the filename slightly, step by step, until it's either able to open it
 255         or it fails and raises a final exception, like the standard open()
 256         function.
 257
 258         It returns the tuple (stream, definitive_file_name).
 259         """
 260         try:
 261                 if filename == u'-':
 262                         if sys.platform == 'win32':
 263                                 import msvcrt
 264                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 265                         return (sys.stdout, filename)
 266                 stream = open(_encodeFilename(filename), open_mode)
 267                 return (stream, filename)
 268         except (IOError, OSError), err:
 269                 # In case of error, try to remove win32 forbidden chars
 270                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 271
 272                 # An exception here should be caught in the caller
 273                 stream = open(_encodeFilename(filename), open_mode)
 274                 return (stream, filename)
 275
 276
 277 def timeconvert(timestr):
 278         """Convert RFC 2822 defined time string into system timestamp"""
 279         timestamp = None
 280         timetuple = email.utils.parsedate_tz(timestr)
 281         if timetuple is not None:
 282                 timestamp = email.utils.mktime_tz(timetuple)
 283         return timestamp
 284
 285 def _simplify_title(title):
 286         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
 287         return expr.sub(u'_', title).strip(u'_')
 288
 289 def _orderedSet(iterable):
 290         """ Remove all duplicates from the input iterable """
 291         res = []
 292         for el in iterable:
 293                 if el not in res:
 294                         res.append(el)
 295         return res
 296
 297 def _unescapeHTML(s):
 298         """
 299         @param s a string (of type unicode)
 300         """
 301         assert type(s) == type(u'')
 302
 303         htmlParser = HTMLParser.HTMLParser()
 304         return htmlParser.unescape(s)
 305
 306 def _encodeFilename(s):
 307         """
 308         @param s The name of the file (of type unicode)
 309         """
 310
 311         assert type(s) == type(u'')
 312
 313         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
 314                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 315                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 316                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 317                 return s
 318         else:
 319                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 320
 321 class DownloadError(Exception):
 322         """Download Error exception.
 323
 324         This exception may be thrown by FileDownloader objects if they are not
 325         configured to continue on errors. They will contain the appropriate
 326         error message.
 327         """
 328         pass
 329
 330
 331 class SameFileError(Exception):
 332         """Same File exception.
 333
 334         This exception will be thrown by FileDownloader objects if they detect
 335         multiple files would have to be downloaded to the same file on disk.
 336         """
 337         pass
 338
 339
 340 class PostProcessingError(Exception):
 341         """Post Processing exception.
 342
 343         This exception may be raised by PostProcessor's .run() method to
 344         indicate an error in the postprocessing task.
 345         """
 346         pass
 347
 348 class MaxDownloadsReached(Exception):
 349         """ --max-downloads limit has been reached. """
 350         pass
 351
 352
 353 class UnavailableVideoError(Exception):
 354         """Unavailable Format exception.
 355
 356         This exception will be thrown when a video is requested
 357         in a format that is not available for that video.
 358         """
 359         pass
 360
 361
 362 class ContentTooShortError(Exception):
 363         """Content Too Short exception.
 364
 365         This exception may be raised by FileDownloader objects when a file they
 366         download is too small for what the server announced first, indicating
 367         the connection was probably interrupted.
 368         """
 369         # Both in bytes
 370         downloaded = None
 371         expected = None
 372
 373         def __init__(self, downloaded, expected):
 374                 self.downloaded = downloaded
 375                 self.expected = expected
 376
 377
 378 class YoutubeDLHandler(urllib2.HTTPHandler):
 379         """Handler for HTTP requests and responses.
 380
 381         This class, when installed with an OpenerDirector, automatically adds
 382         the standard headers to every HTTP request and handles gzipped and
 383         deflated responses from web servers. If compression is to be avoided in
 384         a particular request, the original request in the program code only has
 385         to include the HTTP header "Youtubedl-No-Compression", which will be
 386         removed before making the real request.
 387
 388         Part of this code was copied from:
 389
 390         http://techknack.net/python-urllib2-handlers/
 391
 392         Andrew Rowls, the author of that code, agreed to release it to the
 393         public domain.
 394         """
 395
 396         @staticmethod
 397         def deflate(data):
 398                 try:
 399                         return zlib.decompress(data, -zlib.MAX_WBITS)
 400                 except zlib.error:
 401                         return zlib.decompress(data)
 402
 403         @staticmethod
 404         def addinfourl_wrapper(stream, headers, url, code):
 405                 if hasattr(urllib2.addinfourl, 'getcode'):
 406                         return urllib2.addinfourl(stream, headers, url, code)
 407                 ret = urllib2.addinfourl(stream, headers, url)
 408                 ret.code = code
 409                 return ret
 410
 411         def http_request(self, req):
 412                 for h in std_headers:
 413                         if h in req.headers:
 414                                 del req.headers[h]
 415                         req.add_header(h, std_headers[h])
 416                 if 'Youtubedl-no-compression' in req.headers:
 417                         if 'Accept-encoding' in req.headers:
 418                                 del req.headers['Accept-encoding']
 419                         del req.headers['Youtubedl-no-compression']
 420                 return req
 421
 422         def http_response(self, req, resp):
 423                 old_resp = resp
 424                 # gzip
 425                 if resp.headers.get('Content-encoding', '') == 'gzip':
 426                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 427                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 428                         resp.msg = old_resp.msg
 429                 # deflate
 430                 if resp.headers.get('Content-encoding', '') == 'deflate':
 431                         gz = StringIO.StringIO(self.deflate(resp.read()))
 432                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 433                         resp.msg = old_resp.msg
 434                 return resp
 435
 436
 437 class FileDownloader(object):
 438         """File Downloader class.
 439
 440         File downloader objects are the ones responsible of downloading the
 441         actual video file and writing it to disk if the user has requested
 442         it, among some other tasks. In most cases there should be one per
 443         program. As, given a video URL, the downloader doesn't know how to
 444         extract all the needed information, task that InfoExtractors do, it
 445         has to pass the URL to one of them.
 446
 447         For this, file downloader objects have a method that allows
 448         InfoExtractors to be registered in a given order. When it is passed
 449         a URL, the file downloader handles it to the first InfoExtractor it
 450         finds that reports being able to handle it. The InfoExtractor extracts
 451         all the information about the video or videos the URL refers to, and
 452         asks the FileDownloader to process the video information, possibly
 453         downloading the video.
 454
 455         File downloaders accept a lot of parameters. In order not to saturate
 456         the object constructor with arguments, it receives a dictionary of
 457         options instead. These options are available through the params
 458         attribute for the InfoExtractors to use. The FileDownloader also
 459         registers itself as the downloader in charge for the InfoExtractors
 460         that are added to it, so this is a "mutual registration".
 461
 462         Available options:
 463
 464         username:         Username for authentication purposes.
 465         password:         Password for authentication purposes.
 466         usenetrc:         Use netrc for authentication instead.
 467         quiet:            Do not print messages to stdout.
 468         forceurl:         Force printing final URL.
 469         forcetitle:       Force printing title.
 470         forcethumbnail:   Force printing thumbnail URL.
 471         forcedescription: Force printing description.
 472         forcefilename:    Force printing final filename.
 473         simulate:         Do not download the video files.
 474         format:           Video format code.
 475         format_limit:     Highest quality format to try.
 476         outtmpl:          Template for output names.
 477         ignoreerrors:     Do not stop on download errors.
 478         ratelimit:        Download speed limit, in bytes/sec.
 479         nooverwrites:     Prevent overwriting files.
 480         retries:          Number of times to retry for HTTP error 5xx
 481         continuedl:       Try to continue downloads if possible.
 482         noprogress:       Do not print the progress bar.
 483         playliststart:    Playlist item to start at.
 484         playlistend:      Playlist item to end at.
 485         matchtitle:       Download only matching titles.
 486         rejecttitle:      Reject downloads for matching titles.
 487         logtostderr:      Log messages to stderr instead of stdout.
 488         consoletitle:     Display progress in console window's titlebar.
 489         nopart:           Do not use temporary .part files.
 490         updatetime:       Use the Last-modified header to set output file timestamps.
 491         writedescription: Write the video description to a .description file
 492         writeinfojson:    Write the video description to a .info.json file
 493         writesubtitles:   Write the video subtitles to a .srt file
 494         subtitleslang:    Language of the subtitles to download
 495         """
 496
 497         params = None
 498         _ies = []
 499         _pps = []
 500         _download_retcode = None
 501         _num_downloads = None
 502         _screen_file = None
 503
 504         def __init__(self, params):
 505                 """Create a FileDownloader object with the given options."""
 506                 self._ies = []
 507                 self._pps = []
 508                 self._download_retcode = 0
 509                 self._num_downloads = 0
 510                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 511                 self.params = params
 512
 513         @staticmethod
 514         def format_bytes(bytes):
 515                 if bytes is None:
 516                         return 'N/A'
 517                 if type(bytes) is str:
 518                         bytes = float(bytes)
 519                 if bytes == 0.0:
 520                         exponent = 0
 521                 else:
 522                         exponent = long(math.log(bytes, 1024.0))
 523                 suffix = 'bkMGTPEZY'[exponent]
 524                 converted = float(bytes) / float(1024 ** exponent)
 525                 return '%.2f%s' % (converted, suffix)
 526
 527         @staticmethod
 528         def calc_percent(byte_counter, data_len):
 529                 if data_len is None:
 530                         return '---.-%'
 531                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 532
 533         @staticmethod
 534         def calc_eta(start, now, total, current):
 535                 if total is None:
 536                         return '--:--'
 537                 dif = now - start
 538                 if current == 0 or dif < 0.001: # One millisecond
 539                         return '--:--'
 540                 rate = float(current) / dif
 541                 eta = long((float(total) - float(current)) / rate)
 542                 (eta_mins, eta_secs) = divmod(eta, 60)
 543                 if eta_mins > 99:
 544                         return '--:--'
 545                 return '%02d:%02d' % (eta_mins, eta_secs)
 546
 547         @staticmethod
 548         def calc_speed(start, now, bytes):
 549                 dif = now - start
 550                 if bytes == 0 or dif < 0.001: # One millisecond
 551                         return '%10s' % '---b/s'
 552                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 553
 554         @staticmethod
 555         def best_block_size(elapsed_time, bytes):
 556                 new_min = max(bytes / 2.0, 1.0)
 557                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 558                 if elapsed_time < 0.001:
 559                         return long(new_max)
 560                 rate = bytes / elapsed_time
 561                 if rate > new_max:
 562                         return long(new_max)
 563                 if rate < new_min:
 564                         return long(new_min)
 565                 return long(rate)
 566
 567         @staticmethod
 568         def parse_bytes(bytestr):
 569                 """Parse a string indicating a byte quantity into a long integer."""
 570                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 571                 if matchobj is None:
 572                         return None
 573                 number = float(matchobj.group(1))
 574                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 575                 return long(round(number * multiplier))
 576
 577         def add_info_extractor(self, ie):
 578                 """Add an InfoExtractor object to the end of the list."""
 579                 self._ies.append(ie)
 580                 ie.set_downloader(self)
 581
 582         def add_post_processor(self, pp):
 583                 """Add a PostProcessor object to the end of the chain."""
 584                 self._pps.append(pp)
 585                 pp.set_downloader(self)
 586
 587         def to_screen(self, message, skip_eol=False):
 588                 """Print message to stdout if not in quiet mode."""
 589                 assert type(message) == type(u'')
 590                 if not self.params.get('quiet', False):
 591                         terminator = [u'\n', u''][skip_eol]
 592                         output = message + terminator
 593
 594                         if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
 595                                 output = output.encode(preferredencoding(), 'ignore')
 596                         self._screen_file.write(output)
 597                         self._screen_file.flush()
 598
 599         def to_stderr(self, message):
 600                 """Print message to stderr."""
 601                 print >>sys.stderr, message.encode(preferredencoding())
 602
 603         def to_cons_title(self, message):
 604                 """Set console/terminal window title to message."""
 605                 if not self.params.get('consoletitle', False):
 606                         return
 607                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 608                         # c_wchar_p() might not be necessary if `message` is
 609                         # already of type unicode()
 610                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 611                 elif 'TERM' in os.environ:
 612                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 613
 614         def fixed_template(self):
 615                 """Checks if the output template is fixed."""
 616                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 617
 618         def trouble(self, message=None):
 619                 """Determine action to take when a download problem appears.
 620
 621                 Depending on if the downloader has been configured to ignore
 622                 download errors or not, this method may throw an exception or
 623                 not when errors are found, after printing the message.
 624                 """
 625                 if message is not None:
 626                         self.to_stderr(message)
 627                 if not self.params.get('ignoreerrors', False):
 628                         raise DownloadError(message)
 629                 self._download_retcode = 1
 630
 631         def slow_down(self, start_time, byte_counter):
 632                 """Sleep if the download speed is over the rate limit."""
 633                 rate_limit = self.params.get('ratelimit', None)
 634                 if rate_limit is None or byte_counter == 0:
 635                         return
 636                 now = time.time()
 637                 elapsed = now - start_time
 638                 if elapsed <= 0.0:
 639                         return
 640                 speed = float(byte_counter) / elapsed
 641                 if speed > rate_limit:
 642                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 643
 644         def temp_name(self, filename):
 645                 """Returns a temporary filename for the given filename."""
 646                 if self.params.get('nopart', False) or filename == u'-' or \
 647                                 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
 648                         return filename
 649                 return filename + u'.part'
 650
 651         def undo_temp_name(self, filename):
 652                 if filename.endswith(u'.part'):
 653                         return filename[:-len(u'.part')]
 654                 return filename
 655
 656         def try_rename(self, old_filename, new_filename):
 657                 try:
 658                         if old_filename == new_filename:
 659                                 return
 660                         os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
 661                 except (IOError, OSError), err:
 662                         self.trouble(u'ERROR: unable to rename file')
 663
 664         def try_utime(self, filename, last_modified_hdr):
 665                 """Try to set the last-modified time of the given file."""
 666                 if last_modified_hdr is None:
 667                         return
 668                 if not os.path.isfile(_encodeFilename(filename)):
 669                         return
 670                 timestr = last_modified_hdr
 671                 if timestr is None:
 672                         return
 673                 filetime = timeconvert(timestr)
 674                 if filetime is None:
 675                         return filetime
 676                 try:
 677                         os.utime(filename, (time.time(), filetime))
 678                 except:
 679                         pass
 680                 return filetime
 681
 682         def report_writedescription(self, descfn):
 683                 """ Report that the description file is being written """
 684                 self.to_screen(u'[info] Writing video description to: ' + descfn)
 685
 686         def report_writesubtitles(self, srtfn):
 687                 """ Report that the subtitles file is being written """
 688                 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
 689
 690         def report_writeinfojson(self, infofn):
 691                 """ Report that the metadata file has been written """
 692                 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
 693
 694         def report_destination(self, filename):
 695                 """Report destination filename."""
 696                 self.to_screen(u'[download] Destination: ' + filename)
 697
 698         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 699                 """Report download progress."""
 700                 if self.params.get('noprogress', False):
 701                         return
 702                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 703                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 704                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 705                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 706
 707         def report_resuming_byte(self, resume_len):
 708                 """Report attempt to resume at given byte."""
 709                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 710
 711         def report_retry(self, count, retries):
 712                 """Report retry in case of HTTP error 5xx"""
 713                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 714
 715         def report_file_already_downloaded(self, file_name):
 716                 """Report file has already been fully downloaded."""
 717                 try:
 718                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 719                 except (UnicodeEncodeError), err:
 720                         self.to_screen(u'[download] The file has already been downloaded')
 721
 722         def report_unable_to_resume(self):
 723                 """Report it was impossible to resume download."""
 724                 self.to_screen(u'[download] Unable to resume')
 725
 726         def report_finish(self):
 727                 """Report download finished."""
 728                 if self.params.get('noprogress', False):
 729                         self.to_screen(u'[download] Download completed')
 730                 else:
 731                         self.to_screen(u'')
 732
 733         def increment_downloads(self):
 734                 """Increment the ordinal that assigns a number to each file."""
 735                 self._num_downloads += 1
 736
 737         def prepare_filename(self, info_dict):
 738                 """Generate the output filename."""
 739                 try:
 740                         template_dict = dict(info_dict)
 741                         template_dict['epoch'] = unicode(long(time.time()))
 742                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 743                         filename = self.params['outtmpl'] % template_dict
 744                         return filename
 745                 except (ValueError, KeyError), err:
 746                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 747                         return None
 748
 749         def _match_entry(self, info_dict):
 750                 """ Returns None iff the file should be downloaded """
 751
 752                 title = info_dict['title']
 753                 matchtitle = self.params.get('matchtitle', False)
 754                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
 755                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
 756                 rejecttitle = self.params.get('rejecttitle', False)
 757                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
 758                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 759                 return None
 760
 761         def process_info(self, info_dict):
 762                 """Process a single dictionary returned by an InfoExtractor."""
 763
 764                 reason = self._match_entry(info_dict)
 765                 if reason is not None:
 766                         self.to_screen(u'[download] ' + reason)
 767                         return
 768
 769                 max_downloads = self.params.get('max_downloads')
 770                 if max_downloads is not None:
 771                         if self._num_downloads > int(max_downloads):
 772                                 raise MaxDownloadsReached()
 773
 774                 filename = self.prepare_filename(info_dict)
 775
 776                 # Forced printings
 777                 if self.params.get('forcetitle', False):
 778                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 779                 if self.params.get('forceurl', False):
 780                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 781                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 782                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 783                 if self.params.get('forcedescription', False) and 'description' in info_dict:
 784                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 785                 if self.params.get('forcefilename', False) and filename is not None:
 786                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 787                 if self.params.get('forceformat', False):
 788                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
 789
 790                 # Do nothing else if in simulate mode
 791                 if self.params.get('simulate', False):
 792                         return
 793
 794                 if filename is None:
 795                         return
 796
 797                 try:
 798                         dn = os.path.dirname(_encodeFilename(filename))
 799                         if dn != '' and not os.path.exists(dn): # dn is already encoded
 800                                 os.makedirs(dn)
 801                 except (OSError, IOError), err:
 802                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
 803                         return
 804
 805                 if self.params.get('writedescription', False):
 806                         try:
 807                                 descfn = filename + u'.description'
 808                                 self.report_writedescription(descfn)
 809                                 descfile = open(_encodeFilename(descfn), 'wb')
 810                                 try:
 811                                         descfile.write(info_dict['description'].encode('utf-8'))
 812                                 finally:
 813                                         descfile.close()
 814                         except (OSError, IOError):
 815                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
 816                                 return
 817
 818                 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
 819                         # subtitles download errors are already managed as troubles in relevant IE
 820                         # that way it will silently go on when used with unsupporting IE
 821                         try:
 822                                 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
 823                                 self.report_writesubtitles(srtfn)
 824                                 srtfile = open(_encodeFilename(srtfn), 'wb')
 825                                 try:
 826                                         srtfile.write(info_dict['subtitles'].encode('utf-8'))
 827                                 finally:
 828                                         srtfile.close()
 829                         except (OSError, IOError):
 830                                 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
 831                                 return
 832
 833                 if self.params.get('writeinfojson', False):
 834                         infofn = filename + u'.info.json'
 835                         self.report_writeinfojson(infofn)
 836                         try:
 837                                 json.dump
 838                         except (NameError,AttributeError):
 839                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
 840                                 return
 841                         try:
 842                                 infof = open(_encodeFilename(infofn), 'wb')
 843                                 try:
 844                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
 845                                         json.dump(json_info_dict, infof)
 846                                 finally:
 847                                         infof.close()
 848                         except (OSError, IOError):
 849                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
 850                                 return
 851
 852                 if not self.params.get('skip_download', False):
 853                         if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
 854                                 success = True
 855                         else:
 856                                 try:
 857                                         success = self._do_download(filename, info_dict)
 858                                 except (OSError, IOError), err:
 859                                         raise UnavailableVideoError
 860                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 861                                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 862                                         return
 863                                 except (ContentTooShortError, ), err:
 864                                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 865                                         return
 866
 867                         if success:
 868                                 try:
 869                                         self.post_process(filename, info_dict)
 870                                 except (PostProcessingError), err:
 871                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
 872                                         return
 873
 874         def download(self, url_list):
 875                 """Download a given list of URLs."""
 876                 if len(url_list) > 1 and self.fixed_template():
 877                         raise SameFileError(self.params['outtmpl'])
 878
 879                 for url in url_list:
 880                         suitable_found = False
 881                         for ie in self._ies:
 882                                 # Go to next InfoExtractor if not suitable
 883                                 if not ie.suitable(url):
 884                                         continue
 885
 886                                 # Suitable InfoExtractor found
 887                                 suitable_found = True
 888
 889                                 # Extract information from URL and process it
 890                                 ie.extract(url)
 891
 892                                 # Suitable InfoExtractor had been found; go to next URL
 893                                 break
 894
 895                         if not suitable_found:
 896                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 897
 898                 return self._download_retcode
 899
 900         def post_process(self, filename, ie_info):
 901                 """Run the postprocessing chain on the given file."""
 902                 info = dict(ie_info)
 903                 info['filepath'] = filename
 904                 for pp in self._pps:
 905                         info = pp.run(info)
 906                         if info is None:
 907                                 break
 908
 909         def _download_with_rtmpdump(self, filename, url, player_url):
 910                 self.report_destination(filename)
 911                 tmpfilename = self.temp_name(filename)
 912
 913                 # Check for rtmpdump first
 914                 try:
 915                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 916                 except (OSError, IOError):
 917                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 918                         return False
 919
 920                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 921                 # the connection was interrumpted and resuming appears to be
 922                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 923                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 924                 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
 925                 if self.params.get('verbose', False):
 926                         try:
 927                                 import pipes
 928                                 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
 929                         except ImportError:
 930                                 shell_quote = repr
 931                         self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
 932                 retval = subprocess.call(args)
 933                 while retval == 2 or retval == 1:
 934                         prevsize = os.path.getsize(_encodeFilename(tmpfilename))
 935                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 936                         time.sleep(5.0) # This seems to be needed
 937                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 938                         cursize = os.path.getsize(_encodeFilename(tmpfilename))
 939                         if prevsize == cursize and retval == 1:
 940                                 break
 941                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
 942                         if prevsize == cursize and retval == 2 and cursize > 1024:
 943                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
 944                                 retval = 0
 945                                 break
 946                 if retval == 0:
 947                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
 948                         self.try_rename(tmpfilename, filename)
 949                         return True
 950                 else:
 951                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 952                         return False
 953
 954         def _do_download(self, filename, info_dict):
 955                 url = info_dict['url']
 956                 player_url = info_dict.get('player_url', None)
 957
 958                 # Check file already present
 959                 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
 960                         self.report_file_already_downloaded(filename)
 961                         return True
 962
 963                 # Attempt to download using rtmpdump
 964                 if url.startswith('rtmp'):
 965                         return self._download_with_rtmpdump(filename, url, player_url)
 966
 967                 tmpfilename = self.temp_name(filename)
 968                 stream = None
 969
 970                 # Do not include the Accept-Encoding header
 971                 headers = {'Youtubedl-no-compression': 'True'}
 972                 basic_request = urllib2.Request(url, None, headers)
 973                 request = urllib2.Request(url, None, headers)
 974
 975                 # Establish possible resume length
 976                 if os.path.isfile(_encodeFilename(tmpfilename)):
 977                         resume_len = os.path.getsize(_encodeFilename(tmpfilename))
 978                 else:
 979                         resume_len = 0
 980
 981                 open_mode = 'wb'
 982                 if resume_len != 0:
 983                         if self.params.get('continuedl', False):
 984                                 self.report_resuming_byte(resume_len)
 985                                 request.add_header('Range','bytes=%d-' % resume_len)
 986                                 open_mode = 'ab'
 987                         else:
 988                                 resume_len = 0
 989
 990                 count = 0
 991                 retries = self.params.get('retries', 0)
 992                 while count <= retries:
 993                         # Establish connection
 994                         try:
 995                                 if count == 0 and 'urlhandle' in info_dict:
 996                                         data = info_dict['urlhandle']
 997                                 data = urllib2.urlopen(request)
 998                                 break
 999                         except (urllib2.HTTPError, ), err:
1000                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
1001                                         # Unexpected HTTP error
1002                                         raise
1003                                 elif err.code == 416:
1004                                         # Unable to resume (requested range not satisfiable)
1005                                         try:
1006                                                 # Open the connection again without the range header
1007                                                 data = urllib2.urlopen(basic_request)
1008                                                 content_length = data.info()['Content-Length']
1009                                         except (urllib2.HTTPError, ), err:
1010                                                 if err.code < 500 or err.code >= 600:
1011                                                         raise
1012                                         else:
1013                                                 # Examine the reported length
1014                                                 if (content_length is not None and
1015                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
1016                                                         # The file had already been fully downloaded.
1017                                                         # Explanation to the above condition: in issue #175 it was revealed that
1018                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
1019                                                         # changing the file size slightly and causing problems for some users. So
1020                                                         # I decided to implement a suggested change and consider the file
1021                                                         # completely downloaded if the file size differs less than 100 bytes from
1022                                                         # the one in the hard drive.
1023                                                         self.report_file_already_downloaded(filename)
1024                                                         self.try_rename(tmpfilename, filename)
1025                                                         return True
1026                                                 else:
1027                                                         # The length does not match, we start the download over
1028                                                         self.report_unable_to_resume()
1029                                                         open_mode = 'wb'
1030                                                         break
1031                         # Retry
1032                         count += 1
1033                         if count <= retries:
1034                                 self.report_retry(count, retries)
1035
1036                 if count > retries:
1037                         self.trouble(u'ERROR: giving up after %s retries' % retries)
1038                         return False
1039
1040                 data_len = data.info().get('Content-length', None)
1041                 if data_len is not None:
1042                         data_len = long(data_len) + resume_len
1043                 data_len_str = self.format_bytes(data_len)
1044                 byte_counter = 0 + resume_len
1045                 block_size = 1024
1046                 start = time.time()
1047                 while True:
1048                         # Download and write
1049                         before = time.time()
1050                         data_block = data.read(block_size)
1051                         after = time.time()
1052                         if len(data_block) == 0:
1053                                 break
1054                         byte_counter += len(data_block)
1055
1056                         # Open file just in time
1057                         if stream is None:
1058                                 try:
1059                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1060                                         assert stream is not None
1061                                         filename = self.undo_temp_name(tmpfilename)
1062                                         self.report_destination(filename)
1063                                 except (OSError, IOError), err:
1064                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1065                                         return False
1066                         try:
1067                                 stream.write(data_block)
1068                         except (IOError, OSError), err:
1069                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1070                                 return False
1071                         block_size = self.best_block_size(after - before, len(data_block))
1072
1073                         # Progress message
1074                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1075                         if data_len is None:
1076                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1077                         else:
1078                                 percent_str = self.calc_percent(byte_counter, data_len)
1079                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1080                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1081
1082                         # Apply rate limit
1083                         self.slow_down(start, byte_counter - resume_len)
1084
1085                 if stream is None:
1086                         self.trouble(u'\nERROR: Did not get any data blocks')
1087                         return False
1088                 stream.close()
1089                 self.report_finish()
1090                 if data_len is not None and byte_counter != data_len:
1091                         raise ContentTooShortError(byte_counter, long(data_len))
1092                 self.try_rename(tmpfilename, filename)
1093
1094                 # Update file modification time
1095                 if self.params.get('updatetime', True):
1096                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1097
1098                 return True
1099
1100
1101 class InfoExtractor(object):
1102         """Information Extractor class.
1103
1104         Information extractors are the classes that, given a URL, extract
1105         information from the video (or videos) the URL refers to. This
1106         information includes the real video URL, the video title and simplified
1107         title, author and others. The information is stored in a dictionary
1108         which is then passed to the FileDownloader. The FileDownloader
1109         processes this information possibly downloading the video to the file
1110         system, among other possible outcomes. The dictionaries must include
1111         the following fields:
1112
1113         id:             Video identifier.
1114         url:            Final video URL.
1115         uploader:       Nickname of the video uploader.
1116         title:          Literal title.
1117         stitle:         Simplified title.
1118         ext:            Video filename extension.
1119         format:         Video format.
1120         player_url:     SWF Player URL (may be None).
1121
1122         The following fields are optional. Their primary purpose is to allow
1123         youtube-dl to serve as the backend for a video search function, such
1124         as the one in youtube2mp3.  They are only used when their respective
1125         forced printing functions are called:
1126
1127         thumbnail:      Full URL to a video thumbnail image.
1128         description:    One-line video description.
1129
1130         Subclasses of this one should re-define the _real_initialize() and
1131         _real_extract() methods and define a _VALID_URL regexp.
1132         Probably, they should also be added to the list of extractors.
1133         """
1134
1135         _ready = False
1136         _downloader = None
1137
1138         def __init__(self, downloader=None):
1139                 """Constructor. Receives an optional downloader."""
1140                 self._ready = False
1141                 self.set_downloader(downloader)
1142
1143         def suitable(self, url):
1144                 """Receives a URL and returns True if suitable for this IE."""
1145                 return re.match(self._VALID_URL, url) is not None
1146
1147         def initialize(self):
1148                 """Initializes an instance (authentication, etc)."""
1149                 if not self._ready:
1150                         self._real_initialize()
1151                         self._ready = True
1152
1153         def extract(self, url):
1154                 """Extracts URL information and returns it in list of dicts."""
1155                 self.initialize()
1156                 return self._real_extract(url)
1157
1158         def set_downloader(self, downloader):
1159                 """Sets the downloader for this IE."""
1160                 self._downloader = downloader
1161
1162         def _real_initialize(self):
1163                 """Real initialization process. Redefine in subclasses."""
1164                 pass
1165
1166         def _real_extract(self, url):
1167                 """Real extraction process. Redefine in subclasses."""
1168                 pass
1169
1170
1171 class YoutubeIE(InfoExtractor):
1172         """Information extractor for youtube.com."""
1173
1174         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1175         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1176         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1177         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1178         _NETRC_MACHINE = 'youtube'
1179         # Listed in order of quality
1180         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1181         _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1182         _video_extensions = {
1183                 '13': '3gp',
1184                 '17': 'mp4',
1185                 '18': 'mp4',
1186                 '22': 'mp4',
1187                 '37': 'mp4',
1188                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1189                 '43': 'webm',
1190                 '44': 'webm',
1191                 '45': 'webm',
1192         }
1193         _video_dimensions = {
1194                 '5': '240x400',
1195                 '6': '???',
1196                 '13': '???',
1197                 '17': '144x176',
1198                 '18': '360x640',
1199                 '22': '720x1280',
1200                 '34': '360x640',
1201                 '35': '480x854',
1202                 '37': '1080x1920',
1203                 '38': '3072x4096',
1204                 '43': '360x640',
1205                 '44': '480x854',
1206                 '45': '720x1280',
1207         }
1208         IE_NAME = u'youtube'
1209
1210         def report_lang(self):
1211                 """Report attempt to set language."""
1212                 self._downloader.to_screen(u'[youtube] Setting language')
1213
1214         def report_login(self):
1215                 """Report attempt to log in."""
1216                 self._downloader.to_screen(u'[youtube] Logging in')
1217
1218         def report_age_confirmation(self):
1219                 """Report attempt to confirm age."""
1220                 self._downloader.to_screen(u'[youtube] Confirming age')
1221
1222         def report_video_webpage_download(self, video_id):
1223                 """Report attempt to download video webpage."""
1224                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1225
1226         def report_video_info_webpage_download(self, video_id):
1227                 """Report attempt to download video info webpage."""
1228                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1229
1230         def report_video_subtitles_download(self, video_id):
1231                 """Report attempt to download video info webpage."""
1232                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1233
1234         def report_information_extraction(self, video_id):
1235                 """Report attempt to extract video information."""
1236                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1237
1238         def report_unavailable_format(self, video_id, format):
1239                 """Report extracted video URL."""
1240                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1241
1242         def report_rtmp_download(self):
1243                 """Indicate the download will use the RTMP protocol."""
1244                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1245
1246         def _closed_captions_xml_to_srt(self, xml_string):
1247                 srt = ''
1248                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1249                 # TODO parse xml instead of regex
1250                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1251                         if not dur: dur = '4'
1252                         start = float(start)
1253                         end = start + float(dur)
1254                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1255                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1256                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1257                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1258                         srt += str(n) + '\n'
1259                         srt += start + ' --> ' + end + '\n'
1260                         srt += caption + '\n\n'
1261                 return srt
1262
1263         def _print_formats(self, formats):
1264                 print 'Available formats:'
1265                 for x in formats:
1266                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1267
1268         def _real_initialize(self):
1269                 if self._downloader is None:
1270                         return
1271
1272                 username = None
1273                 password = None
1274                 downloader_params = self._downloader.params
1275
1276                 # Attempt to use provided username and password or .netrc data
1277                 if downloader_params.get('username', None) is not None:
1278                         username = downloader_params['username']
1279                         password = downloader_params['password']
1280                 elif downloader_params.get('usenetrc', False):
1281                         try:
1282                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1283                                 if info is not None:
1284                                         username = info[0]
1285                                         password = info[2]
1286                                 else:
1287                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1288                         except (IOError, netrc.NetrcParseError), err:
1289                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1290                                 return
1291
1292                 # Set language
1293                 request = urllib2.Request(self._LANG_URL)
1294                 try:
1295                         self.report_lang()
1296                         urllib2.urlopen(request).read()
1297                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1298                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1299                         return
1300
1301                 # No authentication to be performed
1302                 if username is None:
1303                         return
1304
1305                 # Log in
1306                 login_form = {
1307                                 'current_form': 'loginForm',
1308                                 'next':         '/',
1309                                 'action_login': 'Log In',
1310                                 'username':     username,
1311                                 'password':     password,
1312                                 }
1313                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1314                 try:
1315                         self.report_login()
1316                         login_results = urllib2.urlopen(request).read()
1317                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1318                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1319                                 return
1320                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1321                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1322                         return
1323
1324                 # Confirm age
1325                 age_form = {
1326                                 'next_url':             '/',
1327                                 'action_confirm':       'Confirm',
1328                                 }
1329                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1330                 try:
1331                         self.report_age_confirmation()
1332                         age_results = urllib2.urlopen(request).read()
1333                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1334                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1335                         return
1336
1337         def _real_extract(self, url):
1338                 # Extract video id from URL
1339                 mobj = re.match(self._VALID_URL, url)
1340                 if mobj is None:
1341                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1342                         return
1343                 video_id = mobj.group(2)
1344
1345                 # Get video webpage
1346                 self.report_video_webpage_download(video_id)
1347                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1348                 try:
1349                         video_webpage = urllib2.urlopen(request).read()
1350                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1351                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1352                         return
1353
1354                 # Attempt to extract SWF player URL
1355                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1356                 if mobj is not None:
1357                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1358                 else:
1359                         player_url = None
1360
1361                 # Get video info
1362                 self.report_video_info_webpage_download(video_id)
1363                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1364                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1365                                         % (video_id, el_type))
1366                         request = urllib2.Request(video_info_url)
1367                         try:
1368                                 video_info_webpage = urllib2.urlopen(request).read()
1369                                 video_info = parse_qs(video_info_webpage)
1370                                 if 'token' in video_info:
1371                                         break
1372                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1373                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1374                                 return
1375                 if 'token' not in video_info:
1376                         if 'reason' in video_info:
1377                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1378                         else:
1379                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1380                         return
1381
1382                 # Start extracting information
1383                 self.report_information_extraction(video_id)
1384
1385                 # uploader
1386                 if 'author' not in video_info:
1387                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1388                         return
1389                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1390
1391                 # title
1392                 if 'title' not in video_info:
1393                         self._downloader.trouble(u'ERROR: unable to extract video title')
1394                         return
1395                 video_title = urllib.unquote_plus(video_info['title'][0])
1396                 video_title = video_title.decode('utf-8')
1397                 video_title = sanitize_title(video_title)
1398
1399                 # simplified title
1400                 simple_title = _simplify_title(video_title)
1401
1402                 # thumbnail image
1403                 if 'thumbnail_url' not in video_info:
1404                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1405                         video_thumbnail = ''
1406                 else:   # don't panic if we can't find it
1407                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1408
1409                 # upload date
1410                 upload_date = u'NA'
1411                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1412                 if mobj is not None:
1413                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1414                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1415                         for expression in format_expressions:
1416                                 try:
1417                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1418                                 except:
1419                                         pass
1420
1421                 # description
1422                 try:
1423                         lxml.etree
1424                 except NameError:
1425                         video_description = u'No description available.'
1426                         mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1427                         if mobj is not None:
1428                                 video_description = mobj.group(1).decode('utf-8')
1429                 else:
1430                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1431                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1432                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1433                         # TODO use another parser
1434
1435                 # closed captions
1436                 video_subtitles = None
1437                 if self._downloader.params.get('writesubtitles', False):
1438                         self.report_video_subtitles_download(video_id)
1439                         request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1440                         try:
1441                                 srt_list = urllib2.urlopen(request).read()
1442                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1443                                 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1444                         else:
1445                                 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1446                                 if srt_lang_list:
1447                                         if self._downloader.params.get('subtitleslang', False):
1448                                                 srt_lang = self._downloader.params.get('subtitleslang')
1449                                         elif 'en' in srt_lang_list:
1450                                                 srt_lang = 'en'
1451                                         else:
1452                                                 srt_lang = srt_lang_list[0]
1453                                         if not srt_lang in srt_lang_list:
1454                                                 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1455                                         else:
1456                                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1457                                                 try:
1458                                                         srt_xml = urllib2.urlopen(request).read()
1459                                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1460                                                         self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1461                                                 else:
1462                                                         video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1463                                 else:
1464                                         self._downloader.trouble(u'WARNING: video has no closed captions')
1465
1466                 # token
1467                 video_token = urllib.unquote_plus(video_info['token'][0])
1468
1469                 # Decide which formats to download
1470                 req_format = self._downloader.params.get('format', None)
1471
1472                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1473                         self.report_rtmp_download()
1474                         video_url_list = [(None, video_info['conn'][0])]
1475                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1476                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1477                         url_data = [parse_qs(uds) for uds in url_data_strs]
1478                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1479                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1480
1481                         format_limit = self._downloader.params.get('format_limit', None)
1482                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1483                         if format_limit is not None and format_limit in available_formats:
1484                                 format_list = available_formats[available_formats.index(format_limit):]
1485                         else:
1486                                 format_list = available_formats
1487                         existing_formats = [x for x in format_list if x in url_map]
1488                         if len(existing_formats) == 0:
1489                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1490                                 return
1491                         if self._downloader.params.get('listformats', None):
1492                                 self._print_formats(existing_formats)
1493                                 return
1494                         if req_format is None or req_format == 'best':
1495                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1496                         elif req_format == 'worst':
1497                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1498                         elif req_format in ('-1', 'all'):
1499                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1500                         else:
1501                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1502                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1503                                 req_formats = req_format.split('/')
1504                                 video_url_list = None
1505                                 for rf in req_formats:
1506                                         if rf in url_map:
1507                                                 video_url_list = [(rf, url_map[rf])]
1508                                                 break
1509                                 if video_url_list is None:
1510                                         self._downloader.trouble(u'ERROR: requested format not available')
1511                                         return
1512                 else:
1513                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1514                         return
1515
1516                 for format_param, video_real_url in video_url_list:
1517                         # At this point we have a new video
1518                         self._downloader.increment_downloads()
1519
1520                         # Extension
1521                         video_extension = self._video_extensions.get(format_param, 'flv')
1522
1523                         try:
1524                                 # Process video information
1525                                 self._downloader.process_info({
1526                                         'id':           video_id.decode('utf-8'),
1527                                         'url':          video_real_url.decode('utf-8'),
1528                                         'uploader':     video_uploader.decode('utf-8'),
1529                                         'upload_date':  upload_date,
1530                                         'title':        video_title,
1531                                         'stitle':       simple_title,
1532                                         'ext':          video_extension.decode('utf-8'),
1533                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1534                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1535                                         'description':  video_description,
1536                                         'player_url':   player_url,
1537                                         'subtitles':    video_subtitles
1538                                 })
1539                         except UnavailableVideoError, err:
1540                                 self._downloader.trouble(u'\nERROR: unable to download video')
1541
1542
1543 class MetacafeIE(InfoExtractor):
1544         """Information Extractor for metacafe.com."""
1545
1546         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1547         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1548         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1549         _youtube_ie = None
1550         IE_NAME = u'metacafe'
1551
1552         def __init__(self, youtube_ie, downloader=None):
1553                 InfoExtractor.__init__(self, downloader)
1554                 self._youtube_ie = youtube_ie
1555
1556         def report_disclaimer(self):
1557                 """Report disclaimer retrieval."""
1558                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1559
1560         def report_age_confirmation(self):
1561                 """Report attempt to confirm age."""
1562                 self._downloader.to_screen(u'[metacafe] Confirming age')
1563
1564         def report_download_webpage(self, video_id):
1565                 """Report webpage download."""
1566                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1567
1568         def report_extraction(self, video_id):
1569                 """Report information extraction."""
1570                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1571
1572         def _real_initialize(self):
1573                 # Retrieve disclaimer
1574                 request = urllib2.Request(self._DISCLAIMER)
1575                 try:
1576                         self.report_disclaimer()
1577                         disclaimer = urllib2.urlopen(request).read()
1578                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1579                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1580                         return
1581
1582                 # Confirm age
1583                 disclaimer_form = {
1584                         'filters': '0',
1585                         'submit': "Continue - I'm over 18",
1586                         }
1587                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1588                 try:
1589                         self.report_age_confirmation()
1590                         disclaimer = urllib2.urlopen(request).read()
1591                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1592                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1593                         return
1594
1595         def _real_extract(self, url):
1596                 # Extract id and simplified title from URL
1597                 mobj = re.match(self._VALID_URL, url)
1598                 if mobj is None:
1599                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1600                         return
1601
1602                 video_id = mobj.group(1)
1603
1604                 # Check if video comes from YouTube
1605                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1606                 if mobj2 is not None:
1607                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1608                         return
1609
1610                 # At this point we have a new video
1611                 self._downloader.increment_downloads()
1612
1613                 simple_title = mobj.group(2).decode('utf-8')
1614
1615                 # Retrieve video webpage to extract further information
1616                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1617                 try:
1618                         self.report_download_webpage(video_id)
1619                         webpage = urllib2.urlopen(request).read()
1620                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1621                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1622                         return
1623
1624                 # Extract URL, uploader and title from webpage
1625                 self.report_extraction(video_id)
1626                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1627                 if mobj is not None:
1628                         mediaURL = urllib.unquote(mobj.group(1))
1629                         video_extension = mediaURL[-3:]
1630
1631                         # Extract gdaKey if available
1632                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1633                         if mobj is None:
1634                                 video_url = mediaURL
1635                         else:
1636                                 gdaKey = mobj.group(1)
1637                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1638                 else:
1639                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1640                         if mobj is None:
1641                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1642                                 return
1643                         vardict = parse_qs(mobj.group(1))
1644                         if 'mediaData' not in vardict:
1645                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1646                                 return
1647                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1648                         if mobj is None:
1649                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1650                                 return
1651                         mediaURL = mobj.group(1).replace('\\/', '/')
1652                         video_extension = mediaURL[-3:]
1653                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1654
1655                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1656                 if mobj is None:
1657                         self._downloader.trouble(u'ERROR: unable to extract title')
1658                         return
1659                 video_title = mobj.group(1).decode('utf-8')
1660                 video_title = sanitize_title(video_title)
1661
1662                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1663                 if mobj is None:
1664                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1665                         return
1666                 video_uploader = mobj.group(1)
1667
1668                 try:
1669                         # Process video information
1670                         self._downloader.process_info({
1671                                 'id':           video_id.decode('utf-8'),
1672                                 'url':          video_url.decode('utf-8'),
1673                                 'uploader':     video_uploader.decode('utf-8'),
1674                                 'upload_date':  u'NA',
1675                                 'title':        video_title,
1676                                 'stitle':       simple_title,
1677                                 'ext':          video_extension.decode('utf-8'),
1678                                 'format':       u'NA',
1679                                 'player_url':   None,
1680                         })
1681                 except UnavailableVideoError:
1682                         self._downloader.trouble(u'\nERROR: unable to download video')
1683
1684
1685 class DailymotionIE(InfoExtractor):
1686         """Information Extractor for Dailymotion"""
1687
1688         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1689         IE_NAME = u'dailymotion'
1690
1691         def __init__(self, downloader=None):
1692                 InfoExtractor.__init__(self, downloader)
1693
1694         def report_download_webpage(self, video_id):
1695                 """Report webpage download."""
1696                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1697
1698         def report_extraction(self, video_id):
1699                 """Report information extraction."""
1700                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1701
1702         def _real_extract(self, url):
1703                 # Extract id and simplified title from URL
1704                 mobj = re.match(self._VALID_URL, url)
1705                 if mobj is None:
1706                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1707                         return
1708
1709                 # At this point we have a new video
1710                 self._downloader.increment_downloads()
1711                 video_id = mobj.group(1)
1712
1713                 video_extension = 'flv'
1714
1715                 # Retrieve video webpage to extract further information
1716                 request = urllib2.Request(url)
1717                 request.add_header('Cookie', 'family_filter=off')
1718                 try:
1719                         self.report_download_webpage(video_id)
1720                         webpage = urllib2.urlopen(request).read()
1721                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1722                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1723                         return
1724
1725                 # Extract URL, uploader and title from webpage
1726                 self.report_extraction(video_id)
1727                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1728                 if mobj is None:
1729                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1730                         return
1731                 sequence = urllib.unquote(mobj.group(1))
1732                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1733                 if mobj is None:
1734                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1735                         return
1736                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1737
1738                 # if needed add http://www.dailymotion.com/ if relative URL
1739
1740                 video_url = mediaURL
1741
1742                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1743                 if mobj is None:
1744                         self._downloader.trouble(u'ERROR: unable to extract title')
1745                         return
1746                 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1747                 video_title = sanitize_title(video_title)
1748                 simple_title = _simplify_title(video_title)
1749
1750                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1751                 if mobj is None:
1752                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1753                         return
1754                 video_uploader = mobj.group(1)
1755
1756                 try:
1757                         # Process video information
1758                         self._downloader.process_info({
1759                                 'id':           video_id.decode('utf-8'),
1760                                 'url':          video_url.decode('utf-8'),
1761                                 'uploader':     video_uploader.decode('utf-8'),
1762                                 'upload_date':  u'NA',
1763                                 'title':        video_title,
1764                                 'stitle':       simple_title,
1765                                 'ext':          video_extension.decode('utf-8'),
1766                                 'format':       u'NA',
1767                                 'player_url':   None,
1768                         })
1769                 except UnavailableVideoError:
1770                         self._downloader.trouble(u'\nERROR: unable to download video')
1771
1772
1773 class GoogleIE(InfoExtractor):
1774         """Information extractor for video.google.com."""
1775
1776         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1777         IE_NAME = u'video.google'
1778
1779         def __init__(self, downloader=None):
1780                 InfoExtractor.__init__(self, downloader)
1781
1782         def report_download_webpage(self, video_id):
1783                 """Report webpage download."""
1784                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1785
1786         def report_extraction(self, video_id):
1787                 """Report information extraction."""
1788                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1789
1790         def _real_extract(self, url):
1791                 # Extract id from URL
1792                 mobj = re.match(self._VALID_URL, url)
1793                 if mobj is None:
1794                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1795                         return
1796
1797                 # At this point we have a new video
1798                 self._downloader.increment_downloads()
1799                 video_id = mobj.group(1)
1800
1801                 video_extension = 'mp4'
1802
1803                 # Retrieve video webpage to extract further information
1804                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1805                 try:
1806                         self.report_download_webpage(video_id)
1807                         webpage = urllib2.urlopen(request).read()
1808                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1809                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1810                         return
1811
1812                 # Extract URL, uploader, and title from webpage
1813                 self.report_extraction(video_id)
1814                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1815                 if mobj is None:
1816                         video_extension = 'flv'
1817                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1818                 if mobj is None:
1819                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1820                         return
1821                 mediaURL = urllib.unquote(mobj.group(1))
1822                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1823                 mediaURL = mediaURL.replace('\\x26', '\x26')
1824
1825                 video_url = mediaURL
1826
1827                 mobj = re.search(r'<title>(.*)</title>', webpage)
1828                 if mobj is None:
1829                         self._downloader.trouble(u'ERROR: unable to extract title')
1830                         return
1831                 video_title = mobj.group(1).decode('utf-8')
1832                 video_title = sanitize_title(video_title)
1833                 simple_title = _simplify_title(video_title)
1834
1835                 # Extract video description
1836                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1837                 if mobj is None:
1838                         self._downloader.trouble(u'ERROR: unable to extract video description')
1839                         return
1840                 video_description = mobj.group(1).decode('utf-8')
1841                 if not video_description:
1842                         video_description = 'No description available.'
1843
1844                 # Extract video thumbnail
1845                 if self._downloader.params.get('forcethumbnail', False):
1846                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1847                         try:
1848                                 webpage = urllib2.urlopen(request).read()
1849                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1850                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1851                                 return
1852                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1853                         if mobj is None:
1854                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1855                                 return
1856                         video_thumbnail = mobj.group(1)
1857                 else:   # we need something to pass to process_info
1858                         video_thumbnail = ''
1859
1860                 try:
1861                         # Process video information
1862                         self._downloader.process_info({
1863                                 'id':           video_id.decode('utf-8'),
1864                                 'url':          video_url.decode('utf-8'),
1865                                 'uploader':     u'NA',
1866                                 'upload_date':  u'NA',
1867                                 'title':        video_title,
1868                                 'stitle':       simple_title,
1869                                 'ext':          video_extension.decode('utf-8'),
1870                                 'format':       u'NA',
1871                                 'player_url':   None,
1872                         })
1873                 except UnavailableVideoError:
1874                         self._downloader.trouble(u'\nERROR: unable to download video')
1875
1876
1877 class PhotobucketIE(InfoExtractor):
1878         """Information extractor for photobucket.com."""
1879
1880         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1881         IE_NAME = u'photobucket'
1882
1883         def __init__(self, downloader=None):
1884                 InfoExtractor.__init__(self, downloader)
1885
1886         def report_download_webpage(self, video_id):
1887                 """Report webpage download."""
1888                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1889
1890         def report_extraction(self, video_id):
1891                 """Report information extraction."""
1892                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1893
1894         def _real_extract(self, url):
1895                 # Extract id from URL
1896                 mobj = re.match(self._VALID_URL, url)
1897                 if mobj is None:
1898                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1899                         return
1900
1901                 # At this point we have a new video
1902                 self._downloader.increment_downloads()
1903                 video_id = mobj.group(1)
1904
1905                 video_extension = 'flv'
1906
1907                 # Retrieve video webpage to extract further information
1908                 request = urllib2.Request(url)
1909                 try:
1910                         self.report_download_webpage(video_id)
1911                         webpage = urllib2.urlopen(request).read()
1912                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1913                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1914                         return
1915
1916                 # Extract URL, uploader, and title from webpage
1917                 self.report_extraction(video_id)
1918                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1919                 if mobj is None:
1920                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1921                         return
1922                 mediaURL = urllib.unquote(mobj.group(1))
1923
1924                 video_url = mediaURL
1925
1926                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1927                 if mobj is None:
1928                         self._downloader.trouble(u'ERROR: unable to extract title')
1929                         return
1930                 video_title = mobj.group(1).decode('utf-8')
1931                 video_title = sanitize_title(video_title)
1932                 simple_title = _simplify_title(vide_title)
1933
1934                 video_uploader = mobj.group(2).decode('utf-8')
1935
1936                 try:
1937                         # Process video information
1938                         self._downloader.process_info({
1939                                 'id':           video_id.decode('utf-8'),
1940                                 'url':          video_url.decode('utf-8'),
1941                                 'uploader':     video_uploader,
1942                                 'upload_date':  u'NA',
1943                                 'title':        video_title,
1944                                 'stitle':       simple_title,
1945                                 'ext':          video_extension.decode('utf-8'),
1946                                 'format':       u'NA',
1947                                 'player_url':   None,
1948                         })
1949                 except UnavailableVideoError:
1950                         self._downloader.trouble(u'\nERROR: unable to download video')
1951
1952
1953 class YahooIE(InfoExtractor):
1954         """Information extractor for video.yahoo.com."""
1955
1956         # _VALID_URL matches all Yahoo! Video URLs
1957         # _VPAGE_URL matches only the extractable '/watch/' URLs
1958         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1959         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1960         IE_NAME = u'video.yahoo'
1961
1962         def __init__(self, downloader=None):
1963                 InfoExtractor.__init__(self, downloader)
1964
1965         def report_download_webpage(self, video_id):
1966                 """Report webpage download."""
1967                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1968
1969         def report_extraction(self, video_id):
1970                 """Report information extraction."""
1971                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1972
1973         def _real_extract(self, url, new_video=True):
1974                 # Extract ID from URL
1975                 mobj = re.match(self._VALID_URL, url)
1976                 if mobj is None:
1977                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1978                         return
1979
1980                 # At this point we have a new video
1981                 self._downloader.increment_downloads()
1982                 video_id = mobj.group(2)
1983                 video_extension = 'flv'
1984
1985                 # Rewrite valid but non-extractable URLs as
1986                 # extractable English language /watch/ URLs
1987                 if re.match(self._VPAGE_URL, url) is None:
1988                         request = urllib2.Request(url)
1989                         try:
1990                                 webpage = urllib2.urlopen(request).read()
1991                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1992                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1993                                 return
1994
1995                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1996                         if mobj is None:
1997                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1998                                 return
1999                         yahoo_id = mobj.group(1)
2000
2001                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2002                         if mobj is None:
2003                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2004                                 return
2005                         yahoo_vid = mobj.group(1)
2006
2007                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2008                         return self._real_extract(url, new_video=False)
2009
2010                 # Retrieve video webpage to extract further information
2011                 request = urllib2.Request(url)
2012                 try:
2013                         self.report_download_webpage(video_id)
2014                         webpage = urllib2.urlopen(request).read()
2015                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2016                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2017                         return
2018
2019                 # Extract uploader and title from webpage
2020                 self.report_extraction(video_id)
2021                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2022                 if mobj is None:
2023                         self._downloader.trouble(u'ERROR: unable to extract video title')
2024                         return
2025                 video_title = mobj.group(1).decode('utf-8')
2026                 simple_title = _simplify_title(video_title)
2027
2028                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2029                 if mobj is None:
2030                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2031                         return
2032                 video_uploader = mobj.group(1).decode('utf-8')
2033
2034                 # Extract video thumbnail
2035                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2036                 if mobj is None:
2037                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2038                         return
2039                 video_thumbnail = mobj.group(1).decode('utf-8')
2040
2041                 # Extract video description
2042                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2043                 if mobj is None:
2044                         self._downloader.trouble(u'ERROR: unable to extract video description')
2045                         return
2046                 video_description = mobj.group(1).decode('utf-8')
2047                 if not video_description:
2048                         video_description = 'No description available.'
2049
2050                 # Extract video height and width
2051                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2052                 if mobj is None:
2053                         self._downloader.trouble(u'ERROR: unable to extract video height')
2054                         return
2055                 yv_video_height = mobj.group(1)
2056
2057                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2058                 if mobj is None:
2059                         self._downloader.trouble(u'ERROR: unable to extract video width')
2060                         return
2061                 yv_video_width = mobj.group(1)
2062
2063                 # Retrieve video playlist to extract media URL
2064                 # I'm not completely sure what all these options are, but we
2065                 # seem to need most of them, otherwise the server sends a 401.
2066                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
2067                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
2068                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2069                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2070                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2071                 try:
2072                         self.report_download_webpage(video_id)
2073                         webpage = urllib2.urlopen(request).read()
2074                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2075                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2076                         return
2077
2078                 # Extract media URL from playlist XML
2079                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2080                 if mobj is None:
2081                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
2082                         return
2083                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2084                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2085
2086                 try:
2087                         # Process video information
2088                         self._downloader.process_info({
2089                                 'id':           video_id.decode('utf-8'),
2090                                 'url':          video_url,
2091                                 'uploader':     video_uploader,
2092                                 'upload_date':  u'NA',
2093                                 'title':        video_title,
2094                                 'stitle':       simple_title,
2095                                 'ext':          video_extension.decode('utf-8'),
2096                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2097                                 'description':  video_description,
2098                                 'thumbnail':    video_thumbnail,
2099                                 'player_url':   None,
2100                         })
2101                 except UnavailableVideoError:
2102                         self._downloader.trouble(u'\nERROR: unable to download video')
2103
2104
2105 class VimeoIE(InfoExtractor):
2106         """Information extractor for vimeo.com."""
2107
2108         # _VALID_URL matches Vimeo URLs
2109         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2110         IE_NAME = u'vimeo'
2111
2112         def __init__(self, downloader=None):
2113                 InfoExtractor.__init__(self, downloader)
2114
2115         def report_download_webpage(self, video_id):
2116                 """Report webpage download."""
2117                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2118
2119         def report_extraction(self, video_id):
2120                 """Report information extraction."""
2121                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2122
2123         def _real_extract(self, url, new_video=True):
2124                 # Extract ID from URL
2125                 mobj = re.match(self._VALID_URL, url)
2126                 if mobj is None:
2127                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2128                         return
2129
2130                 # At this point we have a new video
2131                 self._downloader.increment_downloads()
2132                 video_id = mobj.group(1)
2133
2134                 # Retrieve video webpage to extract further information
2135                 request = urllib2.Request(url, None, std_headers)
2136                 try:
2137                         self.report_download_webpage(video_id)
2138                         webpage = urllib2.urlopen(request).read()
2139                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2140                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2141                         return
2142
2143                 # Now we begin extracting as much information as we can from what we
2144                 # retrieved. First we extract the information common to all extractors,
2145                 # and latter we extract those that are Vimeo specific.
2146                 self.report_extraction(video_id)
2147
2148                 # Extract the config JSON
2149                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2150                 try:
2151                         config = json.loads(config)
2152                 except:
2153                         self._downloader.trouble(u'ERROR: unable to extract info section')
2154                         return
2155
2156                 # Extract title
2157                 video_title = config["video"]["title"]
2158                 simple_title = _simplify_title(video_title)
2159
2160                 # Extract uploader
2161                 video_uploader = config["video"]["owner"]["name"]
2162
2163                 # Extract video thumbnail
2164                 video_thumbnail = config["video"]["thumbnail"]
2165
2166                 # Extract video description
2167                 try:
2168                         lxml.etree
2169                 except NameError:
2170                         video_description = u'No description available.'
2171                         mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2172                         if mobj is not None:
2173                                 video_description = mobj.group(1)
2174                 else:
2175                         html_parser = lxml.etree.HTMLParser()
2176                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2177                         video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2178                         # TODO use another parser
2179
2180                 # Extract upload date
2181                 video_upload_date = u'NA'
2182                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2183                 if mobj is not None:
2184                         video_upload_date = mobj.group(1)
2185
2186                 # Vimeo specific: extract request signature and timestamp
2187                 sig = config['request']['signature']
2188                 timestamp = config['request']['timestamp']
2189
2190                 # Vimeo specific: extract video codec and quality information
2191                 # TODO bind to format param
2192                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2193                 for codec in codecs:
2194                         if codec[0] in config["video"]["files"]:
2195                                 video_codec = codec[0]
2196                                 video_extension = codec[1]
2197                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2198                                 else: quality = 'sd'
2199                                 break
2200                 else:
2201                         self._downloader.trouble(u'ERROR: no known codec found')
2202                         return
2203
2204                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2205                                         %(video_id, sig, timestamp, quality, video_codec.upper())
2206
2207                 try:
2208                         # Process video information
2209                         self._downloader.process_info({
2210                                 'id':           video_id,
2211                                 'url':          video_url,
2212                                 'uploader':     video_uploader,
2213                                 'upload_date':  video_upload_date,
2214                                 'title':        video_title,
2215                                 'stitle':       simple_title,
2216                                 'ext':          video_extension,
2217                                 'thumbnail':    video_thumbnail,
2218                                 'description':  video_description,
2219                                 'player_url':   None,
2220                         })
2221                 except UnavailableVideoError:
2222                         self._downloader.trouble(u'ERROR: unable to download video')
2223
2224
2225 class GenericIE(InfoExtractor):
2226         """Generic last-resort information extractor."""
2227
2228         _VALID_URL = r'.*'
2229         IE_NAME = u'generic'
2230
2231         def __init__(self, downloader=None):
2232                 InfoExtractor.__init__(self, downloader)
2233
2234         def report_download_webpage(self, video_id):
2235                 """Report webpage download."""
2236                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2237                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2238
2239         def report_extraction(self, video_id):
2240                 """Report information extraction."""
2241                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2242
2243         def _real_extract(self, url):
2244                 # At this point we have a new video
2245                 self._downloader.increment_downloads()
2246
2247                 video_id = url.split('/')[-1]
2248                 request = urllib2.Request(url)
2249                 try:
2250                         self.report_download_webpage(video_id)
2251                         webpage = urllib2.urlopen(request).read()
2252                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2253                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2254                         return
2255                 except ValueError, err:
2256                         # since this is the last-resort InfoExtractor, if
2257                         # this error is thrown, it'll be thrown here
2258                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2259                         return
2260
2261                 self.report_extraction(video_id)
2262                 # Start with something easy: JW Player in SWFObject
2263                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2264                 if mobj is None:
2265                         # Broaden the search a little bit
2266                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2267                 if mobj is None:
2268                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2269                         return
2270
2271                 # It's possible that one of the regexes
2272                 # matched, but returned an empty group:
2273                 if mobj.group(1) is None:
2274                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2275                         return
2276
2277                 video_url = urllib.unquote(mobj.group(1))
2278                 video_id = os.path.basename(video_url)
2279
2280                 # here's a fun little line of code for you:
2281                 video_extension = os.path.splitext(video_id)[1][1:]
2282                 video_id = os.path.splitext(video_id)[0]
2283
2284                 # it's tempting to parse this further, but you would
2285                 # have to take into account all the variations like
2286                 #   Video Title - Site Name
2287                 #   Site Name | Video Title
2288                 #   Video Title - Tagline | Site Name
2289                 # and so on and so forth; it's just not practical
2290                 mobj = re.search(r'<title>(.*)</title>', webpage)
2291                 if mobj is None:
2292                         self._downloader.trouble(u'ERROR: unable to extract title')
2293                         return
2294                 video_title = mobj.group(1).decode('utf-8')
2295                 video_title = sanitize_title(video_title)
2296                 simple_title = _simplify_title(video_title)
2297
2298                 # video uploader is domain name
2299                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2300                 if mobj is None:
2301                         self._downloader.trouble(u'ERROR: unable to extract title')
2302                         return
2303                 video_uploader = mobj.group(1).decode('utf-8')
2304
2305                 try:
2306                         # Process video information
2307                         self._downloader.process_info({
2308                                 'id':           video_id.decode('utf-8'),
2309                                 'url':          video_url.decode('utf-8'),
2310                                 'uploader':     video_uploader,
2311                                 'upload_date':  u'NA',
2312                                 'title':        video_title,
2313                                 'stitle':       simple_title,
2314                                 'ext':          video_extension.decode('utf-8'),
2315                                 'format':       u'NA',
2316                                 'player_url':   None,
2317                         })
2318                 except UnavailableVideoError, err:
2319                         self._downloader.trouble(u'\nERROR: unable to download video')
2320
2321
2322 class YoutubeSearchIE(InfoExtractor):
2323         """Information Extractor for YouTube search queries."""
2324         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2325         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2326         _youtube_ie = None
2327         _max_youtube_results = 1000
2328         IE_NAME = u'youtube:search'
2329
2330         def __init__(self, youtube_ie, downloader=None):
2331                 InfoExtractor.__init__(self, downloader)
2332                 self._youtube_ie = youtube_ie
2333
2334         def report_download_page(self, query, pagenum):
2335                 """Report attempt to download playlist page with given number."""
2336                 query = query.decode(preferredencoding())
2337                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2338
2339         def _real_initialize(self):
2340                 self._youtube_ie.initialize()
2341
2342         def _real_extract(self, query):
2343                 mobj = re.match(self._VALID_URL, query)
2344                 if mobj is None:
2345                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2346                         return
2347
2348                 prefix, query = query.split(':')
2349                 prefix = prefix[8:]
2350                 query = query.encode('utf-8')
2351                 if prefix == '':
2352                         self._download_n_results(query, 1)
2353                         return
2354                 elif prefix == 'all':
2355                         self._download_n_results(query, self._max_youtube_results)
2356                         return
2357                 else:
2358                         try:
2359                                 n = long(prefix)
2360                                 if n <= 0:
2361                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2362                                         return
2363                                 elif n > self._max_youtube_results:
2364                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2365                                         n = self._max_youtube_results
2366                                 self._download_n_results(query, n)
2367                                 return
2368                         except ValueError: # parsing prefix as integer fails
2369                                 self._download_n_results(query, 1)
2370                                 return
2371
2372         def _download_n_results(self, query, n):
2373                 """Downloads a specified number of results for a query"""
2374
2375                 video_ids = []
2376                 pagenum = 0
2377                 limit = n
2378
2379                 while (50 * pagenum) < limit:
2380                         self.report_download_page(query, pagenum+1)
2381                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2382                         request = urllib2.Request(result_url)
2383                         try:
2384                                 data = urllib2.urlopen(request).read()
2385                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2386                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2387                                 return
2388                         api_response = json.loads(data)['data']
2389
2390                         new_ids = list(video['id'] for video in api_response['items'])
2391                         video_ids += new_ids
2392
2393                         limit = min(n, api_response['totalItems'])
2394                         pagenum += 1
2395
2396                 if len(video_ids) > n:
2397                         video_ids = video_ids[:n]
2398                 for id in video_ids:
2399                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2400                 return
2401
2402
2403 class GoogleSearchIE(InfoExtractor):
2404         """Information Extractor for Google Video search queries."""
2405         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2406         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2407         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2408         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2409         _google_ie = None
2410         _max_google_results = 1000
2411         IE_NAME = u'video.google:search'
2412
2413         def __init__(self, google_ie, downloader=None):
2414                 InfoExtractor.__init__(self, downloader)
2415                 self._google_ie = google_ie
2416
2417         def report_download_page(self, query, pagenum):
2418                 """Report attempt to download playlist page with given number."""
2419                 query = query.decode(preferredencoding())
2420                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2421
2422         def _real_initialize(self):
2423                 self._google_ie.initialize()
2424
2425         def _real_extract(self, query):
2426                 mobj = re.match(self._VALID_URL, query)
2427                 if mobj is None:
2428                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2429                         return
2430
2431                 prefix, query = query.split(':')
2432                 prefix = prefix[8:]
2433                 query = query.encode('utf-8')
2434                 if prefix == '':
2435                         self._download_n_results(query, 1)
2436                         return
2437                 elif prefix == 'all':
2438                         self._download_n_results(query, self._max_google_results)
2439                         return
2440                 else:
2441                         try:
2442                                 n = long(prefix)
2443                                 if n <= 0:
2444                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2445                                         return
2446                                 elif n > self._max_google_results:
2447                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2448                                         n = self._max_google_results
2449                                 self._download_n_results(query, n)
2450                                 return
2451                         except ValueError: # parsing prefix as integer fails
2452                                 self._download_n_results(query, 1)
2453                                 return
2454
2455         def _download_n_results(self, query, n):
2456                 """Downloads a specified number of results for a query"""
2457
2458                 video_ids = []
2459                 pagenum = 0
2460
2461                 while True:
2462                         self.report_download_page(query, pagenum)
2463                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2464                         request = urllib2.Request(result_url)
2465                         try:
2466                                 page = urllib2.urlopen(request).read()
2467                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2468                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2469                                 return
2470
2471                         # Extract video identifiers
2472                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2473                                 video_id = mobj.group(1)
2474                                 if video_id not in video_ids:
2475                                         video_ids.append(video_id)
2476                                         if len(video_ids) == n:
2477                                                 # Specified n videos reached
2478                                                 for id in video_ids:
2479                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2480                                                 return
2481
2482                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2483                                 for id in video_ids:
2484                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2485                                 return
2486
2487                         pagenum = pagenum + 1
2488
2489
2490 class YahooSearchIE(InfoExtractor):
2491         """Information Extractor for Yahoo! Video search queries."""
2492         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2493         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2494         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2495         _MORE_PAGES_INDICATOR = r'\s*Next'
2496         _yahoo_ie = None
2497         _max_yahoo_results = 1000
2498         IE_NAME = u'video.yahoo:search'
2499
2500         def __init__(self, yahoo_ie, downloader=None):
2501                 InfoExtractor.__init__(self, downloader)
2502                 self._yahoo_ie = yahoo_ie
2503
2504         def report_download_page(self, query, pagenum):
2505                 """Report attempt to download playlist page with given number."""
2506                 query = query.decode(preferredencoding())
2507                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2508
2509         def _real_initialize(self):
2510                 self._yahoo_ie.initialize()
2511
2512         def _real_extract(self, query):
2513                 mobj = re.match(self._VALID_URL, query)
2514                 if mobj is None:
2515                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2516                         return
2517
2518                 prefix, query = query.split(':')
2519                 prefix = prefix[8:]
2520                 query = query.encode('utf-8')
2521                 if prefix == '':
2522                         self._download_n_results(query, 1)
2523                         return
2524                 elif prefix == 'all':
2525                         self._download_n_results(query, self._max_yahoo_results)
2526                         return
2527                 else:
2528                         try:
2529                                 n = long(prefix)
2530                                 if n <= 0:
2531                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2532                                         return
2533                                 elif n > self._max_yahoo_results:
2534                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2535                                         n = self._max_yahoo_results
2536                                 self._download_n_results(query, n)
2537                                 return
2538                         except ValueError: # parsing prefix as integer fails
2539                                 self._download_n_results(query, 1)
2540                                 return
2541
2542         def _download_n_results(self, query, n):
2543                 """Downloads a specified number of results for a query"""
2544
2545                 video_ids = []
2546                 already_seen = set()
2547                 pagenum = 1
2548
2549                 while True:
2550                         self.report_download_page(query, pagenum)
2551                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2552                         request = urllib2.Request(result_url)
2553                         try:
2554                                 page = urllib2.urlopen(request).read()
2555                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2556                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2557                                 return
2558
2559                         # Extract video identifiers
2560                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2561                                 video_id = mobj.group(1)
2562                                 if video_id not in already_seen:
2563                                         video_ids.append(video_id)
2564                                         already_seen.add(video_id)
2565                                         if len(video_ids) == n:
2566                                                 # Specified n videos reached
2567                                                 for id in video_ids:
2568                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2569                                                 return
2570
2571                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2572                                 for id in video_ids:
2573                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2574                                 return
2575
2576                         pagenum = pagenum + 1
2577
2578
2579 class YoutubePlaylistIE(InfoExtractor):
2580         """Information Extractor for YouTube playlists."""
2581
2582         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2583         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2584         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
2585         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2586         _youtube_ie = None
2587         IE_NAME = u'youtube:playlist'
2588
2589         def __init__(self, youtube_ie, downloader=None):
2590                 InfoExtractor.__init__(self, downloader)
2591                 self._youtube_ie = youtube_ie
2592
2593         def report_download_page(self, playlist_id, pagenum):
2594                 """Report attempt to download playlist page with given number."""
2595                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2596
2597         def _real_initialize(self):
2598                 self._youtube_ie.initialize()
2599
2600         def _real_extract(self, url):
2601                 # Extract playlist id
2602                 mobj = re.match(self._VALID_URL, url)
2603                 if mobj is None:
2604                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2605                         return
2606
2607                 # Single video case
2608                 if mobj.group(3) is not None:
2609                         self._youtube_ie.extract(mobj.group(3))
2610                         return
2611
2612                 # Download playlist pages
2613                 # prefix is 'p' as default for playlists but there are other types that need extra care
2614                 playlist_prefix = mobj.group(1)
2615                 if playlist_prefix == 'a':
2616                         playlist_access = 'artist'
2617                 else:
2618                         playlist_prefix = 'p'
2619                         playlist_access = 'view_play_list'
2620                 playlist_id = mobj.group(2)
2621                 video_ids = []
2622                 pagenum = 1
2623
2624                 while True:
2625                         self.report_download_page(playlist_id, pagenum)
2626                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2627                         request = urllib2.Request(url)
2628                         try:
2629                                 page = urllib2.urlopen(request).read()
2630                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2631                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2632                                 return
2633
2634                         # Extract video identifiers
2635                         ids_in_page = []
2636                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2637                                 if mobj.group(1) not in ids_in_page:
2638                                         ids_in_page.append(mobj.group(1))
2639                         video_ids.extend(ids_in_page)
2640
2641                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2642                                 break
2643                         pagenum = pagenum + 1
2644
2645                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2646                 playlistend = self._downloader.params.get('playlistend', -1)
2647                 if playlistend == -1:
2648                         video_ids = video_ids[playliststart:]
2649                 else:
2650                         video_ids = video_ids[playliststart:playlistend]
2651
2652                 for id in video_ids:
2653                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2654                 return
2655
2656
2657 class YoutubeUserIE(InfoExtractor):
2658         """Information Extractor for YouTube users."""
2659
2660         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2661         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2662         _GDATA_PAGE_SIZE = 50
2663         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2664         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2665         _youtube_ie = None
2666         IE_NAME = u'youtube:user'
2667
2668         def __init__(self, youtube_ie, downloader=None):
2669                 InfoExtractor.__init__(self, downloader)
2670                 self._youtube_ie = youtube_ie
2671
2672         def report_download_page(self, username, start_index):
2673                 """Report attempt to download user page."""
2674                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2675                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2676
2677         def _real_initialize(self):
2678                 self._youtube_ie.initialize()
2679
2680         def _real_extract(self, url):
2681                 # Extract username
2682                 mobj = re.match(self._VALID_URL, url)
2683                 if mobj is None:
2684                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2685                         return
2686
2687                 username = mobj.group(1)
2688
2689                 # Download video ids using YouTube Data API. Result size per
2690                 # query is limited (currently to 50 videos) so we need to query
2691                 # page by page until there are no video ids - it means we got
2692                 # all of them.
2693
2694                 video_ids = []
2695                 pagenum = 0
2696
2697                 while True:
2698                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2699                         self.report_download_page(username, start_index)
2700
2701                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2702
2703                         try:
2704                                 page = urllib2.urlopen(request).read()
2705                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2706                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2707                                 return
2708
2709                         # Extract video identifiers
2710                         ids_in_page = []
2711
2712                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2713                                 if mobj.group(1) not in ids_in_page:
2714                                         ids_in_page.append(mobj.group(1))
2715
2716                         video_ids.extend(ids_in_page)
2717
2718                         # A little optimization - if current page is not
2719                         # "full", ie. does not contain PAGE_SIZE video ids then
2720                         # we can assume that this page is the last one - there
2721                         # are no more ids on further pages - no need to query
2722                         # again.
2723
2724                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2725                                 break
2726
2727                         pagenum += 1
2728
2729                 all_ids_count = len(video_ids)
2730                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2731                 playlistend = self._downloader.params.get('playlistend', -1)
2732
2733                 if playlistend == -1:
2734                         video_ids = video_ids[playliststart:]
2735                 else:
2736                         video_ids = video_ids[playliststart:playlistend]
2737
2738                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2739                                 (username, all_ids_count, len(video_ids)))
2740
2741                 for video_id in video_ids:
2742                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2743
2744
2745 class DepositFilesIE(InfoExtractor):
2746         """Information extractor for depositfiles.com"""
2747
2748         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2749         IE_NAME = u'DepositFiles'
2750
2751         def __init__(self, downloader=None):
2752                 InfoExtractor.__init__(self, downloader)
2753
2754         def report_download_webpage(self, file_id):
2755                 """Report webpage download."""
2756                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2757
2758         def report_extraction(self, file_id):
2759                 """Report information extraction."""
2760                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2761
2762         def _real_extract(self, url):
2763                 # At this point we have a new file
2764                 self._downloader.increment_downloads()
2765
2766                 file_id = url.split('/')[-1]
2767                 # Rebuild url in english locale
2768                 url = 'http://depositfiles.com/en/files/' + file_id
2769
2770                 # Retrieve file webpage with 'Free download' button pressed
2771                 free_download_indication = { 'gateway_result' : '1' }
2772                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2773                 try:
2774                         self.report_download_webpage(file_id)
2775                         webpage = urllib2.urlopen(request).read()
2776                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2777                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2778                         return
2779
2780                 # Search for the real file URL
2781                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2782                 if (mobj is None) or (mobj.group(1) is None):
2783                         # Try to figure out reason of the error.
2784                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2785                         if (mobj is not None) and (mobj.group(1) is not None):
2786                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2787                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2788                         else:
2789                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2790                         return
2791
2792                 file_url = mobj.group(1)
2793                 file_extension = os.path.splitext(file_url)[1][1:]
2794
2795                 # Search for file title
2796                 mobj = re.search(r'<b title="(.*?)">', webpage)
2797                 if mobj is None:
2798                         self._downloader.trouble(u'ERROR: unable to extract title')
2799                         return
2800                 file_title = mobj.group(1).decode('utf-8')
2801
2802                 try:
2803                         # Process file information
2804                         self._downloader.process_info({
2805                                 'id':           file_id.decode('utf-8'),
2806                                 'url':          file_url.decode('utf-8'),
2807                                 'uploader':     u'NA',
2808                                 'upload_date':  u'NA',
2809                                 'title':        file_title,
2810                                 'stitle':       file_title,
2811                                 'ext':          file_extension.decode('utf-8'),
2812                                 'format':       u'NA',
2813                                 'player_url':   None,
2814                         })
2815                 except UnavailableVideoError, err:
2816                         self._downloader.trouble(u'ERROR: unable to download file')
2817
2818
2819 class FacebookIE(InfoExtractor):
2820         """Information Extractor for Facebook"""
2821
2822         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2823         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2824         _NETRC_MACHINE = 'facebook'
2825         _available_formats = ['video', 'highqual', 'lowqual']
2826         _video_extensions = {
2827                 'video': 'mp4',
2828                 'highqual': 'mp4',
2829                 'lowqual': 'mp4',
2830         }
2831         IE_NAME = u'facebook'
2832
2833         def __init__(self, downloader=None):
2834                 InfoExtractor.__init__(self, downloader)
2835
2836         def _reporter(self, message):
2837                 """Add header and report message."""
2838                 self._downloader.to_screen(u'[facebook] %s' % message)
2839
2840         def report_login(self):
2841                 """Report attempt to log in."""
2842                 self._reporter(u'Logging in')
2843
2844         def report_video_webpage_download(self, video_id):
2845                 """Report attempt to download video webpage."""
2846                 self._reporter(u'%s: Downloading video webpage' % video_id)
2847
2848         def report_information_extraction(self, video_id):
2849                 """Report attempt to extract video information."""
2850                 self._reporter(u'%s: Extracting video information' % video_id)
2851
2852         def _parse_page(self, video_webpage):
2853                 """Extract video information from page"""
2854                 # General data
2855                 data = {'title': r'\("video_title", "(.*?)"\)',
2856                         'description': r'<div class="datawrap">(.*?)</div>',
2857                         'owner': r'\("video_owner_name", "(.*?)"\)',
2858                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2859                         }
2860                 video_info = {}
2861                 for piece in data.keys():
2862                         mobj = re.search(data[piece], video_webpage)
2863                         if mobj is not None:
2864                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2865
2866                 # Video urls
2867                 video_urls = {}
2868                 for fmt in self._available_formats:
2869                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2870                         if mobj is not None:
2871                                 # URL is in a Javascript segment inside an escaped Unicode format within
2872                                 # the generally utf-8 page
2873                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2874                 video_info['video_urls'] = video_urls
2875
2876                 return video_info
2877
2878         def _real_initialize(self):
2879                 if self._downloader is None:
2880                         return
2881
2882                 useremail = None
2883                 password = None
2884                 downloader_params = self._downloader.params
2885
2886                 # Attempt to use provided username and password or .netrc data
2887                 if downloader_params.get('username', None) is not None:
2888                         useremail = downloader_params['username']
2889                         password = downloader_params['password']
2890                 elif downloader_params.get('usenetrc', False):
2891                         try:
2892                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2893                                 if info is not None:
2894                                         useremail = info[0]
2895                                         password = info[2]
2896                                 else:
2897                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2898                         except (IOError, netrc.NetrcParseError), err:
2899                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2900                                 return
2901
2902                 if useremail is None:
2903                         return
2904
2905                 # Log in
2906                 login_form = {
2907                         'email': useremail,
2908                         'pass': password,
2909                         'login': 'Log+In'
2910                         }
2911                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2912                 try:
2913                         self.report_login()
2914                         login_results = urllib2.urlopen(request).read()
2915                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2916                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2917                                 return
2918                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2919                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2920                         return
2921
2922         def _real_extract(self, url):
2923                 mobj = re.match(self._VALID_URL, url)
2924                 if mobj is None:
2925                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2926                         return
2927                 video_id = mobj.group('ID')
2928
2929                 # Get video webpage
2930                 self.report_video_webpage_download(video_id)
2931                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2932                 try:
2933                         page = urllib2.urlopen(request)
2934                         video_webpage = page.read()
2935                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2936                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2937                         return
2938
2939                 # Start extracting information
2940                 self.report_information_extraction(video_id)
2941
2942                 # Extract information
2943                 video_info = self._parse_page(video_webpage)
2944
2945                 # uploader
2946                 if 'owner' not in video_info:
2947                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2948                         return
2949                 video_uploader = video_info['owner']
2950
2951                 # title
2952                 if 'title' not in video_info:
2953                         self._downloader.trouble(u'ERROR: unable to extract video title')
2954                         return
2955                 video_title = video_info['title']
2956                 video_title = video_title.decode('utf-8')
2957                 video_title = sanitize_title(video_title)
2958
2959                 simple_title = _simplify_title(video_title)
2960
2961                 # thumbnail image
2962                 if 'thumbnail' not in video_info:
2963                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2964                         video_thumbnail = ''
2965                 else:
2966                         video_thumbnail = video_info['thumbnail']
2967
2968                 # upload date
2969                 upload_date = u'NA'
2970                 if 'upload_date' in video_info:
2971                         upload_time = video_info['upload_date']
2972                         timetuple = email.utils.parsedate_tz(upload_time)
2973                         if timetuple is not None:
2974                                 try:
2975                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2976                                 except:
2977                                         pass
2978
2979                 # description
2980                 video_description = video_info.get('description', 'No description available.')
2981
2982                 url_map = video_info['video_urls']
2983                 if len(url_map.keys()) > 0:
2984                         # Decide which formats to download
2985                         req_format = self._downloader.params.get('format', None)
2986                         format_limit = self._downloader.params.get('format_limit', None)
2987
2988                         if format_limit is not None and format_limit in self._available_formats:
2989                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2990                         else:
2991                                 format_list = self._available_formats
2992                         existing_formats = [x for x in format_list if x in url_map]
2993                         if len(existing_formats) == 0:
2994                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2995                                 return
2996                         if req_format is None:
2997                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2998                         elif req_format == 'worst':
2999                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3000                         elif req_format == '-1':
3001                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3002                         else:
3003                                 # Specific format
3004                                 if req_format not in url_map:
3005                                         self._downloader.trouble(u'ERROR: requested format not available')
3006                                         return
3007                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
3008
3009                 for format_param, video_real_url in video_url_list:
3010
3011                         # At this point we have a new video
3012                         self._downloader.increment_downloads()
3013
3014                         # Extension
3015                         video_extension = self._video_extensions.get(format_param, 'mp4')
3016
3017                         try:
3018                                 # Process video information
3019                                 self._downloader.process_info({
3020                                         'id':           video_id.decode('utf-8'),
3021                                         'url':          video_real_url.decode('utf-8'),
3022                                         'uploader':     video_uploader.decode('utf-8'),
3023                                         'upload_date':  upload_date,
3024                                         'title':        video_title,
3025                                         'stitle':       simple_title,
3026                                         'ext':          video_extension.decode('utf-8'),
3027                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3028                                         'thumbnail':    video_thumbnail.decode('utf-8'),
3029                                         'description':  video_description.decode('utf-8'),
3030                                         'player_url':   None,
3031                                 })
3032                         except UnavailableVideoError, err:
3033                                 self._downloader.trouble(u'\nERROR: unable to download video')
3034
3035 class BlipTVIE(InfoExtractor):
3036         """Information extractor for blip.tv"""
3037
3038         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
3039         _URL_EXT = r'^.*\.([a-z0-9]+)$'
3040         IE_NAME = u'blip.tv'
3041
3042         def report_extraction(self, file_id):
3043                 """Report information extraction."""
3044                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3045
3046         def report_direct_download(self, title):
3047                 """Report information extraction."""
3048                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3049
3050         def _real_extract(self, url):
3051                 mobj = re.match(self._VALID_URL, url)
3052                 if mobj is None:
3053                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3054                         return
3055
3056                 if '?' in url:
3057                         cchar = '&'
3058                 else:
3059                         cchar = '?'
3060                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3061                 request = urllib2.Request(json_url)
3062                 self.report_extraction(mobj.group(1))
3063                 info = None
3064                 try:
3065                         urlh = urllib2.urlopen(request)
3066                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3067                                 basename = url.split('/')[-1]
3068                                 title,ext = os.path.splitext(basename)
3069                                 title = title.decode('UTF-8')
3070                                 ext = ext.replace('.', '')
3071                                 self.report_direct_download(title)
3072                                 info = {
3073                                         'id': title,
3074                                         'url': url,
3075                                         'title': title,
3076                                         'stitle': _simplify_title(title),
3077                                         'ext': ext,
3078                                         'urlhandle': urlh
3079                                 }
3080                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3081                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3082                         return
3083                 if info is None: # Regular URL
3084                         try:
3085                                 json_code = urlh.read()
3086                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3087                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3088                                 return
3089
3090                         try:
3091                                 json_data = json.loads(json_code)
3092                                 if 'Post' in json_data:
3093                                         data = json_data['Post']
3094                                 else:
3095                                         data = json_data
3096
3097                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3098                                 video_url = data['media']['url']
3099                                 umobj = re.match(self._URL_EXT, video_url)
3100                                 if umobj is None:
3101                                         raise ValueError('Can not determine filename extension')
3102                                 ext = umobj.group(1)
3103
3104                                 info = {
3105                                         'id': data['item_id'],
3106                                         'url': video_url,
3107                                         'uploader': data['display_name'],
3108                                         'upload_date': upload_date,
3109                                         'title': data['title'],
3110                                         'stitle': _simplify_title(data['title']),
3111                                         'ext': ext,
3112                                         'format': data['media']['mimeType'],
3113                                         'thumbnail': data['thumbnailUrl'],
3114                                         'description': data['description'],
3115                                         'player_url': data['embedUrl']
3116                                 }
3117                         except (ValueError,KeyError), err:
3118                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3119                                 return
3120
3121                 self._downloader.increment_downloads()
3122
3123                 try:
3124                         self._downloader.process_info(info)
3125                 except UnavailableVideoError, err:
3126                         self._downloader.trouble(u'\nERROR: unable to download video')
3127
3128
3129 class MyVideoIE(InfoExtractor):
3130         """Information Extractor for myvideo.de."""
3131
3132         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3133         IE_NAME = u'myvideo'
3134
3135         def __init__(self, downloader=None):
3136                 InfoExtractor.__init__(self, downloader)
3137
3138         def report_download_webpage(self, video_id):
3139                 """Report webpage download."""
3140                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3141
3142         def report_extraction(self, video_id):
3143                 """Report information extraction."""
3144                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3145
3146         def _real_extract(self,url):
3147                 mobj = re.match(self._VALID_URL, url)
3148                 if mobj is None:
3149                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3150                         return
3151
3152                 video_id = mobj.group(1)
3153
3154                 # Get video webpage
3155                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3156                 try:
3157                         self.report_download_webpage(video_id)
3158                         webpage = urllib2.urlopen(request).read()
3159                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3160                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3161                         return
3162
3163                 self.report_extraction(video_id)
3164                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3165                                  webpage)
3166                 if mobj is None:
3167                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3168                         return
3169                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3170
3171                 mobj = re.search('<title>([^<]+)</title>', webpage)
3172                 if mobj is None:
3173                         self._downloader.trouble(u'ERROR: unable to extract title')
3174                         return
3175
3176                 video_title = mobj.group(1)
3177                 video_title = sanitize_title(video_title)
3178
3179                 simple_title = _simplify_title(video_title)
3180
3181                 try:
3182                         self._downloader.process_info({
3183                                 'id':           video_id,
3184                                 'url':          video_url,
3185                                 'uploader':     u'NA',
3186                                 'upload_date':  u'NA',
3187                                 'title':        video_title,
3188                                 'stitle':       simple_title,
3189                                 'ext':          u'flv',
3190                                 'format':       u'NA',
3191                                 'player_url':   None,
3192                         })
3193                 except UnavailableVideoError:
3194                         self._downloader.trouble(u'\nERROR: Unable to download video')
3195
3196 class ComedyCentralIE(InfoExtractor):
3197         """Information extractor for The Daily Show and Colbert Report """
3198
3199         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3200         IE_NAME = u'comedycentral'
3201
3202         def report_extraction(self, episode_id):
3203                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3204
3205         def report_config_download(self, episode_id):
3206                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3207
3208         def report_index_download(self, episode_id):
3209                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3210
3211         def report_player_url(self, episode_id):
3212                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3213
3214         def _real_extract(self, url):
3215                 mobj = re.match(self._VALID_URL, url)
3216                 if mobj is None:
3217                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3218                         return
3219
3220                 if mobj.group('shortname'):
3221                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
3222                                 url = u'http://www.thedailyshow.com/full-episodes/'
3223                         else:
3224                                 url = u'http://www.colbertnation.com/full-episodes/'
3225                         mobj = re.match(self._VALID_URL, url)
3226                         assert mobj is not None
3227
3228                 dlNewest = not mobj.group('episode')
3229                 if dlNewest:
3230                         epTitle = mobj.group('showname')
3231                 else:
3232                         epTitle = mobj.group('episode')
3233
3234                 req = urllib2.Request(url)
3235                 self.report_extraction(epTitle)
3236                 try:
3237                         htmlHandle = urllib2.urlopen(req)
3238                         html = htmlHandle.read()
3239                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3240                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3241                         return
3242                 if dlNewest:
3243                         url = htmlHandle.geturl()
3244                         mobj = re.match(self._VALID_URL, url)
3245                         if mobj is None:
3246                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3247                                 return
3248                         if mobj.group('episode') == '':
3249                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3250                                 return
3251                         epTitle = mobj.group('episode')
3252
3253                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3254                 if len(mMovieParams) == 0:
3255                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3256                         return
3257
3258                 playerUrl_raw = mMovieParams[0][0]
3259                 self.report_player_url(epTitle)
3260                 try:
3261                         urlHandle = urllib2.urlopen(playerUrl_raw)
3262                         playerUrl = urlHandle.geturl()
3263                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3264                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3265                         return
3266
3267                 uri = mMovieParams[0][1]
3268                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3269                 self.report_index_download(epTitle)
3270                 try:
3271                         indexXml = urllib2.urlopen(indexUrl).read()
3272                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3273                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3274                         return
3275
3276                 idoc = xml.etree.ElementTree.fromstring(indexXml)
3277                 itemEls = idoc.findall('.//item')
3278                 for itemEl in itemEls:
3279                         mediaId = itemEl.findall('./guid')[0].text
3280                         shortMediaId = mediaId.split(':')[-1]
3281                         showId = mediaId.split(':')[-2].replace('.com', '')
3282                         officialTitle = itemEl.findall('./title')[0].text
3283                         officialDate = itemEl.findall('./pubDate')[0].text
3284
3285                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3286                                                 urllib.urlencode({'uri': mediaId}))
3287                         configReq = urllib2.Request(configUrl)
3288                         self.report_config_download(epTitle)
3289                         try:
3290                                 configXml = urllib2.urlopen(configReq).read()
3291                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3292                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3293                                 return
3294
3295                         cdoc = xml.etree.ElementTree.fromstring(configXml)
3296                         turls = []
3297                         for rendition in cdoc.findall('.//rendition'):
3298                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3299                                 turls.append(finfo)
3300
3301                         if len(turls) == 0:
3302                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3303                                 continue
3304
3305                         # For now, just pick the highest bitrate
3306                         format,video_url = turls[-1]
3307
3308                         self._downloader.increment_downloads()
3309
3310                         effTitle = showId + u'-' + epTitle
3311                         info = {
3312                                 'id': shortMediaId,
3313                                 'url': video_url,
3314                                 'uploader': showId,
3315                                 'upload_date': officialDate,
3316                                 'title': effTitle,
3317                                 'stitle': _simplify_title(effTitle),
3318                                 'ext': 'mp4',
3319                                 'format': format,
3320                                 'thumbnail': None,
3321                                 'description': officialTitle,
3322                                 'player_url': playerUrl
3323                         }
3324
3325                         try:
3326                                 self._downloader.process_info(info)
3327                         except UnavailableVideoError, err:
3328                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3329                                 continue
3330
3331
3332 class EscapistIE(InfoExtractor):
3333         """Information extractor for The Escapist """
3334
3335         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3336         IE_NAME = u'escapist'
3337
3338         def report_extraction(self, showName):
3339                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3340
3341         def report_config_download(self, showName):
3342                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3343
3344         def _real_extract(self, url):
3345                 htmlParser = HTMLParser.HTMLParser()
3346
3347                 mobj = re.match(self._VALID_URL, url)
3348                 if mobj is None:
3349                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3350                         return
3351                 showName = mobj.group('showname')
3352                 videoId = mobj.group('episode')
3353
3354                 self.report_extraction(showName)
3355                 try:
3356                         webPage = urllib2.urlopen(url).read()
3357                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3358                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3359                         return
3360
3361                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3362                 description = htmlParser.unescape(descMatch.group(1))
3363                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3364                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3365                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3366                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3367                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3368                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3369
3370                 self.report_config_download(showName)
3371                 try:
3372                         configJSON = urllib2.urlopen(configUrl).read()
3373                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3374                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3375                         return
3376
3377                 # Technically, it's JavaScript, not JSON
3378                 configJSON = configJSON.replace("'", '"')
3379
3380                 try:
3381                         config = json.loads(configJSON)
3382                 except (ValueError,), err:
3383                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3384                         return
3385
3386                 playlist = config['playlist']
3387                 videoUrl = playlist[1]['url']
3388
3389                 self._downloader.increment_downloads()
3390                 info = {
3391                         'id': videoId,
3392                         'url': videoUrl,
3393                         'uploader': showName,
3394                         'upload_date': None,
3395                         'title': showName,
3396                         'stitle': _simplify_title(showName),
3397                         'ext': 'flv',
3398                         'format': 'flv',
3399                         'thumbnail': imgUrl,
3400                         'description': description,
3401                         'player_url': playerUrl,
3402                 }
3403
3404                 try:
3405                         self._downloader.process_info(info)
3406                 except UnavailableVideoError, err:
3407                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3408
3409
3410 class CollegeHumorIE(InfoExtractor):
3411         """Information extractor for collegehumor.com"""
3412
3413         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3414         IE_NAME = u'collegehumor'
3415
3416         def report_webpage(self, video_id):
3417                 """Report information extraction."""
3418                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3419
3420         def report_extraction(self, video_id):
3421                 """Report information extraction."""
3422                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3423
3424         def _real_extract(self, url):
3425                 htmlParser = HTMLParser.HTMLParser()
3426
3427                 mobj = re.match(self._VALID_URL, url)
3428                 if mobj is None:
3429                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3430                         return
3431                 video_id = mobj.group('videoid')
3432
3433                 self.report_webpage(video_id)
3434                 request = urllib2.Request(url)
3435                 try:
3436                         webpage = urllib2.urlopen(request).read()
3437                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3438                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3439                         return
3440
3441                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3442                 if m is None:
3443                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3444                         return
3445                 internal_video_id = m.group('internalvideoid')
3446
3447                 info = {
3448                         'id': video_id,
3449                         'internal_id': internal_video_id,
3450                 }
3451
3452                 self.report_extraction(video_id)
3453                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3454                 try:
3455                         metaXml = urllib2.urlopen(xmlUrl).read()
3456                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3457                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3458                         return
3459
3460                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3461                 try:
3462                         videoNode = mdoc.findall('./video')[0]
3463                         info['description'] = videoNode.findall('./description')[0].text
3464                         info['title'] = videoNode.findall('./caption')[0].text
3465                         info['stitle'] = _simplify_title(info['title'])
3466                         info['url'] = videoNode.findall('./file')[0].text
3467                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3468                         info['ext'] = info['url'].rpartition('.')[2]
3469                         info['format'] = info['ext']
3470                 except IndexError:
3471                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3472                         return
3473
3474                 self._downloader.increment_downloads()
3475
3476                 try:
3477                         self._downloader.process_info(info)
3478                 except UnavailableVideoError, err:
3479                         self._downloader.trouble(u'\nERROR: unable to download video')
3480
3481
3482 class XVideosIE(InfoExtractor):
3483         """Information extractor for xvideos.com"""
3484
3485         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3486         IE_NAME = u'xvideos'
3487
3488         def report_webpage(self, video_id):
3489                 """Report information extraction."""
3490                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3491
3492         def report_extraction(self, video_id):
3493                 """Report information extraction."""
3494                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3495
3496         def _real_extract(self, url):
3497                 htmlParser = HTMLParser.HTMLParser()
3498
3499                 mobj = re.match(self._VALID_URL, url)
3500                 if mobj is None:
3501                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3502                         return
3503                 video_id = mobj.group(1).decode('utf-8')
3504
3505                 self.report_webpage(video_id)
3506
3507                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3508                 try:
3509                         webpage = urllib2.urlopen(request).read()
3510                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3511                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3512                         return
3513
3514                 self.report_extraction(video_id)
3515
3516
3517                 # Extract video URL
3518                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3519                 if mobj is None:
3520                         self._downloader.trouble(u'ERROR: unable to extract video url')
3521                         return
3522                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3523
3524
3525                 # Extract title
3526                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3527                 if mobj is None:
3528                         self._downloader.trouble(u'ERROR: unable to extract video title')
3529                         return
3530                 video_title = mobj.group(1).decode('utf-8')
3531
3532
3533                 # Extract video thumbnail
3534                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3535                 if mobj is None:
3536                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3537                         return
3538                 video_thumbnail = mobj.group(1).decode('utf-8')
3539
3540
3541
3542                 self._downloader.increment_downloads()
3543                 info = {
3544                         'id': video_id,
3545                         'url': video_url,
3546                         'uploader': None,
3547                         'upload_date': None,
3548                         'title': video_title,
3549                         'stitle': _simplify_title(video_title),
3550                         'ext': 'flv',
3551                         'format': 'flv',
3552                         'thumbnail': video_thumbnail,
3553                         'description': None,
3554                         'player_url': None,
3555                 }
3556
3557                 try:
3558                         self._downloader.process_info(info)
3559                 except UnavailableVideoError, err:
3560                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3561
3562
3563 class SoundcloudIE(InfoExtractor):
3564         """Information extractor for soundcloud.com
3565            To access the media, the uid of the song and a stream token
3566            must be extracted from the page source and the script must make
3567            a request to media.soundcloud.com/crossdomain.xml. Then
3568            the media can be grabbed by requesting from an url composed
3569            of the stream token and uid
3570          """
3571
3572         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3573         IE_NAME = u'soundcloud'
3574
3575         def __init__(self, downloader=None):
3576                 InfoExtractor.__init__(self, downloader)
3577
3578         def report_webpage(self, video_id):
3579                 """Report information extraction."""
3580                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3581
3582         def report_extraction(self, video_id):
3583                 """Report information extraction."""
3584                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3585
3586         def _real_extract(self, url):
3587                 htmlParser = HTMLParser.HTMLParser()
3588
3589                 mobj = re.match(self._VALID_URL, url)
3590                 if mobj is None:
3591                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3592                         return
3593
3594                 # extract uploader (which is in the url)
3595                 uploader = mobj.group(1).decode('utf-8')
3596                 # extract simple title (uploader + slug of song title)
3597                 slug_title =  mobj.group(2).decode('utf-8')
3598                 simple_title = uploader + '-' + slug_title
3599
3600                 self.report_webpage('%s/%s' % (uploader, slug_title))
3601
3602                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3603                 try:
3604                         webpage = urllib2.urlopen(request).read()
3605                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3606                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3607                         return
3608
3609                 self.report_extraction('%s/%s' % (uploader, slug_title))
3610
3611                 # extract uid and stream token that soundcloud hands out for access
3612                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3613                 if mobj:
3614                         video_id = mobj.group(1)
3615                         stream_token = mobj.group(2)
3616
3617                 # extract unsimplified title
3618                 mobj = re.search('"title":"(.*?)",', webpage)
3619                 if mobj:
3620                         title = mobj.group(1)
3621
3622                 # construct media url (with uid/token)
3623                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3624                 mediaURL = mediaURL % (video_id, stream_token)
3625
3626                 # description
3627                 description = u'No description available'
3628                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3629                 if mobj:
3630                         description = mobj.group(1)
3631
3632                 # upload date
3633                 upload_date = None
3634                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3635                 if mobj:
3636                         try:
3637                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3638                         except Exception, e:
3639                                 print str(e)
3640
3641                 # for soundcloud, a request to a cross domain is required for cookies
3642                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3643
3644                 try:
3645                         self._downloader.process_info({
3646                                 'id':           video_id.decode('utf-8'),
3647                                 'url':          mediaURL,
3648                                 'uploader':     uploader.decode('utf-8'),
3649                                 'upload_date':  upload_date,
3650                                 'title':        simple_title.decode('utf-8'),
3651                                 'stitle':       simple_title.decode('utf-8'),
3652                                 'ext':          u'mp3',
3653                                 'format':       u'NA',
3654                                 'player_url':   None,
3655                                 'description': description.decode('utf-8')
3656                         })
3657                 except UnavailableVideoError:
3658                         self._downloader.trouble(u'\nERROR: unable to download video')
3659
3660
3661 class InfoQIE(InfoExtractor):
3662         """Information extractor for infoq.com"""
3663
3664         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3665         IE_NAME = u'infoq'
3666
3667         def report_webpage(self, video_id):
3668                 """Report information extraction."""
3669                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3670
3671         def report_extraction(self, video_id):
3672                 """Report information extraction."""
3673                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3674
3675         def _real_extract(self, url):
3676                 htmlParser = HTMLParser.HTMLParser()
3677
3678                 mobj = re.match(self._VALID_URL, url)
3679                 if mobj is None:
3680                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3681                         return
3682
3683                 self.report_webpage(url)
3684
3685                 request = urllib2.Request(url)
3686                 try:
3687                         webpage = urllib2.urlopen(request).read()
3688                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3689                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3690                         return
3691
3692                 self.report_extraction(url)
3693
3694
3695                 # Extract video URL
3696                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3697                 if mobj is None:
3698                         self._downloader.trouble(u'ERROR: unable to extract video url')
3699                         return
3700                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3701
3702
3703                 # Extract title
3704                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3705                 if mobj is None:
3706                         self._downloader.trouble(u'ERROR: unable to extract video title')
3707                         return
3708                 video_title = mobj.group(1).decode('utf-8')
3709
3710                 # Extract description
3711                 video_description = u'No description available.'
3712                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3713                 if mobj is not None:
3714                         video_description = mobj.group(1).decode('utf-8')
3715
3716                 video_filename = video_url.split('/')[-1]
3717                 video_id, extension = video_filename.split('.')
3718
3719                 self._downloader.increment_downloads()
3720                 info = {
3721                         'id': video_id,
3722                         'url': video_url,
3723                         'uploader': None,
3724                         'upload_date': None,
3725                         'title': video_title,
3726                         'stitle': _simplify_title(video_title),
3727                         'ext': extension,
3728                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3729                         'thumbnail': None,
3730                         'description': video_description,
3731                         'player_url': None,
3732                 }
3733
3734                 try:
3735                         self._downloader.process_info(info)
3736                 except UnavailableVideoError, err:
3737                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3738
3739 class MixcloudIE(InfoExtractor):
3740         """Information extractor for www.mixcloud.com"""
3741         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3742         IE_NAME = u'mixcloud'
3743
3744         def __init__(self, downloader=None):
3745                 InfoExtractor.__init__(self, downloader)
3746
3747         def report_download_json(self, file_id):
3748                 """Report JSON download."""
3749                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3750
3751         def report_extraction(self, file_id):
3752                 """Report information extraction."""
3753                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3754
3755         def get_urls(self, jsonData, fmt, bitrate='best'):
3756                 """Get urls from 'audio_formats' section in json"""
3757                 file_url = None
3758                 try:
3759                         bitrate_list = jsonData[fmt]
3760                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3761                                 bitrate = max(bitrate_list) # select highest
3762
3763                         url_list = jsonData[fmt][bitrate]
3764                 except TypeError: # we have no bitrate info.
3765                         url_list = jsonData[fmt]
3766
3767                 return url_list
3768
3769         def check_urls(self, url_list):
3770                 """Returns 1st active url from list"""
3771                 for url in url_list:
3772                         try:
3773                                 urllib2.urlopen(url)
3774                                 return url
3775                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3776                                 url = None
3777
3778                 return None
3779
3780         def _print_formats(self, formats):
3781                 print 'Available formats:'
3782                 for fmt in formats.keys():
3783                         for b in formats[fmt]:
3784                                 try:
3785                                         ext = formats[fmt][b][0]
3786                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3787                                 except TypeError: # we have no bitrate info
3788                                         ext = formats[fmt][0]
3789                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3790                                         break
3791
3792         def _real_extract(self, url):
3793                 mobj = re.match(self._VALID_URL, url)
3794                 if mobj is None:
3795                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3796                         return
3797                 # extract uploader & filename from url
3798                 uploader = mobj.group(1).decode('utf-8')
3799                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3800
3801                 # construct API request
3802                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3803                 # retrieve .json file with links to files
3804                 request = urllib2.Request(file_url)
3805                 try:
3806                         self.report_download_json(file_url)
3807                         jsonData = urllib2.urlopen(request).read()
3808                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3809                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3810                         return
3811
3812                 # parse JSON
3813                 json_data = json.loads(jsonData)
3814                 player_url = json_data['player_swf_url']
3815                 formats = dict(json_data['audio_formats'])
3816
3817                 req_format = self._downloader.params.get('format', None)
3818                 bitrate = None
3819
3820                 if self._downloader.params.get('listformats', None):
3821                         self._print_formats(formats)
3822                         return
3823
3824                 if req_format is None or req_format == 'best':
3825                         for format_param in formats.keys():
3826                                 url_list = self.get_urls(formats, format_param)
3827                                 # check urls
3828                                 file_url = self.check_urls(url_list)
3829                                 if file_url is not None:
3830                                         break # got it!
3831                 else:
3832                         if req_format not in formats.keys():
3833                                 self._downloader.trouble(u'ERROR: format is not available')
3834                                 return
3835
3836                         url_list = self.get_urls(formats, req_format)
3837                         file_url = self.check_urls(url_list)
3838                         format_param = req_format
3839
3840                 # We have audio
3841                 self._downloader.increment_downloads()
3842                 try:
3843                         # Process file information
3844                         self._downloader.process_info({
3845                                 'id': file_id.decode('utf-8'),
3846                                 'url': file_url.decode('utf-8'),
3847                                 'uploader':     uploader.decode('utf-8'),
3848                                 'upload_date': u'NA',
3849                                 'title': json_data['name'],
3850                                 'stitle': _simplify_title(json_data['name']),
3851                                 'ext': file_url.split('.')[-1].decode('utf-8'),
3852                                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3853                                 'thumbnail': json_data['thumbnail_url'],
3854                                 'description': json_data['description'],
3855                                 'player_url': player_url.decode('utf-8'),
3856                         })
3857                 except UnavailableVideoError, err:
3858                         self._downloader.trouble(u'ERROR: unable to download file')
3859
3860 class StanfordOpenClassroomIE(InfoExtractor):
3861         """Information extractor for Stanford's Open ClassRoom"""
3862
3863         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3864         IE_NAME = u'stanfordoc'
3865
3866         def report_download_webpage(self, objid):
3867                 """Report information extraction."""
3868                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3869
3870         def report_extraction(self, video_id):
3871                 """Report information extraction."""
3872                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3873
3874         def _real_extract(self, url):
3875                 mobj = re.match(self._VALID_URL, url)
3876                 if mobj is None:
3877                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3878                         return
3879
3880                 if mobj.group('course') and mobj.group('video'): # A specific video
3881                         course = mobj.group('course')
3882                         video = mobj.group('video')
3883                         info = {
3884                                 'id': _simplify_title(course + '_' + video),
3885                         }
3886
3887                         self.report_extraction(info['id'])
3888                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3889                         xmlUrl = baseUrl + video + '.xml'
3890                         try:
3891                                 metaXml = urllib2.urlopen(xmlUrl).read()
3892                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3893                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3894                                 return
3895                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
3896                         try:
3897                                 info['title'] = mdoc.findall('./title')[0].text
3898                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3899                         except IndexError:
3900                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3901                                 return
3902                         info['stitle'] = _simplify_title(info['title'])
3903                         info['ext'] = info['url'].rpartition('.')[2]
3904                         info['format'] = info['ext']
3905                         self._downloader.increment_downloads()
3906                         try:
3907                                 self._downloader.process_info(info)
3908                         except UnavailableVideoError, err:
3909                                 self._downloader.trouble(u'\nERROR: unable to download video')
3910                 elif mobj.group('course'): # A course page
3911                         unescapeHTML = HTMLParser.HTMLParser().unescape
3912
3913                         course = mobj.group('course')
3914                         info = {
3915                                 'id': _simplify_title(course),
3916                                 'type': 'playlist',
3917                         }
3918
3919                         self.report_download_webpage(info['id'])
3920                         try:
3921                                 coursepage = urllib2.urlopen(url).read()
3922                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3923                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3924                                 return
3925
3926                         m = re.search('<h1>([^<]+)</h1>', coursepage)
3927                         if m:
3928                                 info['title'] = unescapeHTML(m.group(1))
3929                         else:
3930                                 info['title'] = info['id']
3931                         info['stitle'] = _simplify_title(info['title'])
3932
3933                         m = re.search('<description>([^<]+)</description>', coursepage)
3934                         if m:
3935                                 info['description'] = unescapeHTML(m.group(1))
3936
3937                         links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3938                         info['list'] = [
3939                                 {
3940                                         'type': 'reference',
3941                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3942                                 }
3943                                         for vpage in links]
3944
3945                         for entry in info['list']:
3946                                 assert entry['type'] == 'reference'
3947                                 self.extract(entry['url'])
3948                 else: # Root page
3949                         unescapeHTML = HTMLParser.HTMLParser().unescape
3950
3951                         info = {
3952                                 'id': 'Stanford OpenClassroom',
3953                                 'type': 'playlist',
3954                         }
3955
3956                         self.report_download_webpage(info['id'])
3957                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3958                         try:
3959                                 rootpage = urllib2.urlopen(rootURL).read()
3960                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3961                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3962                                 return
3963
3964                         info['title'] = info['id']
3965                         info['stitle'] = _simplify_title(info['title'])
3966
3967                         links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3968                         info['list'] = [
3969                                 {
3970                                         'type': 'reference',
3971                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3972                                 }
3973                                         for cpage in links]
3974
3975                         for entry in info['list']:
3976                                 assert entry['type'] == 'reference'
3977                                 self.extract(entry['url'])
3978
3979 class MTVIE(InfoExtractor):
3980         """Information extractor for MTV.com"""
3981
3982         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3983         IE_NAME = u'mtv'
3984
3985         def report_webpage(self, video_id):
3986                 """Report information extraction."""
3987                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3988
3989         def report_extraction(self, video_id):
3990                 """Report information extraction."""
3991                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3992
3993         def _real_extract(self, url):
3994                 mobj = re.match(self._VALID_URL, url)
3995                 if mobj is None:
3996                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3997                         return
3998                 if not mobj.group('proto'):
3999                         url = 'http://' + url
4000                 video_id = mobj.group('videoid')
4001                 self.report_webpage(video_id)
4002
4003                 request = urllib2.Request(url)
4004                 try:
4005                         webpage = urllib2.urlopen(request).read()
4006                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4007                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4008                         return
4009
4010                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4011                 if mobj is None:
4012                         self._downloader.trouble(u'ERROR: unable to extract song name')
4013                         return
4014                 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4015                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4016                 if mobj is None:
4017                         self._downloader.trouble(u'ERROR: unable to extract performer')
4018                         return
4019                 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4020                 video_title = performer + ' - ' + song_name
4021
4022                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4023                 if mobj is None:
4024                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4025                         return
4026                 mtvn_uri = mobj.group(1)
4027
4028                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4029                 if mobj is None:
4030                         self._downloader.trouble(u'ERROR: unable to extract content id')
4031                         return
4032                 content_id = mobj.group(1)
4033
4034                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4035                 self.report_extraction(video_id)
4036                 request = urllib2.Request(videogen_url)
4037                 try:
4038                         metadataXml = urllib2.urlopen(request).read()
4039                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4040                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4041                         return
4042
4043                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4044                 renditions = mdoc.findall('.//rendition')
4045
4046                 # For now, always pick the highest quality.
4047                 rendition = renditions[-1]
4048
4049                 try:
4050                         _,_,ext = rendition.attrib['type'].partition('/')
4051                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4052                         video_url = rendition.find('./src').text
4053                 except KeyError:
4054                         self._downloader.trouble('Invalid rendition field.')
4055                         return
4056
4057                 self._downloader.increment_downloads()
4058                 info = {
4059                         'id': video_id,
4060                         'url': video_url,
4061                         'uploader': performer,
4062                         'title': video_title,
4063                         'stitle': _simplify_title(video_title),
4064                         'ext': ext,
4065                         'format': format,
4066                 }
4067
4068                 try:
4069                         self._downloader.process_info(info)
4070                 except UnavailableVideoError, err:
4071                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4072
4073
4074 class PostProcessor(object):
4075         """Post Processor class.
4076
4077         PostProcessor objects can be added to downloaders with their
4078         add_post_processor() method. When the downloader has finished a
4079         successful download, it will take its internal chain of PostProcessors
4080         and start calling the run() method on each one of them, first with
4081         an initial argument and then with the returned value of the previous
4082         PostProcessor.
4083
4084         The chain will be stopped if one of them ever returns None or the end
4085         of the chain is reached.
4086
4087         PostProcessor objects follow a "mutual registration" process similar
4088         to InfoExtractor objects.
4089         """
4090
4091         _downloader = None
4092
4093         def __init__(self, downloader=None):
4094                 self._downloader = downloader
4095
4096         def set_downloader(self, downloader):
4097                 """Sets the downloader for this PP."""
4098                 self._downloader = downloader
4099
4100         def run(self, information):
4101                 """Run the PostProcessor.
4102
4103                 The "information" argument is a dictionary like the ones
4104                 composed by InfoExtractors. The only difference is that this
4105                 one has an extra field called "filepath" that points to the
4106                 downloaded file.
4107
4108                 When this method returns None, the postprocessing chain is
4109                 stopped. However, this method may return an information
4110                 dictionary that will be passed to the next postprocessing
4111                 object in the chain. It can be the one it received after
4112                 changing some fields.
4113
4114                 In addition, this method may raise a PostProcessingError
4115                 exception that will be taken into account by the downloader
4116                 it was called from.
4117                 """
4118                 return information # by default, do nothing
4119
4120 class AudioConversionError(BaseException):
4121         def __init__(self, message):
4122                 self.message = message
4123
4124 class FFmpegExtractAudioPP(PostProcessor):
4125
4126         def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4127                 PostProcessor.__init__(self, downloader)
4128                 if preferredcodec is None:
4129                         preferredcodec = 'best'
4130                 self._preferredcodec = preferredcodec
4131                 self._preferredquality = preferredquality
4132                 self._keepvideo = keepvideo
4133
4134         @staticmethod
4135         def get_audio_codec(path):
4136                 try:
4137                         cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4138                         handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4139                         output = handle.communicate()[0]
4140                         if handle.wait() != 0:
4141                                 return None
4142                 except (IOError, OSError):
4143                         return None
4144                 audio_codec = None
4145                 for line in output.split('\n'):
4146                         if line.startswith('codec_name='):
4147                                 audio_codec = line.split('=')[1].strip()
4148                         elif line.strip() == 'codec_type=audio' and audio_codec is not None:
4149                                 return audio_codec
4150                 return None
4151
4152         @staticmethod
4153         def run_ffmpeg(path, out_path, codec, more_opts):
4154                 if codec is None:
4155                         acodec_opts = []
4156                 else:
4157                         acodec_opts = ['-acodec', codec]
4158                 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4159                 try:
4160                         p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4161                         stdout,stderr = p.communicate()
4162                 except (IOError, OSError):
4163                         e = sys.exc_info()[1]
4164                         if isinstance(e, OSError) and e.errno == 2:
4165                                 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
4166                         else:
4167                                 raise e
4168                 if p.returncode != 0:
4169                         msg = stderr.strip().split('\n')[-1]
4170                         raise AudioConversionError(msg)
4171
4172         def run(self, information):
4173                 path = information['filepath']
4174
4175                 filecodec = self.get_audio_codec(path)
4176                 if filecodec is None:
4177                         self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
4178                         return None
4179
4180                 more_opts = []
4181                 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4182                         if self._preferredcodec == 'm4a' and filecodec == 'aac':
4183                                 # Lossless, but in another container
4184                                 acodec = 'copy'
4185                                 extension = self._preferredcodec
4186                                 more_opts = ['-absf', 'aac_adtstoasc']
4187                         elif filecodec in ['aac', 'mp3', 'vorbis']:
4188                                 # Lossless if possible
4189                                 acodec = 'copy'
4190                                 extension = filecodec
4191                                 if filecodec == 'aac':
4192                                         more_opts = ['-f', 'adts']
4193                                 if filecodec == 'vorbis':
4194                                         extension = 'ogg'
4195                         else:
4196                                 # MP3 otherwise.
4197                                 acodec = 'libmp3lame'
4198                                 extension = 'mp3'
4199                                 more_opts = []
4200                                 if self._preferredquality is not None:
4201                                         more_opts += ['-ab', self._preferredquality]
4202                 else:
4203                         # We convert the audio (lossy)
4204                         acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4205                         extension = self._preferredcodec
4206                         more_opts = []
4207                         if self._preferredquality is not None:
4208                                 more_opts += ['-ab', self._preferredquality]
4209                         if self._preferredcodec == 'aac':
4210                                 more_opts += ['-f', 'adts']
4211                         if self._preferredcodec == 'm4a':
4212                                 more_opts += ['-absf', 'aac_adtstoasc']
4213                         if self._preferredcodec == 'vorbis':
4214                                 extension = 'ogg'
4215                         if self._preferredcodec == 'wav':
4216                                 extension = 'wav'
4217                                 more_opts += ['-f', 'wav']
4218
4219                 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4220                 new_path = prefix + sep + extension
4221                 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4222                 try:
4223                         self.run_ffmpeg(path, new_path, acodec, more_opts)
4224                 except:
4225                         etype,e,tb = sys.exc_info()
4226                         if isinstance(e, AudioConversionError):
4227                                 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4228                         else:
4229                                 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4230                         return None
4231
4232                 # Try to update the date time for extracted audio file.
4233                 if information.get('filetime') is not None:
4234                         try:
4235                                 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4236                         except:
4237                                 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4238
4239                 if not self._keepvideo:
4240                         try:
4241                                 os.remove(_encodeFilename(path))
4242                         except (IOError, OSError):
4243                                 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
4244                                 return None
4245
4246                 information['filepath'] = new_path
4247                 return information
4248
4249
4250 def updateSelf(downloader, filename):
4251         ''' Update the program file with the latest version from the repository '''
4252         # Note: downloader only used for options
4253         if not os.access(filename, os.W_OK):
4254                 sys.exit('ERROR: no write permissions on %s' % filename)
4255
4256         downloader.to_screen(u'Updating to latest version...')
4257
4258         try:
4259                 try:
4260                         urlh = urllib.urlopen(UPDATE_URL)
4261                         newcontent = urlh.read()
4262
4263                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
4264                         if vmatch is not None and vmatch.group(1) == __version__:
4265                                 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4266                                 return
4267                 finally:
4268                         urlh.close()
4269         except (IOError, OSError), err:
4270                 sys.exit('ERROR: unable to download latest version')
4271
4272         try:
4273                 outf = open(filename, 'wb')
4274                 try:
4275                         outf.write(newcontent)
4276                 finally:
4277                         outf.close()
4278         except (IOError, OSError), err:
4279                 sys.exit('ERROR: unable to overwrite current version')
4280
4281         downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4282
4283 def parseOpts():
4284         def _readOptions(filename_bytes):
4285                 try:
4286                         optionf = open(filename_bytes)
4287                 except IOError:
4288                         return [] # silently skip if file is not present
4289                 try:
4290                         res = []
4291                         for l in optionf:
4292                                 res += shlex.split(l, comments=True)
4293                 finally:
4294                         optionf.close()
4295                 return res
4296
4297         def _format_option_string(option):
4298                 ''' ('-o', '--option') -> -o, --format METAVAR'''
4299
4300                 opts = []
4301
4302                 if option._short_opts: opts.append(option._short_opts[0])
4303                 if option._long_opts: opts.append(option._long_opts[0])
4304                 if len(opts) > 1: opts.insert(1, ', ')
4305
4306                 if option.takes_value(): opts.append(' %s' % option.metavar)
4307
4308                 return "".join(opts)
4309
4310         def _find_term_columns():
4311                 columns = os.environ.get('COLUMNS', None)
4312                 if columns:
4313                         return int(columns)
4314
4315                 try:
4316                         sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4317                         out,err = sp.communicate()
4318                         return int(out.split()[1])
4319                 except:
4320                         pass
4321                 return None
4322
4323         max_width = 80
4324         max_help_position = 80
4325
4326         # No need to wrap help messages if we're on a wide console
4327         columns = _find_term_columns()
4328         if columns: max_width = columns
4329
4330         fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4331         fmt.format_option_strings = _format_option_string
4332
4333         kw = {
4334                 'version'   : __version__,
4335                 'formatter' : fmt,
4336                 'usage' : '%prog [options] url [url...]',
4337                 'conflict_handler' : 'resolve',
4338         }
4339
4340         parser = optparse.OptionParser(**kw)
4341
4342         # option groups
4343         general        = optparse.OptionGroup(parser, 'General Options')
4344         selection      = optparse.OptionGroup(parser, 'Video Selection')
4345         authentication = optparse.OptionGroup(parser, 'Authentication Options')
4346         video_format   = optparse.OptionGroup(parser, 'Video Format Options')
4347         postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
4348         filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
4349         verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4350
4351         general.add_option('-h', '--help',
4352                         action='help', help='print this help text and exit')
4353         general.add_option('-v', '--version',
4354                         action='version', help='print program version and exit')
4355         general.add_option('-U', '--update',
4356                         action='store_true', dest='update_self', help='update this program to latest version')
4357         general.add_option('-i', '--ignore-errors',
4358                         action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4359         general.add_option('-r', '--rate-limit',
4360                         dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4361         general.add_option('-R', '--retries',
4362                         dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4363         general.add_option('--dump-user-agent',
4364                         action='store_true', dest='dump_user_agent',
4365                         help='display the current browser identification', default=False)
4366         general.add_option('--list-extractors',
4367                         action='store_true', dest='list_extractors',
4368                         help='List all supported extractors and the URLs they would handle', default=False)
4369
4370         selection.add_option('--playlist-start',
4371                         dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4372         selection.add_option('--playlist-end',
4373                         dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4374         selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4375         selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4376         selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4377
4378         authentication.add_option('-u', '--username',
4379                         dest='username', metavar='USERNAME', help='account username')
4380         authentication.add_option('-p', '--password',
4381                         dest='password', metavar='PASSWORD', help='account password')
4382         authentication.add_option('-n', '--netrc',
4383                         action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4384
4385
4386         video_format.add_option('-f', '--format',
4387                         action='store', dest='format', metavar='FORMAT', help='video format code')
4388         video_format.add_option('--all-formats',
4389                         action='store_const', dest='format', help='download all available video formats', const='all')
4390         video_format.add_option('--prefer-free-formats',
4391                         action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4392         video_format.add_option('--max-quality',
4393                         action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4394         video_format.add_option('-F', '--list-formats',
4395                         action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4396         video_format.add_option('--write-srt',
4397                         action='store_true', dest='writesubtitles',
4398                         help='write video closed captions to a .srt file (currently youtube only)', default=False)
4399         video_format.add_option('--srt-lang',
4400                         action='store', dest='subtitleslang', metavar='LANG',
4401                         help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4402
4403
4404         verbosity.add_option('-q', '--quiet',
4405                         action='store_true', dest='quiet', help='activates quiet mode', default=False)
4406         verbosity.add_option('-s', '--simulate',
4407                         action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4408         verbosity.add_option('--skip-download',
4409                         action='store_true', dest='skip_download', help='do not download the video', default=False)
4410         verbosity.add_option('-g', '--get-url',
4411                         action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4412         verbosity.add_option('-e', '--get-title',
4413                         action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4414         verbosity.add_option('--get-thumbnail',
4415                         action='store_true', dest='getthumbnail',
4416                         help='simulate, quiet but print thumbnail URL', default=False)
4417         verbosity.add_option('--get-description',
4418                         action='store_true', dest='getdescription',
4419                         help='simulate, quiet but print video description', default=False)
4420         verbosity.add_option('--get-filename',
4421                         action='store_true', dest='getfilename',
4422                         help='simulate, quiet but print output filename', default=False)
4423         verbosity.add_option('--get-format',
4424                         action='store_true', dest='getformat',
4425                         help='simulate, quiet but print output format', default=False)
4426         verbosity.add_option('--no-progress',
4427                         action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4428         verbosity.add_option('--console-title',
4429                         action='store_true', dest='consoletitle',
4430                         help='display progress in console titlebar', default=False)
4431         verbosity.add_option('-v', '--verbose',
4432                         action='store_true', dest='verbose', help='print various debugging information', default=False)
4433
4434
4435         filesystem.add_option('-t', '--title',
4436                         action='store_true', dest='usetitle', help='use title in file name', default=False)
4437         filesystem.add_option('-l', '--literal',
4438                         action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4439         filesystem.add_option('-A', '--auto-number',
4440                         action='store_true', dest='autonumber',
4441                         help='number downloaded files starting from 00000', default=False)
4442         filesystem.add_option('-o', '--output',
4443                         dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4444         filesystem.add_option('-a', '--batch-file',
4445                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4446         filesystem.add_option('-w', '--no-overwrites',
4447                         action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4448         filesystem.add_option('-c', '--continue',
4449                         action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4450         filesystem.add_option('--no-continue',
4451                         action='store_false', dest='continue_dl',
4452                         help='do not resume partially downloaded files (restart from beginning)')
4453         filesystem.add_option('--cookies',
4454                         dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4455         filesystem.add_option('--no-part',
4456                         action='store_true', dest='nopart', help='do not use .part files', default=False)
4457         filesystem.add_option('--no-mtime',
4458                         action='store_false', dest='updatetime',
4459                         help='do not use the Last-modified header to set the file modification time', default=True)
4460         filesystem.add_option('--write-description',
4461                         action='store_true', dest='writedescription',
4462                         help='write video description to a .description file', default=False)
4463         filesystem.add_option('--write-info-json',
4464                         action='store_true', dest='writeinfojson',
4465                         help='write video metadata to a .info.json file', default=False)
4466
4467
4468         postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4469                         help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4470         postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4471                         help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4472         postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4473                         help='ffmpeg audio bitrate specification, 128k by default')
4474         postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4475                         help='keeps the video file on disk after the post-processing; the video is erased by default')
4476
4477
4478         parser.add_option_group(general)
4479         parser.add_option_group(selection)
4480         parser.add_option_group(filesystem)
4481         parser.add_option_group(verbosity)
4482         parser.add_option_group(video_format)
4483         parser.add_option_group(authentication)
4484         parser.add_option_group(postproc)
4485
4486         xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4487         if xdg_config_home:
4488                 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4489         else:
4490                 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4491         argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4492         opts, args = parser.parse_args(argv)
4493
4494         return parser, opts, args
4495
4496 def gen_extractors():
4497         """ Return a list of an instance of every supported extractor.
4498         The order does matter; the first extractor matched is the one handling the URL.
4499         """
4500         youtube_ie = YoutubeIE()
4501         google_ie = GoogleIE()
4502         yahoo_ie = YahooIE()
4503         return [
4504                 YoutubePlaylistIE(youtube_ie),
4505                 YoutubeUserIE(youtube_ie),
4506                 YoutubeSearchIE(youtube_ie),
4507                 youtube_ie,
4508                 MetacafeIE(youtube_ie),
4509                 DailymotionIE(),
4510                 google_ie,
4511                 GoogleSearchIE(google_ie),
4512                 PhotobucketIE(),
4513                 yahoo_ie,
4514                 YahooSearchIE(yahoo_ie),
4515                 DepositFilesIE(),
4516                 FacebookIE(),
4517                 BlipTVIE(),
4518                 VimeoIE(),
4519                 MyVideoIE(),
4520                 ComedyCentralIE(),
4521                 EscapistIE(),
4522                 CollegeHumorIE(),
4523                 XVideosIE(),
4524                 SoundcloudIE(),
4525                 InfoQIE(),
4526                 MixcloudIE(),
4527                 StanfordOpenClassroomIE(),
4528                 MTVIE(),
4529
4530                 GenericIE()
4531         ]
4532
4533 def _real_main():
4534         parser, opts, args = parseOpts()
4535
4536         # Open appropriate CookieJar
4537         if opts.cookiefile is None:
4538                 jar = cookielib.CookieJar()
4539         else:
4540                 try:
4541                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4542                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4543                                 jar.load()
4544                 except (IOError, OSError), err:
4545                         sys.exit(u'ERROR: unable to open cookie file')
4546
4547         # Dump user agent
4548         if opts.dump_user_agent:
4549                 print std_headers['User-Agent']
4550                 sys.exit(0)
4551
4552         # Batch file verification
4553         batchurls = []
4554         if opts.batchfile is not None:
4555                 try:
4556                         if opts.batchfile == '-':
4557                                 batchfd = sys.stdin
4558                         else:
4559                                 batchfd = open(opts.batchfile, 'r')
4560                         batchurls = batchfd.readlines()
4561                         batchurls = [x.strip() for x in batchurls]
4562                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4563                 except IOError:
4564                         sys.exit(u'ERROR: batch file could not be read')
4565         all_urls = batchurls + args
4566         all_urls = map(lambda url: url.strip(), all_urls)
4567
4568         # General configuration
4569         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4570         proxy_handler = urllib2.ProxyHandler()
4571         opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4572         urllib2.install_opener(opener)
4573         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4574
4575         if opts.verbose:
4576                 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4577
4578         extractors = gen_extractors()
4579
4580         if opts.list_extractors:
4581                 for ie in extractors:
4582                         print(ie.IE_NAME)
4583                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4584                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4585                         for mu in matchedUrls:
4586                                 print(u'  ' + mu)
4587                 sys.exit(0)
4588
4589         # Conflicting, missing and erroneous options
4590         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4591                 parser.error(u'using .netrc conflicts with giving username/password')
4592         if opts.password is not None and opts.username is None:
4593                 parser.error(u'account username missing')
4594         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4595                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4596         if opts.usetitle and opts.useliteral:
4597                 parser.error(u'using title conflicts with using literal title')
4598         if opts.username is not None and opts.password is None:
4599                 opts.password = getpass.getpass(u'Type account password and press return:')
4600         if opts.ratelimit is not None:
4601                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4602                 if numeric_limit is None:
4603                         parser.error(u'invalid rate limit specified')
4604                 opts.ratelimit = numeric_limit
4605         if opts.retries is not None:
4606                 try:
4607                         opts.retries = long(opts.retries)
4608                 except (TypeError, ValueError), err:
4609                         parser.error(u'invalid retry count specified')
4610         try:
4611                 opts.playliststart = int(opts.playliststart)
4612                 if opts.playliststart <= 0:
4613                         raise ValueError(u'Playlist start must be positive')
4614         except (TypeError, ValueError), err:
4615                 parser.error(u'invalid playlist start number specified')
4616         try:
4617                 opts.playlistend = int(opts.playlistend)
4618                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4619                         raise ValueError(u'Playlist end must be greater than playlist start')
4620         except (TypeError, ValueError), err:
4621                 parser.error(u'invalid playlist end number specified')
4622         if opts.extractaudio:
4623                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4624                         parser.error(u'invalid audio format specified')
4625
4626         # File downloader
4627         fd = FileDownloader({
4628                 'usenetrc': opts.usenetrc,
4629                 'username': opts.username,
4630                 'password': opts.password,
4631                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4632                 'forceurl': opts.geturl,
4633                 'forcetitle': opts.gettitle,
4634                 'forcethumbnail': opts.getthumbnail,
4635                 'forcedescription': opts.getdescription,
4636                 'forcefilename': opts.getfilename,
4637                 'forceformat': opts.getformat,
4638                 'simulate': opts.simulate,
4639                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4640                 'format': opts.format,
4641                 'format_limit': opts.format_limit,
4642                 'listformats': opts.listformats,
4643                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4644                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4645                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4646                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4647                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4648                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4649                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4650                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4651                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4652                         or u'%(id)s.%(ext)s'),
4653                 'ignoreerrors': opts.ignoreerrors,
4654                 'ratelimit': opts.ratelimit,
4655                 'nooverwrites': opts.nooverwrites,
4656                 'retries': opts.retries,
4657                 'continuedl': opts.continue_dl,
4658                 'noprogress': opts.noprogress,
4659                 'playliststart': opts.playliststart,
4660                 'playlistend': opts.playlistend,
4661                 'logtostderr': opts.outtmpl == '-',
4662                 'consoletitle': opts.consoletitle,
4663                 'nopart': opts.nopart,
4664                 'updatetime': opts.updatetime,
4665                 'writedescription': opts.writedescription,
4666                 'writeinfojson': opts.writeinfojson,
4667                 'writesubtitles': opts.writesubtitles,
4668                 'subtitleslang': opts.subtitleslang,
4669                 'matchtitle': opts.matchtitle,
4670                 'rejecttitle': opts.rejecttitle,
4671                 'max_downloads': opts.max_downloads,
4672                 'prefer_free_formats': opts.prefer_free_formats,
4673                 'verbose': opts.verbose,
4674                 })
4675         for extractor in extractors:
4676                 fd.add_info_extractor(extractor)
4677
4678         # PostProcessors
4679         if opts.extractaudio:
4680                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4681
4682         # Update version
4683         if opts.update_self:
4684                 updateSelf(fd, sys.argv[0])
4685
4686         # Maybe do nothing
4687         if len(all_urls) < 1:
4688                 if not opts.update_self:
4689                         parser.error(u'you must provide at least one URL')
4690                 else:
4691                         sys.exit()
4692
4693         try:
4694                 retcode = fd.download(all_urls)
4695         except MaxDownloadsReached:
4696                 fd.to_screen(u'--max-download limit reached, aborting.')
4697                 retcode = 101
4698
4699         # Dump cookie jar if requested
4700         if opts.cookiefile is not None:
4701                 try:
4702                         jar.save()
4703                 except (IOError, OSError), err:
4704                         sys.exit(u'ERROR: unable to save cookie jar')
4705
4706         sys.exit(retcode)
4707
4708 def main():
4709         try:
4710                 _real_main()
4711         except DownloadError:
4712                 sys.exit(1)
4713         except SameFileError:
4714                 sys.exit(u'ERROR: fixed output name but more than one file to download')
4715         except KeyboardInterrupt:
4716                 sys.exit(u'\nERROR: Interrupted by user')
4717
4718 if __name__ == '__main__':
4719         main()
4720
4721 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: