2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
57 except ImportError: # Python 2.4
60 import cStringIO as StringIO
64 # parse_qs was moved from the cgi module to the urlparse module recently.
66 from urlparse import parse_qs
68 from cgi import parse_qs
76 import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
90 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
96 def raiseError(msg, i):
97 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
98 def skipSpace(i, expectMore=True):
99 while i < len(s) and s[i] in ' \t\r\n':
103 raiseError('Premature end', i)
105 def decodeEscape(match):
121 return unichr(int(esc[1:5], 16))
122 if len(esc) == 5+6 and esc[5:7] == '\\u':
123 hi = int(esc[1:5], 16)
124 low = int(esc[7:11], 16)
125 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
126 raise ValueError('Unknown escape ' + str(esc))
133 while s[e-bslashes-1] == '\\':
135 if bslashes % 2 == 1:
139 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
140 stri = rexp.sub(decodeEscape, s[i:e])
146 if s[i] == '}': # Empty dictionary
150 raiseError('Expected a string object key', i)
151 i,key = parseString(i)
153 if i >= len(s) or s[i] != ':':
154 raiseError('Expected a colon', i)
161 raiseError('Expected comma or closing curly brace', i)
166 if s[i] == ']': # Empty array
171 i = skipSpace(i) # Raise exception if premature end
175 raiseError('Expected a comma or closing bracket', i)
177 def parseDiscrete(i):
178 for k,v in {'true': True, 'false': False, 'null': None}.items():
179 if s.startswith(k, i):
181 raiseError('Not a boolean (or null)', i)
183 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
185 raiseError('Not a number', i)
187 if '.' in nums or 'e' in nums or 'E' in nums:
188 return (i+len(nums), float(nums))
189 return (i+len(nums), int(nums))
190 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
193 i,res = CHARMAP.get(s[i], parseNumber)(i)
194 i = skipSpace(i, False)
198 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
201 def preferredencoding():
202 """Get preferred encoding.
204 Returns the best encoding scheme for the system, based on
205 locale.getpreferredencoding() and some further tweaks.
207 def yield_preferredencoding():
209 pref = locale.getpreferredencoding()
215 return yield_preferredencoding().next()
218 def htmlentity_transform(matchobj):
219 """Transforms an HTML entity to a Unicode character.
221 This function receives a match object and is intended to be used with
222 the re.sub() function.
224 entity = matchobj.group(1)
226 # Known non-numeric HTML entity
227 if entity in htmlentitydefs.name2codepoint:
228 return unichr(htmlentitydefs.name2codepoint[entity])
231 mobj = re.match(ur'(?u)#(x?\d+)', entity)
233 numstr = mobj.group(1)
234 if numstr.startswith(u'x'):
236 numstr = u'0%s' % numstr
239 return unichr(long(numstr, base))
241 # Unknown entity in name, return its literal representation
242 return (u'&%s;' % entity)
245 def sanitize_title(utitle):
246 """Sanitizes a video title so it could be used as part of a filename."""
247 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
248 return utitle.replace(unicode(os.sep), u'%')
251 def sanitize_open(filename, open_mode):
252 """Try to open the given filename, and slightly tweak it if this fails.
254 Attempts to open the given filename. If this fails, it tries to change
255 the filename slightly, step by step, until it's either able to open it
256 or it fails and raises a final exception, like the standard open()
259 It returns the tuple (stream, definitive_file_name).
263 if sys.platform == 'win32':
265 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
266 return (sys.stdout, filename)
267 stream = open(_encodeFilename(filename), open_mode)
268 return (stream, filename)
269 except (IOError, OSError), err:
270 # In case of error, try to remove win32 forbidden chars
271 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
273 # An exception here should be caught in the caller
274 stream = open(_encodeFilename(filename), open_mode)
275 return (stream, filename)
278 def timeconvert(timestr):
279 """Convert RFC 2822 defined time string into system timestamp"""
281 timetuple = email.utils.parsedate_tz(timestr)
282 if timetuple is not None:
283 timestamp = email.utils.mktime_tz(timetuple)
286 def _simplify_title(title):
287 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
288 return expr.sub(u'_', title).strip(u'_')
290 def _orderedSet(iterable):
291 """ Remove all duplicates from the input iterable """
def _unescapeHTML(s):
	"""
	@param s a string (of type unicode)
	"""
	assert type(s) == type(u'')

	# Delegate entity decoding to the stdlib HTML parser.
	parser = HTMLParser.HTMLParser()
	return parser.unescape(s)
307 def _encodeFilename(s):
309 @param s The name of the file (of type unicode)
312 assert type(s) == type(u'')
314 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
315 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
316 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
317 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
320 return s.encode(sys.getfilesystemencoding(), 'ignore')
322 class DownloadError(Exception):
323 """Download Error exception.
325 This exception may be thrown by FileDownloader objects if they are not
326 configured to continue on errors. They will contain the appropriate
332 class SameFileError(Exception):
333 """Same File exception.
335 This exception will be thrown by FileDownloader objects if they detect
336 multiple files would have to be downloaded to the same file on disk.
341 class PostProcessingError(Exception):
342 """Post Processing exception.
344 This exception may be raised by PostProcessor's .run() method to
345 indicate an error in the postprocessing task.
349 class MaxDownloadsReached(Exception):
350 """ --max-downloads limit has been reached. """
354 class UnavailableVideoError(Exception):
355 """Unavailable Format exception.
357 This exception will be thrown when a video is requested
358 in a format that is not available for that video.
363 class ContentTooShortError(Exception):
364 """Content Too Short exception.
366 This exception may be raised by FileDownloader objects when a file they
367 download is too small for what the server announced first, indicating
368 the connection was probably interrupted.
374 def __init__(self, downloaded, expected):
375 self.downloaded = downloaded
376 self.expected = expected
379 class YoutubeDLHandler(urllib2.HTTPHandler):
380 """Handler for HTTP requests and responses.
382 This class, when installed with an OpenerDirector, automatically adds
383 the standard headers to every HTTP request and handles gzipped and
384 deflated responses from web servers. If compression is to be avoided in
385 a particular request, the original request in the program code only has
386 to include the HTTP header "Youtubedl-No-Compression", which will be
387 removed before making the real request.
389 Part of this code was copied from:
391 http://techknack.net/python-urllib2-handlers/
393 Andrew Rowls, the author of that code, agreed to release it to the
400 return zlib.decompress(data, -zlib.MAX_WBITS)
402 return zlib.decompress(data)
405 def addinfourl_wrapper(stream, headers, url, code):
406 if hasattr(urllib2.addinfourl, 'getcode'):
407 return urllib2.addinfourl(stream, headers, url, code)
408 ret = urllib2.addinfourl(stream, headers, url)
412 def http_request(self, req):
413 for h in std_headers:
416 req.add_header(h, std_headers[h])
417 if 'Youtubedl-no-compression' in req.headers:
418 if 'Accept-encoding' in req.headers:
419 del req.headers['Accept-encoding']
420 del req.headers['Youtubedl-no-compression']
423 def http_response(self, req, resp):
426 if resp.headers.get('Content-encoding', '') == 'gzip':
427 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
428 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
429 resp.msg = old_resp.msg
431 if resp.headers.get('Content-encoding', '') == 'deflate':
432 gz = StringIO.StringIO(self.deflate(resp.read()))
433 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
434 resp.msg = old_resp.msg
438 class FileDownloader(object):
439 """File Downloader class.
441 File downloader objects are the ones responsible of downloading the
442 actual video file and writing it to disk if the user has requested
443 it, among some other tasks. In most cases there should be one per
444 program. As, given a video URL, the downloader doesn't know how to
445 extract all the needed information, task that InfoExtractors do, it
446 has to pass the URL to one of them.
448 For this, file downloader objects have a method that allows
449 InfoExtractors to be registered in a given order. When it is passed
450 a URL, the file downloader handles it to the first InfoExtractor it
451 finds that reports being able to handle it. The InfoExtractor extracts
452 all the information about the video or videos the URL refers to, and
453 asks the FileDownloader to process the video information, possibly
454 downloading the video.
456 File downloaders accept a lot of parameters. In order not to saturate
457 the object constructor with arguments, it receives a dictionary of
458 options instead. These options are available through the params
459 attribute for the InfoExtractors to use. The FileDownloader also
460 registers itself as the downloader in charge for the InfoExtractors
461 that are added to it, so this is a "mutual registration".
465 username: Username for authentication purposes.
466 password: Password for authentication purposes.
467 usenetrc: Use netrc for authentication instead.
468 quiet: Do not print messages to stdout.
469 forceurl: Force printing final URL.
470 forcetitle: Force printing title.
471 forcethumbnail: Force printing thumbnail URL.
472 forcedescription: Force printing description.
473 forcefilename: Force printing final filename.
474 simulate: Do not download the video files.
475 format: Video format code.
476 format_limit: Highest quality format to try.
477 outtmpl: Template for output names.
478 ignoreerrors: Do not stop on download errors.
479 ratelimit: Download speed limit, in bytes/sec.
480 nooverwrites: Prevent overwriting files.
481 retries: Number of times to retry for HTTP error 5xx
482 continuedl: Try to continue downloads if possible.
483 noprogress: Do not print the progress bar.
484 playliststart: Playlist item to start at.
485 playlistend: Playlist item to end at.
486 matchtitle: Download only matching titles.
487 rejecttitle: Reject downloads for matching titles.
488 logtostderr: Log messages to stderr instead of stdout.
489 consoletitle: Display progress in console window's titlebar.
490 nopart: Do not use temporary .part files.
491 updatetime: Use the Last-modified header to set output file timestamps.
492 writedescription: Write the video description to a .description file
493 writeinfojson: Write the video description to a .info.json file
494 writesubtitles: Write the video subtitles to a .srt file
495 subtitleslang: Language of the subtitles to download
501 _download_retcode = None
502 _num_downloads = None
505 def __init__(self, params):
506 """Create a FileDownloader object with the given options."""
509 self._download_retcode = 0
510 self._num_downloads = 0
511 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
515 def format_bytes(bytes):
518 if type(bytes) is str:
523 exponent = long(math.log(bytes, 1024.0))
524 suffix = 'bkMGTPEZY'[exponent]
525 converted = float(bytes) / float(1024 ** exponent)
526 return '%.2f%s' % (converted, suffix)
529 def calc_percent(byte_counter, data_len):
532 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
535 def calc_eta(start, now, total, current):
539 if current == 0 or dif < 0.001: # One millisecond
541 rate = float(current) / dif
542 eta = long((float(total) - float(current)) / rate)
543 (eta_mins, eta_secs) = divmod(eta, 60)
546 return '%02d:%02d' % (eta_mins, eta_secs)
549 def calc_speed(start, now, bytes):
551 if bytes == 0 or dif < 0.001: # One millisecond
552 return '%10s' % '---b/s'
553 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
556 def best_block_size(elapsed_time, bytes):
557 new_min = max(bytes / 2.0, 1.0)
558 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
559 if elapsed_time < 0.001:
561 rate = bytes / elapsed_time
569 def parse_bytes(bytestr):
570 """Parse a string indicating a byte quantity into a long integer."""
571 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
574 number = float(matchobj.group(1))
575 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
576 return long(round(number * multiplier))
578 def add_info_extractor(self, ie):
579 """Add an InfoExtractor object to the end of the list."""
581 ie.set_downloader(self)
583 def add_post_processor(self, pp):
584 """Add a PostProcessor object to the end of the chain."""
586 pp.set_downloader(self)
588 def to_screen(self, message, skip_eol=False):
589 """Print message to stdout if not in quiet mode."""
590 assert type(message) == type(u'')
591 if not self.params.get('quiet', False):
592 terminator = [u'\n', u''][skip_eol]
593 output = message + terminator
595 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
596 output = output.encode(preferredencoding(), 'ignore')
597 self._screen_file.write(output)
598 self._screen_file.flush()
600 def to_stderr(self, message):
601 """Print message to stderr."""
602 print >>sys.stderr, message.encode(preferredencoding())
604 def to_cons_title(self, message):
605 """Set console/terminal window title to message."""
606 if not self.params.get('consoletitle', False):
608 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
609 # c_wchar_p() might not be necessary if `message` is
610 # already of type unicode()
611 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
612 elif 'TERM' in os.environ:
613 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
615 def fixed_template(self):
616 """Checks if the output template is fixed."""
617 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
619 def trouble(self, message=None):
620 """Determine action to take when a download problem appears.
622 Depending on if the downloader has been configured to ignore
623 download errors or not, this method may throw an exception or
624 not when errors are found, after printing the message.
626 if message is not None:
627 self.to_stderr(message)
628 if not self.params.get('ignoreerrors', False):
629 raise DownloadError(message)
630 self._download_retcode = 1
632 def slow_down(self, start_time, byte_counter):
633 """Sleep if the download speed is over the rate limit."""
634 rate_limit = self.params.get('ratelimit', None)
635 if rate_limit is None or byte_counter == 0:
638 elapsed = now - start_time
641 speed = float(byte_counter) / elapsed
642 if speed > rate_limit:
643 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
645 def temp_name(self, filename):
646 """Returns a temporary filename for the given filename."""
647 if self.params.get('nopart', False) or filename == u'-' or \
648 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
650 return filename + u'.part'
652 def undo_temp_name(self, filename):
653 if filename.endswith(u'.part'):
654 return filename[:-len(u'.part')]
657 def try_rename(self, old_filename, new_filename):
659 if old_filename == new_filename:
661 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
662 except (IOError, OSError), err:
663 self.trouble(u'ERROR: unable to rename file')
665 def try_utime(self, filename, last_modified_hdr):
666 """Try to set the last-modified time of the given file."""
667 if last_modified_hdr is None:
669 if not os.path.isfile(_encodeFilename(filename)):
671 timestr = last_modified_hdr
674 filetime = timeconvert(timestr)
678 os.utime(filename, (time.time(), filetime))
683 def report_writedescription(self, descfn):
684 """ Report that the description file is being written """
685 self.to_screen(u'[info] Writing video description to: ' + descfn)
687 def report_writesubtitles(self, srtfn):
688 """ Report that the subtitles file is being written """
689 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
691 def report_writeinfojson(self, infofn):
692 """ Report that the metadata file has been written """
693 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
695 def report_destination(self, filename):
696 """Report destination filename."""
697 self.to_screen(u'[download] Destination: ' + filename)
699 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
700 """Report download progress."""
701 if self.params.get('noprogress', False):
703 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
704 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
705 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
706 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
708 def report_resuming_byte(self, resume_len):
709 """Report attempt to resume at given byte."""
710 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
712 def report_retry(self, count, retries):
713 """Report retry in case of HTTP error 5xx"""
714 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
716 def report_file_already_downloaded(self, file_name):
717 """Report file has already been fully downloaded."""
719 self.to_screen(u'[download] %s has already been downloaded' % file_name)
720 except (UnicodeEncodeError), err:
721 self.to_screen(u'[download] The file has already been downloaded')
723 def report_unable_to_resume(self):
724 """Report it was impossible to resume download."""
725 self.to_screen(u'[download] Unable to resume')
727 def report_finish(self):
728 """Report download finished."""
729 if self.params.get('noprogress', False):
730 self.to_screen(u'[download] Download completed')
734 def increment_downloads(self):
735 """Increment the ordinal that assigns a number to each file."""
736 self._num_downloads += 1
738 def prepare_filename(self, info_dict):
739 """Generate the output filename."""
741 template_dict = dict(info_dict)
742 template_dict['epoch'] = unicode(long(time.time()))
743 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
744 filename = self.params['outtmpl'] % template_dict
746 except (ValueError, KeyError), err:
747 self.trouble(u'ERROR: invalid system charset or erroneous output template')
750 def _match_entry(self, info_dict):
751 """ Returns None iff the file should be downloaded """
753 title = info_dict['title']
754 matchtitle = self.params.get('matchtitle', False)
755 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
756 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
757 rejecttitle = self.params.get('rejecttitle', False)
758 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
759 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
762 def process_info(self, info_dict):
763 """Process a single dictionary returned by an InfoExtractor."""
765 reason = self._match_entry(info_dict)
766 if reason is not None:
767 self.to_screen(u'[download] ' + reason)
770 max_downloads = self.params.get('max_downloads')
771 if max_downloads is not None:
772 if self._num_downloads > int(max_downloads):
773 raise MaxDownloadsReached()
775 filename = self.prepare_filename(info_dict)
778 if self.params.get('forcetitle', False):
779 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
780 if self.params.get('forceurl', False):
781 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
782 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
783 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
784 if self.params.get('forcedescription', False) and 'description' in info_dict:
785 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
786 if self.params.get('forcefilename', False) and filename is not None:
787 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
788 if self.params.get('forceformat', False):
789 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
791 # Do nothing else if in simulate mode
792 if self.params.get('simulate', False):
799 dn = os.path.dirname(_encodeFilename(filename))
800 if dn != '' and not os.path.exists(dn): # dn is already encoded
802 except (OSError, IOError), err:
803 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
806 if self.params.get('writedescription', False):
808 descfn = filename + u'.description'
809 self.report_writedescription(descfn)
810 descfile = open(_encodeFilename(descfn), 'wb')
812 descfile.write(info_dict['description'].encode('utf-8'))
815 except (OSError, IOError):
816 self.trouble(u'ERROR: Cannot write description file ' + descfn)
819 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
820 # subtitles download errors are already managed as troubles in relevant IE
821 # that way it will silently go on when used with unsupporting IE
823 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
824 self.report_writesubtitles(srtfn)
825 srtfile = open(_encodeFilename(srtfn), 'wb')
827 srtfile.write(info_dict['subtitles'].encode('utf-8'))
830 except (OSError, IOError):
831 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
834 if self.params.get('writeinfojson', False):
835 infofn = filename + u'.info.json'
836 self.report_writeinfojson(infofn)
839 except (NameError,AttributeError):
840 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
843 infof = open(_encodeFilename(infofn), 'wb')
845 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
846 json.dump(json_info_dict, infof)
849 except (OSError, IOError):
850 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
853 if not self.params.get('skip_download', False):
854 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
858 success = self._do_download(filename, info_dict)
859 except (OSError, IOError), err:
860 raise UnavailableVideoError
861 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
862 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
864 except (ContentTooShortError, ), err:
865 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
870 self.post_process(filename, info_dict)
871 except (PostProcessingError), err:
872 self.trouble(u'ERROR: postprocessing: %s' % str(err))
875 def download(self, url_list):
876 """Download a given list of URLs."""
877 if len(url_list) > 1 and self.fixed_template():
878 raise SameFileError(self.params['outtmpl'])
881 suitable_found = False
883 # Go to next InfoExtractor if not suitable
884 if not ie.suitable(url):
887 # Suitable InfoExtractor found
888 suitable_found = True
890 # Extract information from URL and process it
893 # Suitable InfoExtractor had been found; go to next URL
896 if not suitable_found:
897 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
899 return self._download_retcode
901 def post_process(self, filename, ie_info):
902 """Run the postprocessing chain on the given file."""
904 info['filepath'] = filename
910 def _download_with_rtmpdump(self, filename, url, player_url):
911 self.report_destination(filename)
912 tmpfilename = self.temp_name(filename)
914 # Check for rtmpdump first
916 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
917 except (OSError, IOError):
918 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
921 # Download using rtmpdump. rtmpdump returns exit code 2 when
922 # the connection was interrumpted and resuming appears to be
923 # possible. This is part of rtmpdump's normal usage, AFAIK.
924 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
925 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
926 if self.params.get('verbose', False):
929 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
932 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
933 retval = subprocess.call(args)
934 while retval == 2 or retval == 1:
935 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
936 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
937 time.sleep(5.0) # This seems to be needed
938 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
939 cursize = os.path.getsize(_encodeFilename(tmpfilename))
940 if prevsize == cursize and retval == 1:
942 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
943 if prevsize == cursize and retval == 2 and cursize > 1024:
944 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
948 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
949 self.try_rename(tmpfilename, filename)
952 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
955 def _do_download(self, filename, info_dict):
956 url = info_dict['url']
957 player_url = info_dict.get('player_url', None)
959 # Check file already present
960 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
961 self.report_file_already_downloaded(filename)
964 # Attempt to download using rtmpdump
965 if url.startswith('rtmp'):
966 return self._download_with_rtmpdump(filename, url, player_url)
968 tmpfilename = self.temp_name(filename)
971 # Do not include the Accept-Encoding header
972 headers = {'Youtubedl-no-compression': 'True'}
973 basic_request = urllib2.Request(url, None, headers)
974 request = urllib2.Request(url, None, headers)
976 # Establish possible resume length
977 if os.path.isfile(_encodeFilename(tmpfilename)):
978 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
984 if self.params.get('continuedl', False):
985 self.report_resuming_byte(resume_len)
986 request.add_header('Range','bytes=%d-' % resume_len)
992 retries = self.params.get('retries', 0)
993 while count <= retries:
994 # Establish connection
996 if count == 0 and 'urlhandle' in info_dict:
997 data = info_dict['urlhandle']
998 data = urllib2.urlopen(request)
1000 except (urllib2.HTTPError, ), err:
1001 if (err.code < 500 or err.code >= 600) and err.code != 416:
1002 # Unexpected HTTP error
1004 elif err.code == 416:
1005 # Unable to resume (requested range not satisfiable)
1007 # Open the connection again without the range header
1008 data = urllib2.urlopen(basic_request)
1009 content_length = data.info()['Content-Length']
1010 except (urllib2.HTTPError, ), err:
1011 if err.code < 500 or err.code >= 600:
1014 # Examine the reported length
1015 if (content_length is not None and
1016 (resume_len - 100 < long(content_length) < resume_len + 100)):
1017 # The file had already been fully downloaded.
1018 # Explanation to the above condition: in issue #175 it was revealed that
1019 # YouTube sometimes adds or removes a few bytes from the end of the file,
1020 # changing the file size slightly and causing problems for some users. So
1021 # I decided to implement a suggested change and consider the file
1022 # completely downloaded if the file size differs less than 100 bytes from
1023 # the one in the hard drive.
1024 self.report_file_already_downloaded(filename)
1025 self.try_rename(tmpfilename, filename)
1028 # The length does not match, we start the download over
1029 self.report_unable_to_resume()
1034 if count <= retries:
1035 self.report_retry(count, retries)
1038 self.trouble(u'ERROR: giving up after %s retries' % retries)
1041 data_len = data.info().get('Content-length', None)
1042 if data_len is not None:
1043 data_len = long(data_len) + resume_len
1044 data_len_str = self.format_bytes(data_len)
1045 byte_counter = 0 + resume_len
1049 # Download and write
1050 before = time.time()
1051 data_block = data.read(block_size)
1053 if len(data_block) == 0:
1055 byte_counter += len(data_block)
1057 # Open file just in time
1060 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1061 assert stream is not None
1062 filename = self.undo_temp_name(tmpfilename)
1063 self.report_destination(filename)
1064 except (OSError, IOError), err:
1065 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1068 stream.write(data_block)
1069 except (IOError, OSError), err:
1070 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1072 block_size = self.best_block_size(after - before, len(data_block))
1075 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1076 if data_len is None:
1077 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1079 percent_str = self.calc_percent(byte_counter, data_len)
1080 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1081 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1084 self.slow_down(start, byte_counter - resume_len)
1087 self.trouble(u'\nERROR: Did not get any data blocks')
1090 self.report_finish()
1091 if data_len is not None and byte_counter != data_len:
1092 raise ContentTooShortError(byte_counter, long(data_len))
1093 self.try_rename(tmpfilename, filename)
1095 # Update file modification time
1096 if self.params.get('updatetime', True):
1097 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods and define a _VALID_URL regexp.
	Probably, they should also be added to the list of extractors.
	"""

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		# Anchored match against the subclass-defined _VALID_URL pattern.
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		self._real_initialize()

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		# May be None; subclasses guard against that before using it.
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
1172 class YoutubeIE(InfoExtractor):
1173 """Information extractor for youtube.com."""
1175 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1176 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1177 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1178 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1179 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
1180 _NETRC_MACHINE = 'youtube'
1181 # Listed in order of quality
1182 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1183 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1184 _video_extensions = {
1190 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1195 _video_dimensions = {
1210 IE_NAME = u'youtube'
1212 def report_lang(self):
1213 """Report attempt to set language."""
1214 self._downloader.to_screen(u'[youtube] Setting language')
1216 def report_login(self):
1217 """Report attempt to log in."""
1218 self._downloader.to_screen(u'[youtube] Logging in')
1220 def report_age_confirmation(self):
1221 """Report attempt to confirm age."""
1222 self._downloader.to_screen(u'[youtube] Confirming age')
1224 def report_video_webpage_download(self, video_id):
1225 """Report attempt to download video webpage."""
1226 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1228 def report_video_info_webpage_download(self, video_id):
1229 """Report attempt to download video info webpage."""
1230 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1236 def report_information_extraction(self, video_id):
1237 """Report attempt to extract video information."""
1238 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for this video."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1244 def report_rtmp_download(self):
1245 """Indicate the download will use the RTMP protocol."""
1246 self._downloader.to_screen(u'[youtube] RTMP download detected')
1248 def _closed_captions_xml_to_srt(self, xml_string):
1250 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1251 # TODO parse xml instead of regex
1252 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1253 if not dur: dur = '4'
1254 start = float(start)
1255 end = start + float(dur)
1256 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1257 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1258 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1259 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1260 srt += str(n) + '\n'
1261 srt += start + ' --> ' + end + '\n'
1262 srt += caption + '\n\n'
	def _print_formats(self, formats):
		"""Print each available format code with its extension and dimensions."""
		print 'Available formats:'
		# NOTE(review): the loop header iterating `formats` (binding x to each
		# itag code) is missing from this listing -- confirm against upstream.
		print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
	def _real_initialize(self):
		"""Set YouTube language to English, log in, and confirm age.

		Credentials come from downloader params ('username'/'password') or,
		with 'usenetrc', from the user's .netrc. Network and auth failures
		are reported as warnings via the downloader (the age check reports
		an error). NOTE(review): this listing is elided -- several try:/
		return/else: scaffolding lines are missing, so indentation below is
		a best-effort reconstruction.
		"""
		if self._downloader is None:
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			info = netrc.netrc().authenticators(self._NETRC_MACHINE)
			if info is not None:
				raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

		# Set language (elided try: around the request)
		request = urllib2.Request(self._LANG_URL)
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

		# No authentication to be performed
		if username is None:

		# Log in (beginning of the login_form dict is elided)
			'current_form': 'loginForm',
			'action_login': 'Log In',
			'username': username,
			'password': password,
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
			login_results = urllib2.urlopen(request).read()
			# A loginForm in the response means the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

		# Confirm age (beginning of the age_form dict is elided)
			'action_confirm': 'Confirm',
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
	def _real_extract(self, url):
		"""Extract video information from a YouTube URL and pass each chosen
		format to the downloader via process_info().

		NOTE(review): this listing is elided -- many scaffolding lines
		(try:/return/else:/`if mobj is None:` guards) are missing, so the
		indentation below is a best-effort reconstruction.
		"""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
		url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group(2)

		# Get video webpage; has_verified=1 skips the age interstitial.
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

		# Get video info: try several el= values until one returns a token.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

		# Start extracting information
		self.report_information_extraction(video_id)

		# Uploader nickname
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# Title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		simple_title = _simplify_title(video_title)

		# Thumbnail: only a warning if missing
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# Upload date: scraped from the watch page, normalized to YYYYMMDD.
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

		# Description: meta tag first, then the full text via lxml if available.
		video_description = u'No description available.'
		mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
		if mobj is not None:
			video_description = mobj.group(1).decode('utf-8')
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# Closed captions, converted to SRT when requested.
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			self.report_video_subtitles_download(video_id)
			request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
				srt_list = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
			srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
			if self._downloader.params.get('subtitleslang', False):
				srt_lang = self._downloader.params.get('subtitleslang')
			elif 'en' in srt_lang_list:
				srt_lang = srt_lang_list[0]
			if not srt_lang in srt_lang_list:
				self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
			request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
				srt_xml = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
			video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
			self._downloader.trouble(u'WARNING: video has no closed captions')

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			# itag code -> direct URL for every stream YouTube offered.
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					video_url_list = [(rf, url_map[rf])]
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			video_extension = self._video_extensions.get(format_param, 'flv')

			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_real_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': upload_date,
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
				'thumbnail': video_thumbnail.decode('utf-8'),
				'description': video_description,
				'player_url': player_url,
				'subtitles': video_subtitles
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# group(1) is the video id; group(2) is the simplified title.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Family-filter disclaimer page fetched during initialization.
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	# Endpoint POSTed to in order to get past the family filter.
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	IE_NAME = u'metacafe'
	def __init__(self, youtube_ie, downloader=None):
		"""Store the YoutubeIE used for 'yt-' prefixed ids; downloader optional."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
1563 def report_disclaimer(self):
1564 """Report disclaimer retrieval."""
1565 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1567 def report_age_confirmation(self):
1568 """Report attempt to confirm age."""
1569 self._downloader.to_screen(u'[metacafe] Confirming age')
1571 def report_download_webpage(self, video_id):
1572 """Report webpage download."""
1573 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1575 def report_extraction(self, video_id):
1576 """Report information extraction."""
1577 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
	def _real_initialize(self):
		"""Fetch the disclaimer page, then POST to disable the family filter.

		NOTE(review): this listing is elided -- the try:/return scaffolding
		and the start of the disclaimer_form dict are missing.
		"""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

		# Confirm age (beginning of the disclaimer_form dict is elided)
			'submit': "Continue - I'm over 18",
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
	def _real_extract(self, url):
		"""Extract a Metacafe video (delegating 'yt-' ids to YoutubeIE) and
		pass its information to the downloader.

		NOTE(review): this listing is elided -- try:/return/`if mobj is
		None:` guards are missing; indentation is a best-effort
		reconstruction.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		# Older pages expose &mediaURL= directly; newer ones embed it in flashvars.
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			video_url = mediaURL
			gdaKey = mobj.group(1)
			video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

		# Process video information
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'uploader': video_uploader.decode('utf-8'),
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# group(1) is the video id; group(2) is the URL slug.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)
1701 def report_download_webpage(self, video_id):
1702 """Report webpage download."""
1703 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1705 def report_extraction(self, video_id):
1706 """Report information extraction."""
1707 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
	def _real_extract(self, url):
		"""Extract a Dailymotion video (SD URL from the 'sequence' flashvar)
		and pass its information to the downloader.

		NOTE(review): this listing is elided -- try:/return/`if mobj is
		None:` guards are missing; indentation is a best-effort
		reconstruction.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Cookie disables the family filter so restricted videos resolve.
		request.add_header('Cookie', 'family_filter=off')
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

		# Process video information
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'uploader': video_uploader.decode('utf-8'),
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# group(1) is the docid (video id), taken from the videoplay URL.
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)
1789 def report_download_webpage(self, video_id):
1790 """Report webpage download."""
1791 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1793 def report_extraction(self, video_id):
1794 """Report information extraction."""
1795 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
	def _real_extract(self, url):
		"""Extract a Google Video entry (mp4 download URL when present,
		otherwise the escaped flv videoUrl) and pass it to the downloader.

		NOTE(review): this listing is elided -- try:/return/`if mobj is
		None:` guards are missing; indentation is a best-effort
		reconstruction.
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
			# No direct download URL: fall back to the escaped flv videoUrl.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the \xNN escaping used inside the page's JavaScript.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video description')
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		# Process video information
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# group(1) is the .flv filename from the ?current= query parameter.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
	IE_NAME = u'photobucket'
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)
1893 def report_download_webpage(self, video_id):
1894 """Report webpage download."""
1895 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1897 def report_extraction(self, video_id):
1898 """Report information extraction."""
1899 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1901 def _real_extract(self, url):
1902 # Extract id from URL
1903 mobj = re.match(self._VALID_URL, url)
1905 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1908 # At this point we have a new video
1909 self._downloader.increment_downloads()
1910 video_id = mobj.group(1)
1912 video_extension = 'flv'
1914 # Retrieve video webpage to extract further information
1915 request = urllib2.Request(url)
1917 self.report_download_webpage(video_id)
1918 webpage = urllib2.urlopen(request).read()
1919 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1920 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1923 # Extract URL, uploader, and title from webpage
1924 self.report_extraction(video_id)
1925 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1927 self._downloader.trouble(u'ERROR: unable to extract media URL')
1929 mediaURL = urllib.unquote(mobj.group(1))
1931 video_url = mediaURL
1933 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1935 self._downloader.trouble(u'ERROR: unable to extract title')
1937 video_title = mobj.group(1).decode('utf-8')
1938 video_title = sanitize_title(video_title)
1939 simple_title = _simplify_title(vide_title)
1941 video_uploader = mobj.group(2).decode('utf-8')
1944 # Process video information
1945 self._downloader.process_info({
1946 'id': video_id.decode('utf-8'),
1947 'url': video_url.decode('utf-8'),
1948 'uploader': video_uploader,
1949 'upload_date': u'NA',
1950 'title': video_title,
1951 'stitle': simple_title,
1952 'ext': video_extension.decode('utf-8'),
1956 except UnavailableVideoError:
1957 self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
	"""Information extractor for video.yahoo.com."""

	# _VALID_URL matches all Yahoo! Video URLs
	# _VPAGE_URL matches only the extractable '/watch/' URLs
	_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
	_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
	IE_NAME = u'video.yahoo'
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)
1972 def report_download_webpage(self, video_id):
1973 """Report webpage download."""
1974 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1976 def report_extraction(self, video_id):
1977 """Report information extraction."""
1978 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1980 def _real_extract(self, url, new_video=True):
1981 # Extract ID from URL
1982 mobj = re.match(self._VALID_URL, url)
1984 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1987 # At this point we have a new video
1988 self._downloader.increment_downloads()
1989 video_id = mobj.group(2)
1990 video_extension = 'flv'
1992 # Rewrite valid but non-extractable URLs as
1993 # extractable English language /watch/ URLs
1994 if re.match(self._VPAGE_URL, url) is None:
1995 request = urllib2.Request(url)
1997 webpage = urllib2.urlopen(request).read()
1998 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1999 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2002 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2004 self._downloader.trouble(u'ERROR: Unable to extract id field')
2006 yahoo_id = mobj.group(1)
2008 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2010 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2012 yahoo_vid = mobj.group(1)
2014 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2015 return self._real_extract(url, new_video=False)
2017 # Retrieve video webpage to extract further information
2018 request = urllib2.Request(url)
2020 self.report_download_webpage(video_id)
2021 webpage = urllib2.urlopen(request).read()
2022 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2023 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2026 # Extract uploader and title from webpage
2027 self.report_extraction(video_id)
2028 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2030 self._downloader.trouble(u'ERROR: unable to extract video title')
2032 video_title = mobj.group(1).decode('utf-8')
2033 simple_title = _simplify_title(video_title)
2035 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2037 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2039 video_uploader = mobj.group(1).decode('utf-8')
2041 # Extract video thumbnail
2042 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2044 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2046 video_thumbnail = mobj.group(1).decode('utf-8')
2048 # Extract video description
2049 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2051 self._downloader.trouble(u'ERROR: unable to extract video description')
2053 video_description = mobj.group(1).decode('utf-8')
2054 if not video_description:
2055 video_description = 'No description available.'
2057 # Extract video height and width
2058 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2060 self._downloader.trouble(u'ERROR: unable to extract video height')
2062 yv_video_height = mobj.group(1)
2064 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2066 self._downloader.trouble(u'ERROR: unable to extract video width')
2068 yv_video_width = mobj.group(1)
2070 # Retrieve video playlist to extract media URL
2071 # I'm not completely sure what all these options are, but we
2072 # seem to need most of them, otherwise the server sends a 401.
2073 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2074 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2075 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2076 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2077 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2079 self.report_download_webpage(video_id)
2080 webpage = urllib2.urlopen(request).read()
2081 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2082 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2085 # Extract media URL from playlist XML
2086 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2088 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2090 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2091 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2094 # Process video information
2095 self._downloader.process_info({
2096 'id': video_id.decode('utf-8'),
2098 'uploader': video_uploader,
2099 'upload_date': u'NA',
2100 'title': video_title,
2101 'stitle': simple_title,
2102 'ext': video_extension.decode('utf-8'),
2103 'thumbnail': video_thumbnail.decode('utf-8'),
2104 'description': video_description,
2105 'thumbnail': video_thumbnail,
2108 except UnavailableVideoError:
2109 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing — the embedded leading numbers are the original
# file's own line numbers; gaps in them mark lines omitted from this view
# (the `try:`, `if mobj is None:` and `return` guard lines, among others).
# Comments below annotate only what is visible.
2112 class VimeoIE(InfoExtractor):
2113 """Information extractor for vimeo.com."""
2115 # _VALID_URL matches Vimeo URLs
2116 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2119 def __init__(self, downloader=None):
2120 InfoExtractor.__init__(self, downloader)
2122 def report_download_webpage(self, video_id):
2123 """Report webpage download."""
2124 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2126 def report_extraction(self, video_id):
2127 """Report information extraction."""
2128 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2130 def _real_extract(self, url, new_video=True):
2131 # Extract ID from URL
2132 mobj = re.match(self._VALID_URL, url)
2134 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2137 # At this point we have a new video
2138 self._downloader.increment_downloads()
2139 video_id = mobj.group(1)
2141 # Retrieve video webpage to extract further information
2142 request = urllib2.Request(url, None, std_headers)
2144 self.report_download_webpage(video_id)
2145 webpage = urllib2.urlopen(request).read()
2146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2147 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2150 # Now we begin extracting as much information as we can from what we
2151 # retrieved. First we extract the information common to all extractors,
2152 # and latter we extract those that are Vimeo specific.
2153 self.report_extraction(video_id)
2155 # Extract the config JSON
# NOTE(review): brittle scrape — relies on the literal markers ' = {config:'
# and ',assets:' appearing in the page source; breaks if Vimeo's markup changes.
2156 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2158 config = json.loads(config)
2160 self._downloader.trouble(u'ERROR: unable to extract info section')
2164 video_title = config["video"]["title"]
2165 simple_title = _simplify_title(video_title)
2168 video_uploader = config["video"]["owner"]["name"]
2170 # Extract video thumbnail
2171 video_thumbnail = config["video"]["thumbnail"]
2173 # Extract video description
2177 video_description = u'No description available.'
2178 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2179 if mobj is not None:
2180 video_description = mobj.group(1)
# NOTE(review): lxml is a third-party dependency; this alternative extraction
# path appears alongside the regex one — control flow between them is elided here.
2182 html_parser = lxml.etree.HTMLParser()
2183 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2184 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2185 # TODO use another parser
2187 # Extract upload date
2188 video_upload_date = u'NA'
2189 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2190 if mobj is not None:
2191 video_upload_date = mobj.group(1)
2193 # Vimeo specific: extract request signature and timestamp
2194 sig = config['request']['signature']
2195 timestamp = config['request']['timestamp']
2197 # Vimeo specific: extract video codec and quality information
2198 # TODO bind to format param
# Preference order: h264 first, then vp8, then vp6; presumably the loop stops
# at the first codec present in config["video"]["files"] (break line elided) — confirm.
2199 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2200 for codec in codecs:
2201 if codec[0] in config["video"]["files"]:
2202 video_codec = codec[0]
2203 video_extension = codec[1]
2204 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2205 else: quality = 'sd'
2208 self._downloader.trouble(u'ERROR: no known codec found')
2211 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2212 %(video_id, sig, timestamp, quality, video_codec.upper())
2215 # Process video information
# NOTE(review): the 'id' and 'url' entries of this dict fall in elided lines
# (original 2217-2218) — do not assume they are missing from the real file.
2216 self._downloader.process_info({
2219 'uploader': video_uploader,
2220 'upload_date': video_upload_date,
2221 'title': video_title,
2222 'stitle': simple_title,
2223 'ext': video_extension,
2224 'thumbnail': video_thumbnail,
2225 'description': video_description,
2228 except UnavailableVideoError:
2229 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted lines (`try:` / `if mobj is None:` / `return`).
2232 class GenericIE(InfoExtractor):
2233 """Generic last-resort information extractor."""
2236 IE_NAME = u'generic'
2238 def __init__(self, downloader=None):
2239 InfoExtractor.__init__(self, downloader)
2241 def report_download_webpage(self, video_id):
2242 """Report webpage download."""
# Deliberately loud: the generic extractor is best-effort, so warn the user first.
2243 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2244 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2246 def report_extraction(self, video_id):
2247 """Report information extraction."""
2248 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2250 def _real_extract(self, url):
2251 # At this point we have a new video
2252 self._downloader.increment_downloads()
2254 video_id = url.split('/')[-1]
2255 request = urllib2.Request(url)
2257 self.report_download_webpage(video_id)
2258 webpage = urllib2.urlopen(request).read()
2259 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2260 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2262 except ValueError, err:
2263 # since this is the last-resort InfoExtractor, if
2264 # this error is thrown, it'll be thrown here
2265 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2268 self.report_extraction(video_id)
2269 # Start with something easy: JW Player in SWFObject
2270 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2272 # Broaden the search a little bit
2273 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2275 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2278 # It's possible that one of the regexes
2279 # matched, but returned an empty group:
2280 if mobj.group(1) is None:
2281 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2284 video_url = urllib.unquote(mobj.group(1))
2285 video_id = os.path.basename(video_url)
2287 # here's a fun little line of code for you:
2288 video_extension = os.path.splitext(video_id)[1][1:]
2289 video_id = os.path.splitext(video_id)[0]
2291 # it's tempting to parse this further, but you would
2292 # have to take into account all the variations like
2293 # Video Title - Site Name
2294 # Site Name | Video Title
2295 # Video Title - Tagline | Site Name
2296 # and so on and so forth; it's just not practical
2297 mobj = re.search(r'<title>(.*)</title>', webpage)
2299 self._downloader.trouble(u'ERROR: unable to extract title')
2301 video_title = mobj.group(1).decode('utf-8')
2302 video_title = sanitize_title(video_title)
2303 simple_title = _simplify_title(video_title)
2305 # video uploader is domain name
2306 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# BUG(review): copy-pasted message — this guard is for the uploader (domain)
# match above, but the error text still says "title". Should read
# 'unable to extract uploader'. (Not changed here: doc-only edit.)
2308 self._downloader.trouble(u'ERROR: unable to extract title')
2310 video_uploader = mobj.group(1).decode('utf-8')
2313 # Process video information
2314 self._downloader.process_info({
2315 'id': video_id.decode('utf-8'),
2316 'url': video_url.decode('utf-8'),
2317 'uploader': video_uploader,
2318 'upload_date': u'NA',
2319 'title': video_title,
2320 'stitle': simple_title,
2321 'ext': video_extension.decode('utf-8'),
2325 except UnavailableVideoError, err:
2326 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted guard/branch lines.
2329 class YoutubeSearchIE(InfoExtractor):
2330 """Information Extractor for YouTube search queries."""
2331 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData v2 API, JSON-C output, 50 results per page (the API's page-size maximum).
2332 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2334 _max_youtube_results = 1000
2335 IE_NAME = u'youtube:search'
2337 def __init__(self, youtube_ie, downloader=None):
2338 InfoExtractor.__init__(self, downloader)
# Delegates actual per-video extraction to the wrapped YoutubeIE instance.
2339 self._youtube_ie = youtube_ie
2341 def report_download_page(self, query, pagenum):
2342 """Report attempt to download playlist page with given number."""
2343 query = query.decode(preferredencoding())
2344 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2346 def _real_initialize(self):
2347 self._youtube_ie.initialize()
2349 def _real_extract(self, query):
2350 mobj = re.match(self._VALID_URL, query)
2352 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2355 prefix, query = query.split(':')
2357 query = query.encode('utf-8')
# NOTE(review): the branch conditions around these calls (empty prefix ->
# single result, numeric prefix -> that many) are partly elided from view.
2359 self._download_n_results(query, 1)
2361 elif prefix == 'all':
2362 self._download_n_results(query, self._max_youtube_results)
2368 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2370 elif n > self._max_youtube_results:
2371 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2372 n = self._max_youtube_results
2373 self._download_n_results(query, n)
2375 except ValueError: # parsing prefix as integer fails
2376 self._download_n_results(query, 1)
2379 def _download_n_results(self, query, n):
2380 """Downloads a specified number of results for a query"""
# Page through the API 50 at a time until `limit` (min of n and totalItems) is met.
2386 while (50 * pagenum) < limit:
2387 self.report_download_page(query, pagenum+1)
2388 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2389 request = urllib2.Request(result_url)
2391 data = urllib2.urlopen(request).read()
2392 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2393 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2395 api_response = json.loads(data)['data']
2397 new_ids = list(video['id'] for video in api_response['items'])
2398 video_ids += new_ids
# Re-clamp the limit once the API reports the true total number of items.
2400 limit = min(n, api_response['totalItems'])
2403 if len(video_ids) > n:
2404 video_ids = video_ids[:n]
2405 for id in video_ids:
2406 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted guard/branch/return lines. Mirrors YoutubeSearchIE
# but scrapes HTML result pages instead of a JSON API.
2410 class GoogleSearchIE(InfoExtractor):
2411 """Information Extractor for Google Video search queries."""
2412 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2413 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2414 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" pager link; absence means the last page was reached.
2415 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2417 _max_google_results = 1000
2418 IE_NAME = u'video.google:search'
2420 def __init__(self, google_ie, downloader=None):
2421 InfoExtractor.__init__(self, downloader)
2422 self._google_ie = google_ie
2424 def report_download_page(self, query, pagenum):
2425 """Report attempt to download playlist page with given number."""
2426 query = query.decode(preferredencoding())
2427 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2429 def _real_initialize(self):
2430 self._google_ie.initialize()
2432 def _real_extract(self, query):
2433 mobj = re.match(self._VALID_URL, query)
2435 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2438 prefix, query = query.split(':')
2440 query = query.encode('utf-8')
2442 self._download_n_results(query, 1)
2444 elif prefix == 'all':
2445 self._download_n_results(query, self._max_google_results)
2451 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2453 elif n > self._max_google_results:
2454 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2455 n = self._max_google_results
2456 self._download_n_results(query, n)
2458 except ValueError: # parsing prefix as integer fails
2459 self._download_n_results(query, 1)
2462 def _download_n_results(self, query, n):
2463 """Downloads a specified number of results for a query"""
2469 self.report_download_page(query, pagenum)
# 10 results per HTML page; 'start' is the zero-based result offset.
2470 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2471 request = urllib2.Request(result_url)
2473 page = urllib2.urlopen(request).read()
2474 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2475 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2478 # Extract video identifiers
2479 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2480 video_id = mobj.group(1)
2481 if video_id not in video_ids:
2482 video_ids.append(video_id)
# Two exit paths: got n ids, or no more pages — either way extract what we have.
# (The `return` lines after each extract loop are elided from this view.)
2483 if len(video_ids) == n:
2484 # Specified n videos reached
2485 for id in video_ids:
2486 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2489 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2490 for id in video_ids:
2491 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2494 pagenum = pagenum + 1
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted guard/branch/return lines. Same shape as
# GoogleSearchIE, for Yahoo! Video search.
2497 class YahooSearchIE(InfoExtractor):
2498 """Information Extractor for Yahoo! Video search queries."""
2499 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2500 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2501 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2502 _MORE_PAGES_INDICATOR = r'\s*Next'
2504 _max_yahoo_results = 1000
2505 IE_NAME = u'video.yahoo:search'
2507 def __init__(self, yahoo_ie, downloader=None):
2508 InfoExtractor.__init__(self, downloader)
2509 self._yahoo_ie = yahoo_ie
2511 def report_download_page(self, query, pagenum):
2512 """Report attempt to download playlist page with given number."""
2513 query = query.decode(preferredencoding())
2514 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2516 def _real_initialize(self):
2517 self._yahoo_ie.initialize()
2519 def _real_extract(self, query):
2520 mobj = re.match(self._VALID_URL, query)
2522 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2525 prefix, query = query.split(':')
2527 query = query.encode('utf-8')
2529 self._download_n_results(query, 1)
2531 elif prefix == 'all':
2532 self._download_n_results(query, self._max_yahoo_results)
2538 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2540 elif n > self._max_yahoo_results:
2541 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2542 n = self._max_yahoo_results
2543 self._download_n_results(query, n)
2545 except ValueError: # parsing prefix as integer fails
2546 self._download_n_results(query, 1)
2549 def _download_n_results(self, query, n):
2550 """Downloads a specified number of results for a query"""
# `already_seen` gives O(1) de-duplication across pages while `video_ids`
# (a list, initialized in an elided line) preserves discovery order.
2553 already_seen = set()
2557 self.report_download_page(query, pagenum)
2558 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2559 request = urllib2.Request(result_url)
2561 page = urllib2.urlopen(request).read()
2562 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2563 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2566 # Extract video identifiers
2567 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2568 video_id = mobj.group(1)
2569 if video_id not in already_seen:
2570 video_ids.append(video_id)
2571 already_seen.add(video_id)
2572 if len(video_ids) == n:
2573 # Specified n videos reached
2574 for id in video_ids:
2575 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2578 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2579 for id in video_ids:
2580 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2583 pagenum = pagenum + 1
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted guard/else/return lines.
2586 class YoutubePlaylistIE(InfoExtractor):
2587 """Information Extractor for YouTube playlists."""
# Group 1: playlist type char (p/a/list); group 2: playlist id; group 3: an
# optional trailing video id (handled as a direct video below).
2589 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2590 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2591 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2592 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2594 IE_NAME = u'youtube:playlist'
2596 def __init__(self, youtube_ie, downloader=None):
2597 InfoExtractor.__init__(self, downloader)
2598 self._youtube_ie = youtube_ie
2600 def report_download_page(self, playlist_id, pagenum):
2601 """Report attempt to download playlist page with given number."""
2602 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2604 def _real_initialize(self):
2605 self._youtube_ie.initialize()
2607 def _real_extract(self, url):
2608 # Extract playlist id
2609 mobj = re.match(self._VALID_URL, url)
2611 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single-video case: URL carried a specific video id, delegate and stop.
2615 if mobj.group(3) is not None:
2616 self._youtube_ie.extract(mobj.group(3))
2619 # Download playlist pages
2620 # prefix is 'p' as default for playlists but there are other types that need extra care
2621 playlist_prefix = mobj.group(1)
2622 if playlist_prefix == 'a':
2623 playlist_access = 'artist'
2625 playlist_prefix = 'p'
2626 playlist_access = 'view_play_list'
2627 playlist_id = mobj.group(2)
2632 self.report_download_page(playlist_id, pagenum)
2633 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2634 request = urllib2.Request(url)
2636 page = urllib2.urlopen(request).read()
2637 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2638 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2641 # Extract video identifiers
2643 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2644 if mobj.group(1) not in ids_in_page:
2645 ids_in_page.append(mobj.group(1))
2646 video_ids.extend(ids_in_page)
2648 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2650 pagenum = pagenum + 1
# Apply the user's --playlist-start/--playlist-end window; playliststart is
# stored 1-based in params, hence the -1; playlistend == -1 means "to the end".
2652 playliststart = self._downloader.params.get('playliststart', 1) - 1
2653 playlistend = self._downloader.params.get('playlistend', -1)
2654 if playlistend == -1:
2655 video_ids = video_ids[playliststart:]
2657 video_ids = video_ids[playliststart:playlistend]
2659 for id in video_ids:
2660 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted loop headers / guard / break lines.
2664 class YoutubeUserIE(InfoExtractor):
2665 """Information Extractor for YouTube users."""
2667 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2668 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps uploads queries at 50 results per request, hence the paging below.
2669 _GDATA_PAGE_SIZE = 50
2670 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2671 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2673 IE_NAME = u'youtube:user'
2675 def __init__(self, youtube_ie, downloader=None):
2676 InfoExtractor.__init__(self, downloader)
2677 self._youtube_ie = youtube_ie
2679 def report_download_page(self, username, start_index):
2680 """Report attempt to download user page."""
2681 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2682 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2684 def _real_initialize(self):
2685 self._youtube_ie.initialize()
2687 def _real_extract(self, url):
2689 mobj = re.match(self._VALID_URL, url)
2691 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2694 username = mobj.group(1)
2696 # Download video ids using YouTube Data API. Result size per
2697 # query is limited (currently to 50 videos) so we need to query
2698 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2705 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2706 self.report_download_page(username, start_index)
2708 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2711 page = urllib2.urlopen(request).read()
2712 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2713 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2716 # Extract video identifiers
2719 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2720 if mobj.group(1) not in ids_in_page:
2721 ids_in_page.append(mobj.group(1))
2723 video_ids.extend(ids_in_page)
2725 # A little optimization - if current page is not
2726 # "full", ie. does not contain PAGE_SIZE video ids then
2727 # we can assume that this page is the last one - there
2728 # are no more ids on further pages - no need to query
2731 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Same --playlist-start/--playlist-end windowing as YoutubePlaylistIE:
# 1-based start in params (hence -1); end == -1 means "to the end".
2736 all_ids_count = len(video_ids)
2737 playliststart = self._downloader.params.get('playliststart', 1) - 1
2738 playlistend = self._downloader.params.get('playlistend', -1)
2740 if playlistend == -1:
2741 video_ids = video_ids[playliststart:]
2743 video_ids = video_ids[playliststart:playlistend]
2745 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2746 (username, all_ids_count, len(video_ids)))
2748 for video_id in video_ids:
2749 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted `try:` / `if mobj is None:` / `return` lines.
2752 class DepositFilesIE(InfoExtractor):
2753 """Information extractor for depositfiles.com"""
2755 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2756 IE_NAME = u'DepositFiles'
2758 def __init__(self, downloader=None):
2759 InfoExtractor.__init__(self, downloader)
2761 def report_download_webpage(self, file_id):
2762 """Report webpage download."""
2763 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2765 def report_extraction(self, file_id):
2766 """Report information extraction."""
2767 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2769 def _real_extract(self, url):
2770 # At this point we have a new file
2771 self._downloader.increment_downloads()
2773 file_id = url.split('/')[-1]
2774 # Rebuild url in english locale
# Forces the /en/ locale so the regexes below match English page markup.
2775 url = 'http://depositfiles.com/en/files/' + file_id
2777 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the "Free download" button.
2778 free_download_indication = { 'gateway_result' : '1' }
2779 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2781 self.report_download_webpage(file_id)
2782 webpage = urllib2.urlopen(request).read()
2783 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2784 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2787 # Search for the real file URL
2788 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2789 if (mobj is None) or (mobj.group(1) is None):
2790 # Try to figure out reason of the error.
# The site explains refusals (rate limit, premium-only, ...) in an
# <strong>Attention...</strong> blurb; surface that text to the user.
2791 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2792 if (mobj is not None) and (mobj.group(1) is not None):
2793 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2794 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2796 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2799 file_url = mobj.group(1)
2800 file_extension = os.path.splitext(file_url)[1][1:]
2802 # Search for file title
2803 mobj = re.search(r'<b title="(.*?)">', webpage)
2805 self._downloader.trouble(u'ERROR: unable to extract title')
2807 file_title = mobj.group(1).decode('utf-8')
2810 # Process file information
# NOTE(review): the 'uploader' entry (original line 2814) is elided from this
# view — do not assume it is missing from the real file.
2811 self._downloader.process_info({
2812 'id': file_id.decode('utf-8'),
2813 'url': file_url.decode('utf-8'),
2815 'upload_date': u'NA',
2816 'title': file_title,
2817 'stitle': file_title,
2818 'ext': file_extension.decode('utf-8'),
2822 except UnavailableVideoError, err:
2823 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted lines (`try:`, `if ... is None:`, `return`, the
# closing brace of _video_extensions, the default assignments for useremail/
# upload_date, the login_form literal, among others).
2826 class FacebookIE(InfoExtractor):
2827 """Information Extractor for Facebook"""
2829 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2830 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2831 _NETRC_MACHINE = 'facebook'
# Ordered best-quality-first; format selection below walks this list.
2832 _available_formats = ['video', 'highqual', 'lowqual']
2833 _video_extensions = {
2838 IE_NAME = u'facebook'
2840 def __init__(self, downloader=None):
2841 InfoExtractor.__init__(self, downloader)
2843 def _reporter(self, message):
2844 """Add header and report message."""
2845 self._downloader.to_screen(u'[facebook] %s' % message)
2847 def report_login(self):
2848 """Report attempt to log in."""
2849 self._reporter(u'Logging in')
2851 def report_video_webpage_download(self, video_id):
2852 """Report attempt to download video webpage."""
2853 self._reporter(u'%s: Downloading video webpage' % video_id)
2855 def report_information_extraction(self, video_id):
2856 """Report attempt to extract video information."""
2857 self._reporter(u'%s: Extracting video information' % video_id)
2859 def _parse_page(self, video_webpage):
2860 """Extract video information from page"""
# Metadata lives in JS calls / markup fragments; each regex captures one field.
2862 data = {'title': r'\("video_title", "(.*?)"\)',
2863 'description': r'<div class="datawrap">(.*?)</div>',
2864 'owner': r'\("video_owner_name", "(.*?)"\)',
2865 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2868 for piece in data.keys():
2869 mobj = re.search(data[piece], video_webpage)
2870 if mobj is not None:
# Values are JS-escaped Unicode inside the (generally utf-8) page.
2871 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2875 for fmt in self._available_formats:
2876 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2877 if mobj is not None:
2878 # URL is in a Javascript segment inside an escaped Unicode format within
2879 # the generally utf-8 page
2880 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2881 video_info['video_urls'] = video_urls
2885 def _real_initialize(self):
2886 if self._downloader is None:
2891 downloader_params = self._downloader.params
2893 # Attempt to use provided username and password or .netrc data
2894 if downloader_params.get('username', None) is not None:
2895 useremail = downloader_params['username']
2896 password = downloader_params['password']
2897 elif downloader_params.get('usenetrc', False):
2899 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2900 if info is not None:
2904 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
# netrc problems are non-fatal: warn and continue without credentials.
2905 except (IOError, netrc.NetrcParseError), err:
2906 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2909 if useremail is None:
2918 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2921 login_results = urllib2.urlopen(request).read()
# Heuristic: if the login form is still present in the response, login failed.
2922 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2923 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2925 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2926 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2929 def _real_extract(self, url):
2930 mobj = re.match(self._VALID_URL, url)
2932 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2934 video_id = mobj.group('ID')
2937 self.report_video_webpage_download(video_id)
2938 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2940 page = urllib2.urlopen(request)
2941 video_webpage = page.read()
2942 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2943 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2946 # Start extracting information
2947 self.report_information_extraction(video_id)
2949 # Extract information
2950 video_info = self._parse_page(video_webpage)
2953 if 'owner' not in video_info:
2954 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2956 video_uploader = video_info['owner']
2959 if 'title' not in video_info:
2960 self._downloader.trouble(u'ERROR: unable to extract video title')
2962 video_title = video_info['title']
2963 video_title = video_title.decode('utf-8')
2964 video_title = sanitize_title(video_title)
2966 simple_title = _simplify_title(video_title)
# Thumbnail is optional: warn and fall back to an empty string.
2969 if 'thumbnail' not in video_info:
2970 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2971 video_thumbnail = ''
2973 video_thumbnail = video_info['thumbnail']
# NOTE(review): the default assignment of `upload_date` (before this guard)
# is elided from view; it is referenced unconditionally at line 3030 below.
2977 if 'upload_date' in video_info:
2978 upload_time = video_info['upload_date']
2979 timetuple = email.utils.parsedate_tz(upload_time)
2980 if timetuple is not None:
2982 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2987 video_description = video_info.get('description', 'No description available.')
2989 url_map = video_info['video_urls']
2990 if len(url_map.keys()) > 0:
# Format selection mirrors the YouTube extractor: apply an optional
# format_limit ceiling, then honor -f (None=best, 'worst', '-1'=all, exact).
2991 # Decide which formats to download
2992 req_format = self._downloader.params.get('format', None)
2993 format_limit = self._downloader.params.get('format_limit', None)
2995 if format_limit is not None and format_limit in self._available_formats:
2996 format_list = self._available_formats[self._available_formats.index(format_limit):]
2998 format_list = self._available_formats
2999 existing_formats = [x for x in format_list if x in url_map]
3000 if len(existing_formats) == 0:
3001 self._downloader.trouble(u'ERROR: no known formats available for video')
3003 if req_format is None:
3004 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3005 elif req_format == 'worst':
3006 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3007 elif req_format == '-1':
3008 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3011 if req_format not in url_map:
3012 self._downloader.trouble(u'ERROR: requested format not available')
3014 video_url_list = [(req_format, url_map[req_format])] # Specific format
3016 for format_param, video_real_url in video_url_list:
3018 # At this point we have a new video
3019 self._downloader.increment_downloads()
3022 video_extension = self._video_extensions.get(format_param, 'mp4')
3025 # Process video information
3026 self._downloader.process_info({
3027 'id': video_id.decode('utf-8'),
3028 'url': video_real_url.decode('utf-8'),
3029 'uploader': video_uploader.decode('utf-8'),
3030 'upload_date': upload_date,
3031 'title': video_title,
3032 'stitle': simple_title,
3033 'ext': video_extension.decode('utf-8'),
3034 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3035 'thumbnail': video_thumbnail.decode('utf-8'),
3036 'description': video_description.decode('utf-8'),
3039 except UnavailableVideoError, err:
3040 self._downloader.trouble(u'\nERROR: unable to download video')
3042 class BlipTVIE(InfoExtractor):
3043 """Information extractor for blip.tv"""
3045 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
3046 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3047 IE_NAME = u'blip.tv'
3049 def report_extraction(self, file_id):
3050 """Report information extraction."""
3051 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3053 def report_direct_download(self, title):
3054 """Report information extraction."""
3055 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3057 def _real_extract(self, url):
3058 mobj = re.match(self._VALID_URL, url)
3060 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3067 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3068 request = urllib2.Request(json_url)
3069 self.report_extraction(mobj.group(1))
3072 urlh = urllib2.urlopen(request)
3073 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3074 basename = url.split('/')[-1]
3075 title,ext = os.path.splitext(basename)
3076 title = title.decode('UTF-8')
3077 ext = ext.replace('.', '')
3078 self.report_direct_download(title)
3083 'stitle': _simplify_title(title),
3087 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3088 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3090 if info is None: # Regular URL
3092 json_code = urlh.read()
3093 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3094 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3098 json_data = json.loads(json_code)
3099 if 'Post' in json_data:
3100 data = json_data['Post']
3104 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3105 video_url = data['media']['url']
3106 umobj = re.match(self._URL_EXT, video_url)
3108 raise ValueError('Can not determine filename extension')
3109 ext = umobj.group(1)
3112 'id': data['item_id'],
3114 'uploader': data['display_name'],
3115 'upload_date': upload_date,
3116 'title': data['title'],
3117 'stitle': _simplify_title(data['title']),
3119 'format': data['media']['mimeType'],
3120 'thumbnail': data['thumbnailUrl'],
3121 'description': data['description'],
3122 'player_url': data['embedUrl']
3124 except (ValueError,KeyError), err:
3125 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3128 self._downloader.increment_downloads()
3131 self._downloader.process_info(info)
3132 except UnavailableVideoError, err:
3133 self._downloader.trouble(u'\nERROR: unable to download video')
3136 class MyVideoIE(InfoExtractor):
3137 """Information Extractor for myvideo.de."""
3139 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3140 IE_NAME = u'myvideo'
3142 def __init__(self, downloader=None):
3143 InfoExtractor.__init__(self, downloader)
3145 def report_download_webpage(self, video_id):
3146 """Report webpage download."""
3147 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3149 def report_extraction(self, video_id):
3150 """Report information extraction."""
3151 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3153 def _real_extract(self,url):
3154 mobj = re.match(self._VALID_URL, url)
3156 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3159 video_id = mobj.group(1)
3162 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3164 self.report_download_webpage(video_id)
3165 webpage = urllib2.urlopen(request).read()
3166 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3167 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3170 self.report_extraction(video_id)
3171 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3174 self._downloader.trouble(u'ERROR: unable to extract media URL')
3176 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3178 mobj = re.search('<title>([^<]+)</title>', webpage)
3180 self._downloader.trouble(u'ERROR: unable to extract title')
3183 video_title = mobj.group(1)
3184 video_title = sanitize_title(video_title)
3186 simple_title = _simplify_title(video_title)
3189 self._downloader.process_info({
3193 'upload_date': u'NA',
3194 'title': video_title,
3195 'stitle': simple_title,
3200 except UnavailableVideoError:
3201 self._downloader.trouble(u'\nERROR: Unable to download video')
3203 class ComedyCentralIE(InfoExtractor):
3204 """Information extractor for The Daily Show and Colbert Report """
3206 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3207 IE_NAME = u'comedycentral'
3209 def report_extraction(self, episode_id):
3210 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3212 def report_config_download(self, episode_id):
3213 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3215 def report_index_download(self, episode_id):
3216 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3218 def report_player_url(self, episode_id):
3219 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3221 def _real_extract(self, url):
3222 mobj = re.match(self._VALID_URL, url)
3224 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3227 if mobj.group('shortname'):
3228 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3229 url = u'http://www.thedailyshow.com/full-episodes/'
3231 url = u'http://www.colbertnation.com/full-episodes/'
3232 mobj = re.match(self._VALID_URL, url)
3233 assert mobj is not None
3235 dlNewest = not mobj.group('episode')
3237 epTitle = mobj.group('showname')
3239 epTitle = mobj.group('episode')
3241 req = urllib2.Request(url)
3242 self.report_extraction(epTitle)
3244 htmlHandle = urllib2.urlopen(req)
3245 html = htmlHandle.read()
3246 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3247 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3250 url = htmlHandle.geturl()
3251 mobj = re.match(self._VALID_URL, url)
3253 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3255 if mobj.group('episode') == '':
3256 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3258 epTitle = mobj.group('episode')
3260 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3261 if len(mMovieParams) == 0:
3262 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3265 playerUrl_raw = mMovieParams[0][0]
3266 self.report_player_url(epTitle)
3268 urlHandle = urllib2.urlopen(playerUrl_raw)
3269 playerUrl = urlHandle.geturl()
3270 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3271 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3274 uri = mMovieParams[0][1]
3275 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3276 self.report_index_download(epTitle)
3278 indexXml = urllib2.urlopen(indexUrl).read()
3279 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3280 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3283 idoc = xml.etree.ElementTree.fromstring(indexXml)
3284 itemEls = idoc.findall('.//item')
3285 for itemEl in itemEls:
3286 mediaId = itemEl.findall('./guid')[0].text
3287 shortMediaId = mediaId.split(':')[-1]
3288 showId = mediaId.split(':')[-2].replace('.com', '')
3289 officialTitle = itemEl.findall('./title')[0].text
3290 officialDate = itemEl.findall('./pubDate')[0].text
3292 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3293 urllib.urlencode({'uri': mediaId}))
3294 configReq = urllib2.Request(configUrl)
3295 self.report_config_download(epTitle)
3297 configXml = urllib2.urlopen(configReq).read()
3298 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3299 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3302 cdoc = xml.etree.ElementTree.fromstring(configXml)
3304 for rendition in cdoc.findall('.//rendition'):
3305 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3309 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3312 # For now, just pick the highest bitrate
3313 format,video_url = turls[-1]
3315 self._downloader.increment_downloads()
3317 effTitle = showId + u'-' + epTitle
3322 'upload_date': officialDate,
3324 'stitle': _simplify_title(effTitle),
3328 'description': officialTitle,
3329 'player_url': playerUrl
3333 self._downloader.process_info(info)
3334 except UnavailableVideoError, err:
3335 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3339 class EscapistIE(InfoExtractor):
3340 """Information extractor for The Escapist """
3342 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3343 IE_NAME = u'escapist'
3345 def report_extraction(self, showName):
3346 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3348 def report_config_download(self, showName):
3349 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3351 def _real_extract(self, url):
3352 htmlParser = HTMLParser.HTMLParser()
3354 mobj = re.match(self._VALID_URL, url)
3356 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3358 showName = mobj.group('showname')
3359 videoId = mobj.group('episode')
3361 self.report_extraction(showName)
3363 webPage = urllib2.urlopen(url).read()
3364 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3365 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3368 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3369 description = htmlParser.unescape(descMatch.group(1))
3370 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3371 imgUrl = htmlParser.unescape(imgMatch.group(1))
3372 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3373 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3374 configUrlMatch = re.search('config=(.*)$', playerUrl)
3375 configUrl = urllib2.unquote(configUrlMatch.group(1))
3377 self.report_config_download(showName)
3379 configJSON = urllib2.urlopen(configUrl).read()
3380 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3381 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3384 # Technically, it's JavaScript, not JSON
3385 configJSON = configJSON.replace("'", '"')
3388 config = json.loads(configJSON)
3389 except (ValueError,), err:
3390 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3393 playlist = config['playlist']
3394 videoUrl = playlist[1]['url']
3396 self._downloader.increment_downloads()
3400 'uploader': showName,
3401 'upload_date': None,
3403 'stitle': _simplify_title(showName),
3406 'thumbnail': imgUrl,
3407 'description': description,
3408 'player_url': playerUrl,
3412 self._downloader.process_info(info)
3413 except UnavailableVideoError, err:
3414 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3417 class CollegeHumorIE(InfoExtractor):
3418 """Information extractor for collegehumor.com"""
3420 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3421 IE_NAME = u'collegehumor'
3423 def report_webpage(self, video_id):
3424 """Report information extraction."""
3425 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3427 def report_extraction(self, video_id):
3428 """Report information extraction."""
3429 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3431 def _real_extract(self, url):
3432 htmlParser = HTMLParser.HTMLParser()
3434 mobj = re.match(self._VALID_URL, url)
3436 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3438 video_id = mobj.group('videoid')
3440 self.report_webpage(video_id)
3441 request = urllib2.Request(url)
3443 webpage = urllib2.urlopen(request).read()
3444 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3445 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3448 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3450 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3452 internal_video_id = m.group('internalvideoid')
3456 'internal_id': internal_video_id,
3459 self.report_extraction(video_id)
3460 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3462 metaXml = urllib2.urlopen(xmlUrl).read()
3463 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3464 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3467 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3469 videoNode = mdoc.findall('./video')[0]
3470 info['description'] = videoNode.findall('./description')[0].text
3471 info['title'] = videoNode.findall('./caption')[0].text
3472 info['stitle'] = _simplify_title(info['title'])
3473 info['url'] = videoNode.findall('./file')[0].text
3474 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3475 info['ext'] = info['url'].rpartition('.')[2]
3476 info['format'] = info['ext']
3478 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3481 self._downloader.increment_downloads()
3484 self._downloader.process_info(info)
3485 except UnavailableVideoError, err:
3486 self._downloader.trouble(u'\nERROR: unable to download video')
3489 class XVideosIE(InfoExtractor):
3490 """Information extractor for xvideos.com"""
3492 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3493 IE_NAME = u'xvideos'
3495 def report_webpage(self, video_id):
3496 """Report information extraction."""
3497 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3499 def report_extraction(self, video_id):
3500 """Report information extraction."""
3501 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3503 def _real_extract(self, url):
3504 htmlParser = HTMLParser.HTMLParser()
3506 mobj = re.match(self._VALID_URL, url)
3508 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3510 video_id = mobj.group(1).decode('utf-8')
3512 self.report_webpage(video_id)
3514 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3516 webpage = urllib2.urlopen(request).read()
3517 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3518 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3521 self.report_extraction(video_id)
3525 mobj = re.search(r'flv_url=(.+?)&', webpage)
3527 self._downloader.trouble(u'ERROR: unable to extract video url')
3529 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3533 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3535 self._downloader.trouble(u'ERROR: unable to extract video title')
3537 video_title = mobj.group(1).decode('utf-8')
3540 # Extract video thumbnail
3541 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3543 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3545 video_thumbnail = mobj.group(1).decode('utf-8')
3549 self._downloader.increment_downloads()
3554 'upload_date': None,
3555 'title': video_title,
3556 'stitle': _simplify_title(video_title),
3559 'thumbnail': video_thumbnail,
3560 'description': None,
3565 self._downloader.process_info(info)
3566 except UnavailableVideoError, err:
3567 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3570 class SoundcloudIE(InfoExtractor):
3571 """Information extractor for soundcloud.com
3572 To access the media, the uid of the song and a stream token
3573 must be extracted from the page source and the script must make
3574 a request to media.soundcloud.com/crossdomain.xml. Then
3575 the media can be grabbed by requesting from an url composed
3576 of the stream token and uid
3579 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3580 IE_NAME = u'soundcloud'
3582 def __init__(self, downloader=None):
3583 InfoExtractor.__init__(self, downloader)
3585 def report_webpage(self, video_id):
3586 """Report information extraction."""
3587 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3589 def report_extraction(self, video_id):
3590 """Report information extraction."""
3591 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3593 def _real_extract(self, url):
3594 htmlParser = HTMLParser.HTMLParser()
3596 mobj = re.match(self._VALID_URL, url)
3598 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3601 # extract uploader (which is in the url)
3602 uploader = mobj.group(1).decode('utf-8')
3603 # extract simple title (uploader + slug of song title)
3604 slug_title = mobj.group(2).decode('utf-8')
3605 simple_title = uploader + '-' + slug_title
3607 self.report_webpage('%s/%s' % (uploader, slug_title))
3609 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3611 webpage = urllib2.urlopen(request).read()
3612 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3613 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3616 self.report_extraction('%s/%s' % (uploader, slug_title))
3618 # extract uid and stream token that soundcloud hands out for access
3619 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3621 video_id = mobj.group(1)
3622 stream_token = mobj.group(2)
3624 # extract unsimplified title
3625 mobj = re.search('"title":"(.*?)",', webpage)
3627 title = mobj.group(1)
3629 # construct media url (with uid/token)
3630 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3631 mediaURL = mediaURL % (video_id, stream_token)
3634 description = u'No description available'
3635 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3637 description = mobj.group(1)
3641 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3644 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3645 except Exception, e:
3648 # for soundcloud, a request to a cross domain is required for cookies
3649 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3652 self._downloader.process_info({
3653 'id': video_id.decode('utf-8'),
3655 'uploader': uploader.decode('utf-8'),
3656 'upload_date': upload_date,
3657 'title': simple_title.decode('utf-8'),
3658 'stitle': simple_title.decode('utf-8'),
3662 'description': description.decode('utf-8')
3664 except UnavailableVideoError:
3665 self._downloader.trouble(u'\nERROR: unable to download video')
3668 class InfoQIE(InfoExtractor):
3669 """Information extractor for infoq.com"""
3671 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3674 def report_webpage(self, video_id):
3675 """Report information extraction."""
3676 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3678 def report_extraction(self, video_id):
3679 """Report information extraction."""
3680 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3682 def _real_extract(self, url):
3683 htmlParser = HTMLParser.HTMLParser()
3685 mobj = re.match(self._VALID_URL, url)
3687 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3690 self.report_webpage(url)
3692 request = urllib2.Request(url)
3694 webpage = urllib2.urlopen(request).read()
3695 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3696 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3699 self.report_extraction(url)
3703 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3705 self._downloader.trouble(u'ERROR: unable to extract video url')
3707 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3711 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3713 self._downloader.trouble(u'ERROR: unable to extract video title')
3715 video_title = mobj.group(1).decode('utf-8')
3717 # Extract description
3718 video_description = u'No description available.'
3719 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3720 if mobj is not None:
3721 video_description = mobj.group(1).decode('utf-8')
3723 video_filename = video_url.split('/')[-1]
3724 video_id, extension = video_filename.split('.')
3726 self._downloader.increment_downloads()
3731 'upload_date': None,
3732 'title': video_title,
3733 'stitle': _simplify_title(video_title),
3735 'format': extension, # Extension is always(?) mp4, but seems to be flv
3737 'description': video_description,
3742 self._downloader.process_info(info)
3743 except UnavailableVideoError, err:
3744 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3746 class MixcloudIE(InfoExtractor):
3747 """Information extractor for www.mixcloud.com"""
3748 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3749 IE_NAME = u'mixcloud'
3751 def __init__(self, downloader=None):
3752 InfoExtractor.__init__(self, downloader)
3754 def report_download_json(self, file_id):
3755 """Report JSON download."""
3756 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3758 def report_extraction(self, file_id):
3759 """Report information extraction."""
3760 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3762 def get_urls(self, jsonData, fmt, bitrate='best'):
3763 """Get urls from 'audio_formats' section in json"""
3766 bitrate_list = jsonData[fmt]
3767 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3768 bitrate = max(bitrate_list) # select highest
3770 url_list = jsonData[fmt][bitrate]
3771 except TypeError: # we have no bitrate info.
3772 url_list = jsonData[fmt]
3776 def check_urls(self, url_list):
3777 """Returns 1st active url from list"""
3778 for url in url_list:
3780 urllib2.urlopen(url)
3782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3787 def _print_formats(self, formats):
3788 print 'Available formats:'
3789 for fmt in formats.keys():
3790 for b in formats[fmt]:
3792 ext = formats[fmt][b][0]
3793 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3794 except TypeError: # we have no bitrate info
3795 ext = formats[fmt][0]
3796 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3799 def _real_extract(self, url):
3800 mobj = re.match(self._VALID_URL, url)
3802 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3804 # extract uploader & filename from url
3805 uploader = mobj.group(1).decode('utf-8')
3806 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3808 # construct API request
3809 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3810 # retrieve .json file with links to files
3811 request = urllib2.Request(file_url)
3813 self.report_download_json(file_url)
3814 jsonData = urllib2.urlopen(request).read()
3815 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3816 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3820 json_data = json.loads(jsonData)
3821 player_url = json_data['player_swf_url']
3822 formats = dict(json_data['audio_formats'])
3824 req_format = self._downloader.params.get('format', None)
3827 if self._downloader.params.get('listformats', None):
3828 self._print_formats(formats)
3831 if req_format is None or req_format == 'best':
3832 for format_param in formats.keys():
3833 url_list = self.get_urls(formats, format_param)
3835 file_url = self.check_urls(url_list)
3836 if file_url is not None:
3839 if req_format not in formats.keys():
3840 self._downloader.trouble(u'ERROR: format is not available')
3843 url_list = self.get_urls(formats, req_format)
3844 file_url = self.check_urls(url_list)
3845 format_param = req_format
3848 self._downloader.increment_downloads()
3850 # Process file information
3851 self._downloader.process_info({
3852 'id': file_id.decode('utf-8'),
3853 'url': file_url.decode('utf-8'),
3854 'uploader': uploader.decode('utf-8'),
3855 'upload_date': u'NA',
3856 'title': json_data['name'],
3857 'stitle': _simplify_title(json_data['name']),
3858 'ext': file_url.split('.')[-1].decode('utf-8'),
3859 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3860 'thumbnail': json_data['thumbnail_url'],
3861 'description': json_data['description'],
3862 'player_url': player_url.decode('utf-8'),
3864 except UnavailableVideoError, err:
3865 self._downloader.trouble(u'ERROR: unable to download file')
3867 class StanfordOpenClassroomIE(InfoExtractor):
3868 """Information extractor for Stanford's Open ClassRoom"""
3870 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3871 IE_NAME = u'stanfordoc'
3873 def report_download_webpage(self, objid):
3874 """Report information extraction."""
3875 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3877 def report_extraction(self, video_id):
3878 """Report information extraction."""
3879 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3881 def _real_extract(self, url):
3882 mobj = re.match(self._VALID_URL, url)
3884 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3887 if mobj.group('course') and mobj.group('video'): # A specific video
3888 course = mobj.group('course')
3889 video = mobj.group('video')
3891 'id': _simplify_title(course + '_' + video),
3894 self.report_extraction(info['id'])
3895 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3896 xmlUrl = baseUrl + video + '.xml'
3898 metaXml = urllib2.urlopen(xmlUrl).read()
3899 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3900 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3902 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3904 info['title'] = mdoc.findall('./title')[0].text
3905 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3907 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3909 info['stitle'] = _simplify_title(info['title'])
3910 info['ext'] = info['url'].rpartition('.')[2]
3911 info['format'] = info['ext']
3912 self._downloader.increment_downloads()
3914 self._downloader.process_info(info)
3915 except UnavailableVideoError, err:
3916 self._downloader.trouble(u'\nERROR: unable to download video')
3917 elif mobj.group('course'): # A course page
3918 unescapeHTML = HTMLParser.HTMLParser().unescape
3920 course = mobj.group('course')
3922 'id': _simplify_title(course),
3926 self.report_download_webpage(info['id'])
3928 coursepage = urllib2.urlopen(url).read()
3929 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3930 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3933 m = re.search('<h1>([^<]+)</h1>', coursepage)
3935 info['title'] = unescapeHTML(m.group(1))
3937 info['title'] = info['id']
3938 info['stitle'] = _simplify_title(info['title'])
3940 m = re.search('<description>([^<]+)</description>', coursepage)
3942 info['description'] = unescapeHTML(m.group(1))
3944 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3947 'type': 'reference',
3948 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3952 for entry in info['list']:
3953 assert entry['type'] == 'reference'
3954 self.extract(entry['url'])
3956 unescapeHTML = HTMLParser.HTMLParser().unescape
3959 'id': 'Stanford OpenClassroom',
3963 self.report_download_webpage(info['id'])
3964 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3966 rootpage = urllib2.urlopen(rootURL).read()
3967 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3968 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3971 info['title'] = info['id']
3972 info['stitle'] = _simplify_title(info['title'])
3974 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3977 'type': 'reference',
3978 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3982 for entry in info['list']:
3983 assert entry['type'] == 'reference'
3984 self.extract(entry['url'])
3986 class MTVIE(InfoExtractor):
3987 """Information extractor for MTV.com"""
3989 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3992 def report_webpage(self, video_id):
3993 """Report information extraction."""
3994 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3996 def report_extraction(self, video_id):
3997 """Report information extraction."""
3998 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
4000 def _real_extract(self, url):
4001 mobj = re.match(self._VALID_URL, url)
4003 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4005 if not mobj.group('proto'):
4006 url = 'http://' + url
4007 video_id = mobj.group('videoid')
4008 self.report_webpage(video_id)
4010 request = urllib2.Request(url)
4012 webpage = urllib2.urlopen(request).read()
4013 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4014 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4017 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4019 self._downloader.trouble(u'ERROR: unable to extract song name')
4021 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4022 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4024 self._downloader.trouble(u'ERROR: unable to extract performer')
4026 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4027 video_title = performer + ' - ' + song_name
4029 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4031 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4033 mtvn_uri = mobj.group(1)
4035 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4037 self._downloader.trouble(u'ERROR: unable to extract content id')
4039 content_id = mobj.group(1)
4041 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4042 self.report_extraction(video_id)
4043 request = urllib2.Request(videogen_url)
4045 metadataXml = urllib2.urlopen(request).read()
4046 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4047 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4050 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4051 renditions = mdoc.findall('.//rendition')
4053 # For now, always pick the highest quality.
4054 rendition = renditions[-1]
4057 _,_,ext = rendition.attrib['type'].partition('/')
4058 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4059 video_url = rendition.find('./src').text
4061 self._downloader.trouble('Invalid rendition field.')
4064 self._downloader.increment_downloads()
4068 'uploader': performer,
4069 'title': video_title,
4070 'stitle': _simplify_title(video_title),
4076 self._downloader.process_info(info)
4077 except UnavailableVideoError, err:
4078 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Base class for post-processing steps.

    A PostProcessor is attached to a downloader via the downloader's
    add_post_processor() method.  After each successful download the
    downloader walks its internal chain of PostProcessors, calling run()
    on every one — first with the freshly produced info dictionary, then
    with whatever the previous processor returned.

    The chain stops as soon as one processor returns None, or when the
    end of the chain is reached.

    PostProcessor objects follow the same "mutual registration"
    convention as InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the downloader this post-processor reports to."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        The "information" argument is an InfoExtractor-style dictionary
        extended with a "filepath" key that points at the downloaded
        file.  Returning None stops the post-processing chain; returning
        an information dictionary (possibly the received one with some
        fields changed) passes it to the next processor in the chain.
        Implementations may also raise a PostProcessingError, which the
        downloader takes into account.
        """
        # The base class is a no-op: pass the information through unchanged.
        return information
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe fails while extracting or converting audio.

    Derives from Exception rather than BaseException: per PEP 352 only
    process-control exceptions (KeyboardInterrupt, SystemExit) should
    subclass BaseException directly, and subclassing it here made this
    error silently escape generic `except Exception` handlers.
    """
    def __init__(self, message):
        # Initialize the base class too, so str(e) and e.args carry the text.
        Exception.__init__(self, message)
        # Callers read .message directly (e.g. the FFmpegExtractAudioPP
        # error reporting), so keep the attribute.
        self.message = message
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that converts a downloaded video file into an
    audio-only file, probing the source codec with ffprobe and converting
    with ffmpeg.

    NOTE(review): several structural lines (`try:`, `else:`, `return`
    statements and the @staticmethod decorators) appear to be elided from
    this copy of the file; restore them from upstream before running.
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            # 'best' means: keep the source audio codec whenever possible.
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        # ffmpeg '-ab' bitrate specification (e.g. '128K'); may be None.
        self._preferredquality = preferredquality
        # When true, the original video file is left on disk after conversion.
        self._keepvideo = keepvideo

    # Probe `path` with ffprobe and report the name of its audio codec.
    # Defined without `self` — presumably decorated @staticmethod in the
    # full file; confirm before editing.
    def get_audio_codec(path):
        cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
        # stderr is discarded; only the stream description on stdout matters.
        handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
        output = handle.communicate()[0]
        # NOTE(review): the body of this `if`, and the `try:` matching the
        # `except` below, are elided from this copy.
        if handle.wait() != 0:
        except (IOError, OSError):
        # Scan ffprobe's key=value output: remember the most recent
        # codec_name and report it once a codec_type=audio stream appears.
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:

    # Transcode `path` into `out_path` with the given audio codec and extra
    # ffmpeg options; raises AudioConversionError on failure.
    # Also presumably a @staticmethod (no `self`) — confirm.
    def run_ffmpeg(path, out_path, codec, more_opts):
            acodec_opts = ['-acodec', codec]
        # '-vn' drops the video stream; '--' guards against option-like paths.
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
            # NOTE(review): the `try:` opening this block is elided here.
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout,stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            # errno 2 == ENOENT: the ffmpeg binary itself is missing.
            if isinstance(e, OSError) and e.errno == 2:
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
        if p.returncode != 0:
            # Surface only ffmpeg's final stderr line as the error message.
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        """Convert information['filepath'] to the preferred audio format and
        point 'filepath' at the new file."""
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')

        # Keep the stream as-is when the source already matches the request.
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                extension = filecodec
                if filecodec == 'aac':
                    # ADTS framing so the raw AAC stream is playable on its own.
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                # NOTE(review): the fall-through branch header introducing the
                # MP3 default is elided above this line.
                acodec = 'libmp3lame'
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        # NOTE(review): an `else:` introducing this lossy-conversion branch
        # is elided here.
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
            if self._preferredcodec == 'wav':
                more_opts += ['-f', 'wav']

        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
            # NOTE(review): the `try:` opening this conversion attempt is
            # elided here.
            self.run_ffmpeg(path, new_path, acodec, more_opts)
            etype,e,tb = sys.exc_info()
            if isinstance(e, AudioConversionError):
                self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
                self._downloader.to_stderr(u'ERROR: error running ffmpeg')

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
                # NOTE(review): the `try:` matching the `except` below is
                # elided here.
                os.remove(_encodeFilename(path))
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

        # Point the chain at the converted file for subsequent processors.
        information['filepath'] = new_path
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository '''
    # Note: downloader only used for options
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen(u'Updating to latest version...')

        # NOTE(review): the `try:` lines opening this download block are
        # elided from this copy.
        urlh = urllib.urlopen(UPDATE_URL)
        newcontent = urlh.read()
        # The script embeds its own version string; skip the rewrite when
        # the published copy matches what is already installed.
        vmatch = re.search("__version__ = '([^']+)'", newcontent)
        if vmatch is not None and vmatch.group(1) == __version__:
            downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to download latest version')

        # Overwrite this very script in place with the downloaded copy.
        # NOTE(review): the surrounding `try:` is elided here too.
        outf = open(filename, 'wb')
        outf.write(newcontent)
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
# Read extra command-line arguments from a configuration file, split with
# shell-like quoting rules ('#' starts a comment).  A missing file silently
# yields an empty argument list.
# NOTE(review): the try/except around open(), the per-line loop header, and
# the final `return res` are elided from this copy.
def _readOptions(filename_bytes):
        optionf = open(filename_bytes)
        return [] # silently skip if file is not present
        res += shlex.split(l, comments=True)
4304 def _format_option_string(option):
4305 ''' ('-o', '--option') -> -o, --format METAVAR'''
4309 if option._short_opts: opts.append(option._short_opts[0])
4310 if option._long_opts: opts.append(option._long_opts[0])
4311 if len(opts) > 1: opts.insert(1, ', ')
4313 if option.takes_value(): opts.append(' %s' % option.metavar)
4315 return "".join(opts)
4317 def _find_term_columns():
4318 columns = os.environ.get('COLUMNS', None)
4323 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4324 out,err = sp.communicate()
4325 return int(out.split()[1])
# --- Interior of parseOpts(): builds and returns the optparse command-line
# --- parser.  NOTE(review): the `def parseOpts():` header and the braces of
# --- the `kw = {...}` dict literal below are elided from this copy.
max_help_position = 80

# No need to wrap help messages if we're on a wide console
columns = _find_term_columns()
if columns: max_width = columns

# Custom formatter so option strings render via _format_option_string.
fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
fmt.format_option_strings = _format_option_string

# Keyword arguments for the OptionParser constructor (dict entries; the
# enclosing `kw = {` / `}` lines are elided here).
	'version'   : __version__,
	'usage' : '%prog [options] url [url...]',
	'conflict_handler' : 'resolve',

parser = optparse.OptionParser(**kw)

# Option groups, one per help section.
general        = optparse.OptionGroup(parser, 'General Options')
selection      = optparse.OptionGroup(parser, 'Video Selection')
authentication = optparse.OptionGroup(parser, 'Authentication Options')
video_format   = optparse.OptionGroup(parser, 'Video Format Options')
postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

general.add_option('-h', '--help',
		action='help', help='print this help text and exit')
general.add_option('-v', '--version',
		action='version', help='print program version and exit')
general.add_option('-U', '--update',
		action='store_true', dest='update_self', help='update this program to latest version')
general.add_option('-i', '--ignore-errors',
		action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
general.add_option('-r', '--rate-limit',
		dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
general.add_option('-R', '--retries',
		dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
general.add_option('--dump-user-agent',
		action='store_true', dest='dump_user_agent',
		help='display the current browser identification', default=False)
general.add_option('--list-extractors',
		action='store_true', dest='list_extractors',
		help='List all supported extractors and the URLs they would handle', default=False)

selection.add_option('--playlist-start',
		dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
selection.add_option('--playlist-end',
		dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

authentication.add_option('-u', '--username',
		dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
		dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
		action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

video_format.add_option('-f', '--format',
		action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
		action='store_const', dest='format', help='download all available video formats', const='all')
video_format.add_option('--prefer-free-formats',
		action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
video_format.add_option('--max-quality',
		action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
video_format.add_option('-F', '--list-formats',
		action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
video_format.add_option('--write-srt',
		action='store_true', dest='writesubtitles',
		help='write video closed captions to a .srt file (currently youtube only)', default=False)
video_format.add_option('--srt-lang',
		action='store', dest='subtitleslang', metavar='LANG',
		help='language of the closed captions to download (optional) use IETF language tags like \'en\'')

verbosity.add_option('-q', '--quiet',
		action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
		action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
verbosity.add_option('--skip-download',
		action='store_true', dest='skip_download', help='do not download the video', default=False)
verbosity.add_option('-g', '--get-url',
		action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
		action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
		action='store_true', dest='getthumbnail',
		help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
		action='store_true', dest='getdescription',
		help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
		action='store_true', dest='getfilename',
		help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--get-format',
		action='store_true', dest='getformat',
		help='simulate, quiet but print output format', default=False)
verbosity.add_option('--no-progress',
		action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
		action='store_true', dest='consoletitle',
		help='display progress in console titlebar', default=False)
# NOTE(review): '-v' is registered twice (also as --version above); the
# 'resolve' conflict_handler makes the later definition win for -v.
verbosity.add_option('-v', '--verbose',
		action='store_true', dest='verbose', help='print various debugging information', default=False)

filesystem.add_option('-t', '--title',
		action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
		action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
		action='store_true', dest='autonumber',
		help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
		dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
filesystem.add_option('-a', '--batch-file',
		dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
		action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
		action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
filesystem.add_option('--no-continue',
		action='store_false', dest='continue_dl',
		help='do not resume partially downloaded files (restart from beginning)')
filesystem.add_option('--cookies',
		dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
filesystem.add_option('--no-part',
		action='store_true', dest='nopart', help='do not use .part files', default=False)
filesystem.add_option('--no-mtime',
		action='store_false', dest='updatetime',
		help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
		action='store_true', dest='writedescription',
		help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
		action='store_true', dest='writeinfojson',
		help='write video metadata to a .info.json file', default=False)

postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
		help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
		help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
		help='ffmpeg audio bitrate specification, 128k by default')
postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
		help='keeps the video file on disk after the post-processing; the video is erased by default')

parser.add_option_group(general)
parser.add_option_group(selection)
parser.add_option_group(filesystem)
parser.add_option_group(verbosity)
parser.add_option_group(video_format)
parser.add_option_group(authentication)
parser.add_option_group(postproc)

# Options are read from /etc, then the per-user config (XDG if set), then
# the actual command line — later sources override earlier ones.
xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
opts, args = parser.parse_args(argv)

return parser, opts, args
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Shared IE instances reused by the derived playlist/user/search
    # extractors below.
    youtube_ie = YoutubeIE()
    google_ie = GoogleIE()
    yahoo_ie = YahooIE()
    # NOTE(review): the `return [` opening this extractor list, many of its
    # entries, and the closing `]` are elided from this copy.
    YoutubePlaylistIE(youtube_ie),
    YoutubeUserIE(youtube_ie),
    YoutubeSearchIE(youtube_ie),
    MetacafeIE(youtube_ie),
    GoogleSearchIE(google_ie),
    YahooSearchIE(yahoo_ie),
    StanfordOpenClassroomIE(),
# --- Interior of the program's main routine (its `def` line is elided from
# --- this copy).  Parses options, wires up the FileDownloader, and runs the
# --- downloads.
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
	jar = cookielib.CookieJar()
	# NOTE(review): the `else:`/`try:` introducing the file-backed jar, and
	# the jar.load() call under the readability check, are elided here.
	jar = cookielib.MozillaCookieJar(opts.cookiefile)
	if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
	except (IOError, OSError), err:
		sys.exit(u'ERROR: unable to open cookie file')

# Dump user agent
if opts.dump_user_agent:
	print std_headers['User-Agent']

# Batch file verification
if opts.batchfile is not None:
	if opts.batchfile == '-':
		# NOTE(review): the stdin branch body and the surrounding try are
		# elided here.
		batchfd = open(opts.batchfile, 'r')
		batchurls = batchfd.readlines()
		batchurls = [x.strip() for x in batchurls]
		# Skip blank lines and lines starting with a comment marker.
		batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
all_urls = map(lambda url: url.strip(), all_urls)

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
proxy_handler = urllib2.ProxyHandler()
opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
urllib2.install_opener(opener)
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

# NOTE(review): presumably guarded by `if opts.verbose:` (elided).
print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

extractors = gen_extractors()

# --list-extractors: print each extractor and the given URLs it would claim.
if opts.list_extractors:
	for ie in extractors:
		matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
		all_urls = filter(lambda url: url not in matchedUrls, all_urls)
		for mu in matchedUrls:

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
	parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
	parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
	parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
	opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
	numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
	if numeric_limit is None:
		parser.error(u'invalid rate limit specified')
	opts.ratelimit = numeric_limit
if opts.retries is not None:
	# NOTE(review): the `try:` lines for these three conversions are elided.
	opts.retries = long(opts.retries)
	except (TypeError, ValueError), err:
		parser.error(u'invalid retry count specified')
	opts.playliststart = int(opts.playliststart)
	if opts.playliststart <= 0:
		raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	opts.playlistend = int(opts.playlistend)
	if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
		raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
	if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
		parser.error(u'invalid audio format specified')

# File downloader — options dictionary built from the parsed flags.
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	# Any of the "print X and stop" flags implies quiet mode.
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	'forcefilename': opts.getfilename,
	'forceformat': opts.getformat,
	'simulate': opts.simulate,
	'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
	'format': opts.format,
	'format_limit': opts.format_limit,
	'listformats': opts.listformats,
	# Output template: explicit -o wins, otherwise pick a default shaped by
	# the title/literal/autonumber/format flags.
	'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
		or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
		or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
		or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
		or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
		or u'%(id)s.%(ext)s'),
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'retries': opts.retries,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
	'playliststart': opts.playliststart,
	'playlistend': opts.playlistend,
	'logtostderr': opts.outtmpl == '-',
	'consoletitle': opts.consoletitle,
	'nopart': opts.nopart,
	'updatetime': opts.updatetime,
	'writedescription': opts.writedescription,
	'writeinfojson': opts.writeinfojson,
	'writesubtitles': opts.writesubtitles,
	'subtitleslang': opts.subtitleslang,
	'matchtitle': opts.matchtitle,
	'rejecttitle': opts.rejecttitle,
	'max_downloads': opts.max_downloads,
	'prefer_free_formats': opts.prefer_free_formats,
	'verbose': opts.verbose,
# NOTE(review): the `})` closing the FileDownloader options is elided here.
for extractor in extractors:
	fd.add_info_extractor(extractor)

# PostProcessors
if opts.extractaudio:
	fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

# Update version
if opts.update_self:
	updateSelf(fd, sys.argv[0])

# Maybe do nothing: --update alone is a valid invocation with no URLs.
if len(all_urls) < 1:
	if not opts.update_self:
		parser.error(u'you must provide at least one URL')

	# NOTE(review): the `try:` around the download call is elided here.
	retcode = fd.download(all_urls)
except MaxDownloadsReached:
	fd.to_screen(u'--max-download limit reached, aborting.')

# Dump cookie jar if requested
if opts.cookiefile is not None:
	except (IOError, OSError), err:
		sys.exit(u'ERROR: unable to save cookie jar')

# NOTE(review): the following except clauses belong to a thin wrapper
# function (presumably `def main():` calling the routine above) whose
# header and try: are elided from this copy.
except DownloadError:
except SameFileError:
	sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
	sys.exit(u'\nERROR: Interrupted by user')
# Script entry point.  NOTE(review): the guarded body (the call into the
# main routine) is elided from this copy.
if __name__ == '__main__':

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: