2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
57 except ImportError: # Python 2.4
60 import cStringIO as StringIO
64 # parse_qs was moved from the cgi module to the urlparse module recently.
66 from urlparse import parse_qs
68 from cgi import parse_qs
76 import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
90 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
96 def raiseError(msg, i):
97 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
98 def skipSpace(i, expectMore=True):
99 while i < len(s) and s[i] in ' \t\r\n':
103 raiseError('Premature end', i)
105 def decodeEscape(match):
121 return unichr(int(esc[1:5], 16))
122 if len(esc) == 5+6 and esc[5:7] == '\\u':
123 hi = int(esc[1:5], 16)
124 low = int(esc[7:11], 16)
125 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
126 raise ValueError('Unknown escape ' + str(esc))
133 while s[e-bslashes-1] == '\\':
135 if bslashes % 2 == 1:
139 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
140 stri = rexp.sub(decodeEscape, s[i:e])
146 if s[i] == '}': # Empty dictionary
150 raiseError('Expected a string object key', i)
151 i,key = parseString(i)
153 if i >= len(s) or s[i] != ':':
154 raiseError('Expected a colon', i)
161 raiseError('Expected comma or closing curly brace', i)
166 if s[i] == ']': # Empty array
171 i = skipSpace(i) # Raise exception if premature end
175 raiseError('Expected a comma or closing bracket', i)
177 def parseDiscrete(i):
178 for k,v in {'true': True, 'false': False, 'null': None}.items():
179 if s.startswith(k, i):
181 raiseError('Not a boolean (or null)', i)
183 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
185 raiseError('Not a number', i)
187 if '.' in nums or 'e' in nums or 'E' in nums:
188 return (i+len(nums), float(nums))
189 return (i+len(nums), int(nums))
190 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
193 i,res = CHARMAP.get(s[i], parseNumber)(i)
194 i = skipSpace(i, False)
198 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
201 def preferredencoding():
202 """Get preferred encoding.
204 Returns the best encoding scheme for the system, based on
205 locale.getpreferredencoding() and some further tweaks.
207 def yield_preferredencoding():
209 pref = locale.getpreferredencoding()
215 return yield_preferredencoding().next()
218 def htmlentity_transform(matchobj):
219 """Transforms an HTML entity to a Unicode character.
221 This function receives a match object and is intended to be used with
222 the re.sub() function.
224 entity = matchobj.group(1)
226 # Known non-numeric HTML entity
227 if entity in htmlentitydefs.name2codepoint:
228 return unichr(htmlentitydefs.name2codepoint[entity])
231 mobj = re.match(ur'(?u)#(x?\d+)', entity)
233 numstr = mobj.group(1)
234 if numstr.startswith(u'x'):
236 numstr = u'0%s' % numstr
239 return unichr(long(numstr, base))
241 # Unknown entity in name, return its literal representation
242 return (u'&%s;' % entity)
245 def sanitize_title(utitle):
246 """Sanitizes a video title so it could be used as part of a filename."""
247 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
248 return utitle.replace(unicode(os.sep), u'%')
251 def sanitize_open(filename, open_mode):
252 """Try to open the given filename, and slightly tweak it if this fails.
254 Attempts to open the given filename. If this fails, it tries to change
255 the filename slightly, step by step, until it's either able to open it
256 or it fails and raises a final exception, like the standard open()
259 It returns the tuple (stream, definitive_file_name).
263 if sys.platform == 'win32':
265 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
266 return (sys.stdout, filename)
267 stream = open(_encodeFilename(filename), open_mode)
268 return (stream, filename)
269 except (IOError, OSError), err:
270 # In case of error, try to remove win32 forbidden chars
271 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
273 # An exception here should be caught in the caller
274 stream = open(_encodeFilename(filename), open_mode)
275 return (stream, filename)
278 def timeconvert(timestr):
279 """Convert RFC 2822 defined time string into system timestamp"""
281 timetuple = email.utils.parsedate_tz(timestr)
282 if timetuple is not None:
283 timestamp = email.utils.mktime_tz(timetuple)
286 def _simplify_title(title):
287 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
288 return expr.sub(u'_', title).strip(u'_')
290 def _orderedSet(iterable):
291 """ Remove all duplicates from the input iterable """
298 def _unescapeHTML(s):
300 @param s a string (of type unicode)
302 assert type(s) == type(u'')
304 htmlParser = HTMLParser.HTMLParser()
305 return htmlParser.unescape(s)
307 def _encodeFilename(s):
309 @param s The name of the file (of type unicode)
312 assert type(s) == type(u'')
314 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
315 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
316 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
317 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
320 return s.encode(sys.getfilesystemencoding(), 'ignore')
322 class DownloadError(Exception):
323 """Download Error exception.
325 This exception may be thrown by FileDownloader objects if they are not
326 configured to continue on errors. They will contain the appropriate
332 class SameFileError(Exception):
333 """Same File exception.
335 This exception will be thrown by FileDownloader objects if they detect
336 multiple files would have to be downloaded to the same file on disk.
341 class PostProcessingError(Exception):
342 """Post Processing exception.
344 This exception may be raised by PostProcessor's .run() method to
345 indicate an error in the postprocessing task.
349 class MaxDownloadsReached(Exception):
350 """ --max-downloads limit has been reached. """
354 class UnavailableVideoError(Exception):
355 """Unavailable Format exception.
357 This exception will be thrown when a video is requested
358 in a format that is not available for that video.
363 class ContentTooShortError(Exception):
364 """Content Too Short exception.
366 This exception may be raised by FileDownloader objects when a file they
367 download is too small for what the server announced first, indicating
368 the connection was probably interrupted.
374 def __init__(self, downloaded, expected):
375 self.downloaded = downloaded
376 self.expected = expected
379 class YoutubeDLHandler(urllib2.HTTPHandler):
380 """Handler for HTTP requests and responses.
382 This class, when installed with an OpenerDirector, automatically adds
383 the standard headers to every HTTP request and handles gzipped and
384 deflated responses from web servers. If compression is to be avoided in
385 a particular request, the original request in the program code only has
386 to include the HTTP header "Youtubedl-No-Compression", which will be
387 removed before making the real request.
389 Part of this code was copied from:
391 http://techknack.net/python-urllib2-handlers/
393 Andrew Rowls, the author of that code, agreed to release it to the
400 return zlib.decompress(data, -zlib.MAX_WBITS)
402 return zlib.decompress(data)
405 def addinfourl_wrapper(stream, headers, url, code):
406 if hasattr(urllib2.addinfourl, 'getcode'):
407 return urllib2.addinfourl(stream, headers, url, code)
408 ret = urllib2.addinfourl(stream, headers, url)
412 def http_request(self, req):
413 for h in std_headers:
416 req.add_header(h, std_headers[h])
417 if 'Youtubedl-no-compression' in req.headers:
418 if 'Accept-encoding' in req.headers:
419 del req.headers['Accept-encoding']
420 del req.headers['Youtubedl-no-compression']
423 def http_response(self, req, resp):
426 if resp.headers.get('Content-encoding', '') == 'gzip':
427 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
428 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
429 resp.msg = old_resp.msg
431 if resp.headers.get('Content-encoding', '') == 'deflate':
432 gz = StringIO.StringIO(self.deflate(resp.read()))
433 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
434 resp.msg = old_resp.msg
438 class FileDownloader(object):
439 """File Downloader class.
441 File downloader objects are the ones responsible of downloading the
442 actual video file and writing it to disk if the user has requested
443 it, among some other tasks. In most cases there should be one per
444 program. As, given a video URL, the downloader doesn't know how to
445 extract all the needed information, task that InfoExtractors do, it
446 has to pass the URL to one of them.
448 For this, file downloader objects have a method that allows
449 InfoExtractors to be registered in a given order. When it is passed
450 a URL, the file downloader handles it to the first InfoExtractor it
451 finds that reports being able to handle it. The InfoExtractor extracts
452 all the information about the video or videos the URL refers to, and
453 asks the FileDownloader to process the video information, possibly
454 downloading the video.
456 File downloaders accept a lot of parameters. In order not to saturate
457 the object constructor with arguments, it receives a dictionary of
458 options instead. These options are available through the params
459 attribute for the InfoExtractors to use. The FileDownloader also
460 registers itself as the downloader in charge for the InfoExtractors
461 that are added to it, so this is a "mutual registration".
465 username: Username for authentication purposes.
466 password: Password for authentication purposes.
467 usenetrc: Use netrc for authentication instead.
468 quiet: Do not print messages to stdout.
469 forceurl: Force printing final URL.
470 forcetitle: Force printing title.
471 forcethumbnail: Force printing thumbnail URL.
472 forcedescription: Force printing description.
473 forcefilename: Force printing final filename.
474 simulate: Do not download the video files.
475 format: Video format code.
476 format_limit: Highest quality format to try.
477 outtmpl: Template for output names.
478 ignoreerrors: Do not stop on download errors.
479 ratelimit: Download speed limit, in bytes/sec.
480 nooverwrites: Prevent overwriting files.
481 retries: Number of times to retry for HTTP error 5xx
482 continuedl: Try to continue downloads if possible.
483 noprogress: Do not print the progress bar.
484 playliststart: Playlist item to start at.
485 playlistend: Playlist item to end at.
486 matchtitle: Download only matching titles.
487 rejecttitle: Reject downloads for matching titles.
488 logtostderr: Log messages to stderr instead of stdout.
489 consoletitle: Display progress in console window's titlebar.
490 nopart: Do not use temporary .part files.
491 updatetime: Use the Last-modified header to set output file timestamps.
492 writedescription: Write the video description to a .description file
493 writeinfojson: Write the video description to a .info.json file
494 writesubtitles: Write the video subtitles to a .srt file
495 subtitleslang: Language of the subtitles to download
501 _download_retcode = None
502 _num_downloads = None
505 def __init__(self, params):
506 """Create a FileDownloader object with the given options."""
509 self._download_retcode = 0
510 self._num_downloads = 0
511 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
515 def format_bytes(bytes):
518 if type(bytes) is str:
523 exponent = long(math.log(bytes, 1024.0))
524 suffix = 'bkMGTPEZY'[exponent]
525 converted = float(bytes) / float(1024 ** exponent)
526 return '%.2f%s' % (converted, suffix)
529 def calc_percent(byte_counter, data_len):
532 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
535 def calc_eta(start, now, total, current):
539 if current == 0 or dif < 0.001: # One millisecond
541 rate = float(current) / dif
542 eta = long((float(total) - float(current)) / rate)
543 (eta_mins, eta_secs) = divmod(eta, 60)
546 return '%02d:%02d' % (eta_mins, eta_secs)
549 def calc_speed(start, now, bytes):
551 if bytes == 0 or dif < 0.001: # One millisecond
552 return '%10s' % '---b/s'
553 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
556 def best_block_size(elapsed_time, bytes):
557 new_min = max(bytes / 2.0, 1.0)
558 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
559 if elapsed_time < 0.001:
561 rate = bytes / elapsed_time
569 def parse_bytes(bytestr):
570 """Parse a string indicating a byte quantity into a long integer."""
571 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
574 number = float(matchobj.group(1))
575 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
576 return long(round(number * multiplier))
578 def add_info_extractor(self, ie):
579 """Add an InfoExtractor object to the end of the list."""
581 ie.set_downloader(self)
583 def add_post_processor(self, pp):
584 """Add a PostProcessor object to the end of the chain."""
586 pp.set_downloader(self)
588 def to_screen(self, message, skip_eol=False):
589 """Print message to stdout if not in quiet mode."""
590 assert type(message) == type(u'')
591 if not self.params.get('quiet', False):
592 terminator = [u'\n', u''][skip_eol]
593 output = message + terminator
595 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
596 output = output.encode(preferredencoding(), 'ignore')
597 self._screen_file.write(output)
598 self._screen_file.flush()
def to_stderr(self, message):
    """Write message, encoded for the current locale and followed by a
    newline, to standard error."""
    sys.stderr.write(message.encode(preferredencoding()) + '\n')
604 def to_cons_title(self, message):
605 """Set console/terminal window title to message."""
606 if not self.params.get('consoletitle', False):
608 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
609 # c_wchar_p() might not be necessary if `message` is
610 # already of type unicode()
611 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
612 elif 'TERM' in os.environ:
613 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
615 def fixed_template(self):
616 """Checks if the output template is fixed."""
617 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
619 def trouble(self, message=None):
620 """Determine action to take when a download problem appears.
622 Depending on if the downloader has been configured to ignore
623 download errors or not, this method may throw an exception or
624 not when errors are found, after printing the message.
626 if message is not None:
627 self.to_stderr(message)
628 if not self.params.get('ignoreerrors', False):
629 raise DownloadError(message)
630 self._download_retcode = 1
632 def slow_down(self, start_time, byte_counter):
633 """Sleep if the download speed is over the rate limit."""
634 rate_limit = self.params.get('ratelimit', None)
635 if rate_limit is None or byte_counter == 0:
638 elapsed = now - start_time
641 speed = float(byte_counter) / elapsed
642 if speed > rate_limit:
643 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
645 def temp_name(self, filename):
646 """Returns a temporary filename for the given filename."""
647 if self.params.get('nopart', False) or filename == u'-' or \
648 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
650 return filename + u'.part'
652 def undo_temp_name(self, filename):
653 if filename.endswith(u'.part'):
654 return filename[:-len(u'.part')]
657 def try_rename(self, old_filename, new_filename):
659 if old_filename == new_filename:
661 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
662 except (IOError, OSError), err:
663 self.trouble(u'ERROR: unable to rename file')
665 def try_utime(self, filename, last_modified_hdr):
666 """Try to set the last-modified time of the given file."""
667 if last_modified_hdr is None:
669 if not os.path.isfile(_encodeFilename(filename)):
671 timestr = last_modified_hdr
674 filetime = timeconvert(timestr)
678 os.utime(filename, (time.time(), filetime))
683 def report_writedescription(self, descfn):
684 """ Report that the description file is being written """
685 self.to_screen(u'[info] Writing video description to: ' + descfn)
687 def report_writesubtitles(self, srtfn):
688 """ Report that the subtitles file is being written """
689 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
691 def report_writeinfojson(self, infofn):
692 """ Report that the metadata file has been written """
693 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
695 def report_destination(self, filename):
696 """Report destination filename."""
697 self.to_screen(u'[download] Destination: ' + filename)
699 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
700 """Report download progress."""
701 if self.params.get('noprogress', False):
703 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
704 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
705 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
706 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
708 def report_resuming_byte(self, resume_len):
709 """Report attempt to resume at given byte."""
710 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
712 def report_retry(self, count, retries):
713 """Report retry in case of HTTP error 5xx"""
714 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
716 def report_file_already_downloaded(self, file_name):
717 """Report file has already been fully downloaded."""
719 self.to_screen(u'[download] %s has already been downloaded' % file_name)
720 except (UnicodeEncodeError), err:
721 self.to_screen(u'[download] The file has already been downloaded')
723 def report_unable_to_resume(self):
724 """Report it was impossible to resume download."""
725 self.to_screen(u'[download] Unable to resume')
727 def report_finish(self):
728 """Report download finished."""
729 if self.params.get('noprogress', False):
730 self.to_screen(u'[download] Download completed')
734 def increment_downloads(self):
735 """Increment the ordinal that assigns a number to each file."""
736 self._num_downloads += 1
738 def prepare_filename(self, info_dict):
739 """Generate the output filename."""
741 template_dict = dict(info_dict)
742 template_dict['epoch'] = unicode(long(time.time()))
743 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
744 filename = self.params['outtmpl'] % template_dict
746 except (ValueError, KeyError), err:
747 self.trouble(u'ERROR: invalid system charset or erroneous output template')
750 def _match_entry(self, info_dict):
751 """ Returns None iff the file should be downloaded """
753 title = info_dict['title']
754 matchtitle = self.params.get('matchtitle', False)
755 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
756 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
757 rejecttitle = self.params.get('rejecttitle', False)
758 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
759 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
762 def process_info(self, info_dict):
763 """Process a single dictionary returned by an InfoExtractor."""
765 reason = self._match_entry(info_dict)
766 if reason is not None:
767 self.to_screen(u'[download] ' + reason)
770 max_downloads = self.params.get('max_downloads')
771 if max_downloads is not None:
772 if self._num_downloads > int(max_downloads):
773 raise MaxDownloadsReached()
775 filename = self.prepare_filename(info_dict)
778 if self.params.get('forcetitle', False):
779 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
780 if self.params.get('forceurl', False):
781 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
782 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
783 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
784 if self.params.get('forcedescription', False) and 'description' in info_dict:
785 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
786 if self.params.get('forcefilename', False) and filename is not None:
787 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
788 if self.params.get('forceformat', False):
789 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
791 # Do nothing else if in simulate mode
792 if self.params.get('simulate', False):
799 dn = os.path.dirname(_encodeFilename(filename))
800 if dn != '' and not os.path.exists(dn): # dn is already encoded
802 except (OSError, IOError), err:
803 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
806 if self.params.get('writedescription', False):
808 descfn = filename + u'.description'
809 self.report_writedescription(descfn)
810 descfile = open(_encodeFilename(descfn), 'wb')
812 descfile.write(info_dict['description'].encode('utf-8'))
815 except (OSError, IOError):
816 self.trouble(u'ERROR: Cannot write description file ' + descfn)
819 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
820 # subtitles download errors are already managed as troubles in relevant IE
821 # that way it will silently go on when used with unsupporting IE
823 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
824 self.report_writesubtitles(srtfn)
825 srtfile = open(_encodeFilename(srtfn), 'wb')
827 srtfile.write(info_dict['subtitles'].encode('utf-8'))
830 except (OSError, IOError):
831 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
834 if self.params.get('writeinfojson', False):
835 infofn = filename + u'.info.json'
836 self.report_writeinfojson(infofn)
839 except (NameError,AttributeError):
840 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
843 infof = open(_encodeFilename(infofn), 'wb')
845 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
846 json.dump(json_info_dict, infof)
849 except (OSError, IOError):
850 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
853 if not self.params.get('skip_download', False):
854 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
858 success = self._do_download(filename, info_dict)
859 except (OSError, IOError), err:
860 raise UnavailableVideoError
861 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
862 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
864 except (ContentTooShortError, ), err:
865 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
870 self.post_process(filename, info_dict)
871 except (PostProcessingError), err:
872 self.trouble(u'ERROR: postprocessing: %s' % str(err))
875 def download(self, url_list):
876 """Download a given list of URLs."""
877 if len(url_list) > 1 and self.fixed_template():
878 raise SameFileError(self.params['outtmpl'])
881 suitable_found = False
883 # Go to next InfoExtractor if not suitable
884 if not ie.suitable(url):
887 # Suitable InfoExtractor found
888 suitable_found = True
890 # Extract information from URL and process it
893 # Suitable InfoExtractor had been found; go to next URL
896 if not suitable_found:
897 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
899 return self._download_retcode
901 def post_process(self, filename, ie_info):
902 """Run the postprocessing chain on the given file."""
904 info['filepath'] = filename
910 def _download_with_rtmpdump(self, filename, url, player_url):
911 self.report_destination(filename)
912 tmpfilename = self.temp_name(filename)
914 # Check for rtmpdump first
916 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
917 except (OSError, IOError):
918 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
921 # Download using rtmpdump. rtmpdump returns exit code 2 when
922 # the connection was interrumpted and resuming appears to be
923 # possible. This is part of rtmpdump's normal usage, AFAIK.
924 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
925 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
926 if self.params.get('verbose', False):
929 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
932 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
933 retval = subprocess.call(args)
934 while retval == 2 or retval == 1:
935 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
936 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
937 time.sleep(5.0) # This seems to be needed
938 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
939 cursize = os.path.getsize(_encodeFilename(tmpfilename))
940 if prevsize == cursize and retval == 1:
942 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
943 if prevsize == cursize and retval == 2 and cursize > 1024:
944 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
948 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
949 self.try_rename(tmpfilename, filename)
952 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
955 def _do_download(self, filename, info_dict):
956 url = info_dict['url']
957 player_url = info_dict.get('player_url', None)
959 # Check file already present
960 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
961 self.report_file_already_downloaded(filename)
964 # Attempt to download using rtmpdump
965 if url.startswith('rtmp'):
966 return self._download_with_rtmpdump(filename, url, player_url)
968 tmpfilename = self.temp_name(filename)
971 # Do not include the Accept-Encoding header
972 headers = {'Youtubedl-no-compression': 'True'}
973 basic_request = urllib2.Request(url, None, headers)
974 request = urllib2.Request(url, None, headers)
976 # Establish possible resume length
977 if os.path.isfile(_encodeFilename(tmpfilename)):
978 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
984 if self.params.get('continuedl', False):
985 self.report_resuming_byte(resume_len)
986 request.add_header('Range','bytes=%d-' % resume_len)
992 retries = self.params.get('retries', 0)
993 while count <= retries:
994 # Establish connection
996 if count == 0 and 'urlhandle' in info_dict:
997 data = info_dict['urlhandle']
998 data = urllib2.urlopen(request)
1000 except (urllib2.HTTPError, ), err:
1001 if (err.code < 500 or err.code >= 600) and err.code != 416:
1002 # Unexpected HTTP error
1004 elif err.code == 416:
1005 # Unable to resume (requested range not satisfiable)
1007 # Open the connection again without the range header
1008 data = urllib2.urlopen(basic_request)
1009 content_length = data.info()['Content-Length']
1010 except (urllib2.HTTPError, ), err:
1011 if err.code < 500 or err.code >= 600:
1014 # Examine the reported length
1015 if (content_length is not None and
1016 (resume_len - 100 < long(content_length) < resume_len + 100)):
1017 # The file had already been fully downloaded.
1018 # Explanation to the above condition: in issue #175 it was revealed that
1019 # YouTube sometimes adds or removes a few bytes from the end of the file,
1020 # changing the file size slightly and causing problems for some users. So
1021 # I decided to implement a suggested change and consider the file
1022 # completely downloaded if the file size differs less than 100 bytes from
1023 # the one in the hard drive.
1024 self.report_file_already_downloaded(filename)
1025 self.try_rename(tmpfilename, filename)
1028 # The length does not match, we start the download over
1029 self.report_unable_to_resume()
1034 if count <= retries:
1035 self.report_retry(count, retries)
1038 self.trouble(u'ERROR: giving up after %s retries' % retries)
1041 data_len = data.info().get('Content-length', None)
1042 if data_len is not None:
1043 data_len = long(data_len) + resume_len
1044 data_len_str = self.format_bytes(data_len)
1045 byte_counter = 0 + resume_len
1049 # Download and write
1050 before = time.time()
1051 data_block = data.read(block_size)
1053 if len(data_block) == 0:
1055 byte_counter += len(data_block)
1057 # Open file just in time
1060 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1061 assert stream is not None
1062 filename = self.undo_temp_name(tmpfilename)
1063 self.report_destination(filename)
1064 except (OSError, IOError), err:
1065 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1068 stream.write(data_block)
1069 except (IOError, OSError), err:
1070 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1072 block_size = self.best_block_size(after - before, len(data_block))
1075 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1076 if data_len is None:
1077 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1079 percent_str = self.calc_percent(byte_counter, data_len)
1080 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1081 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1084 self.slow_down(start, byte_counter - resume_len)
1087 self.trouble(u'\nERROR: Did not get any data blocks')
1090 self.report_finish()
1091 if data_len is not None and byte_counter != data_len:
1092 raise ContentTooShortError(byte_counter, long(data_len))
1093 self.try_rename(tmpfilename, filename)
1095 # Update file modification time
1096 if self.params.get('updatetime', True):
1097 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1102 class InfoExtractor(object):
1103 """Information Extractor class.
1105 Information extractors are the classes that, given a URL, extract
1106 information from the video (or videos) the URL refers to. This
1107 information includes the real video URL, the video title and simplified
1108 title, author and others. The information is stored in a dictionary
1109 which is then passed to the FileDownloader. The FileDownloader
1110 processes this information possibly downloading the video to the file
1111 system, among other possible outcomes. The dictionaries must include
1112 the following fields:
1114 id: Video identifier.
1115 url: Final video URL.
1116 uploader: Nickname of the video uploader.
1117 title: Literal title.
1118 stitle: Simplified title.
1119 ext: Video filename extension.
1120 format: Video format.
1121 player_url: SWF Player URL (may be None).
1123 The following fields are optional. Their primary purpose is to allow
1124 youtube-dl to serve as the backend for a video search function, such
1125 as the one in youtube2mp3. They are only used when their respective
1126 forced printing functions are called:
1128 thumbnail: Full URL to a video thumbnail image.
1129 description: One-line video description.
1131 Subclasses of this one should re-define the _real_initialize() and
1132 _real_extract() methods and define a _VALID_URL regexp.
1133 Probably, they should also be added to the list of extractors.
# Template-method pattern: the public initialize()/extract() entry points
# delegate to the _real_* hooks that concrete extractors override.
1139 def __init__(self, downloader=None):
1140 """Constructor. Receives an optional downloader."""
1142 self.set_downloader(downloader)
1144 def suitable(self, url):
1145 """Receives a URL and returns True if suitable for this IE."""
# A single regexp gate: subclasses only need to define _VALID_URL.
1146 return re.match(self._VALID_URL, url) is not None
1148 def initialize(self):
1149 """Initializes an instance (authentication, etc)."""
1151 self._real_initialize()
1154 def extract(self, url):
1155 """Extracts URL information and returns it in list of dicts."""
1157 return self._real_extract(url)
1159 def set_downloader(self, downloader):
1160 """Sets the downloader for this IE."""
# The downloader is also used as the reporting/error channel
# (to_screen, to_stderr, trouble) throughout the extractors.
1161 self._downloader = downloader
1163 def _real_initialize(self):
1164 """Real initialization process. Redefine in subclasses."""
1167 def _real_extract(self, url):
1168 """Real extraction process. Redefine in subclasses."""
1172 class YoutubeIE(InfoExtractor):
1173 """Information extractor for youtube.com."""
# Accepts youtu.be short links, youtube(-nocookie).com watch/embed/v URLs,
# and bare video ids; group(2) of a match is the video id.
1175 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
# Forces the site to English so the scraping regexps below match.
1176 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1177 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1178 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Machine name used to look up credentials in ~/.netrc.
1179 _NETRC_MACHINE = 'youtube'
1180 # Listed in order of quality
1181 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
# Same codes, with free (WebM) formats ranked above their non-free peers.
1182 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# Maps itag format codes to file extensions.
1183 _video_extensions = {
1189 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# Maps itag format codes to human-readable dimensions (for --list-formats).
1195 _video_dimensions = {
1211 IE_NAME = u'youtube'
# Thin progress-reporting helpers; all user-visible status goes through
# the downloader's to_screen channel with an '[youtube]' prefix.
1213 def report_lang(self):
1214 """Report attempt to set language."""
1215 self._downloader.to_screen(u'[youtube] Setting language')
1217 def report_login(self):
1218 """Report attempt to log in."""
1219 self._downloader.to_screen(u'[youtube] Logging in')
1221 def report_age_confirmation(self):
1222 """Report attempt to confirm age."""
1223 self._downloader.to_screen(u'[youtube] Confirming age')
1225 def report_video_webpage_download(self, video_id):
1226 """Report attempt to download video webpage."""
1227 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1229 def report_video_info_webpage_download(self, video_id):
1230 """Report attempt to download video info webpage."""
1231 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1233 def report_video_subtitles_download(self, video_id):
1234 """Report attempt to download video subtitles."""
1235 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1237 def report_information_extraction(self, video_id):
1238 """Report attempt to extract video information."""
1239 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1241 def report_unavailable_format(self, video_id, format):
1242 """Report extracted video URL."""
1243 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1245 def report_rtmp_download(self):
1246 """Indicate the download will use the RTMP protocol."""
1247 self._downloader.to_screen(u'[youtube] RTMP download detected')
1249 def _closed_captions_xml_to_srt(self, xml_string):
1251 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1252 # TODO parse xml instead of regex
1253 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1254 if not dur: dur = '4'
1255 start = float(start)
1256 end = start + float(dur)
1257 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1258 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1259 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1260 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1261 srt += str(n) + '\n'
1262 srt += start + ' --> ' + end + '\n'
1263 srt += caption + '\n\n'
1266 def _print_formats(self, formats):
# Prints one line per available format: code, extension, dimensions
# (used by --list-formats).
1267 print 'Available formats:'
# x: a format code from 'formats' -- loop header not visible here, confirm upstream.
1269 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1271 def _real_initialize(self):
# Session setup: resolve credentials, force English UI, log in,
# then confirm age. Failures are reported via the downloader.
1272 if self._downloader is None:
1277 downloader_params = self._downloader.params
1279 # Attempt to use provided username and password or .netrc data
1280 if downloader_params.get('username', None) is not None:
1281 username = downloader_params['username']
1282 password = downloader_params['password']
1283 elif downloader_params.get('usenetrc', False):
1285 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1286 if info is not None:
1290 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1291 except (IOError, netrc.NetrcParseError), err:
# .netrc problems are non-fatal: warn and continue unauthenticated.
1292 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language to English so subsequent page scraping works.
1296 request = urllib2.Request(self._LANG_URL)
1299 urllib2.urlopen(request).read()
1300 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1301 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1304 # No authentication to be performed
1305 if username is None:
# Log in by POSTing the site's login form fields.
1310 'current_form': 'loginForm',
1312 'action_login': 'Log In',
1313 'username': username,
1314 'password': password,
1316 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1319 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
1320 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1321 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1323 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1324 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age to unlock age-restricted videos.
1330 'action_confirm': 'Confirm',
1332 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1334 self.report_age_confirmation()
1335 age_results = urllib2.urlopen(request).read()
1336 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1337 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1340 def _real_extract(self, url):
# Pipeline: parse id -> fetch watch page -> fetch get_video_info ->
# scrape metadata -> pick formats -> hand each to the downloader.
1341 # Extract video id from URL
1342 mobj = re.match(self._VALID_URL, url)
1344 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1346 video_id = mobj.group(2)
# Download the watch page (has_verified=1 skips some interstitials).
1349 self.report_video_webpage_download(video_id)
1350 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1352 video_webpage = urllib2.urlopen(request).read()
1353 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1354 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1357 # Attempt to extract SWF player URL
1358 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1359 if mobj is not None:
# Unescape the JSON-escaped URL (\\/ -> /).
1360 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' contexts; the first response containing a
# 'token' parameter wins.
1365 self.report_video_info_webpage_download(video_id)
1366 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1367 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1368 % (video_id, el_type))
1369 request = urllib2.Request(video_info_url)
1371 video_info_webpage = urllib2.urlopen(request).read()
1372 video_info = parse_qs(video_info_webpage)
1373 if 'token' in video_info:
1375 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1376 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# Without a token the video is unplayable; surface YouTube's own
# 'reason' message when one is provided.
1378 if 'token' not in video_info:
1379 if 'reason' in video_info:
1380 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1382 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1385 # Start extracting information
1386 self.report_information_extraction(video_id)
# Uploader nickname.
1389 if 'author' not in video_info:
1390 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1392 video_uploader = urllib.unquote_plus(video_info['author'][0])
# Title (also kept in a filesystem-safe simplified form).
1395 if 'title' not in video_info:
1396 self._downloader.trouble(u'ERROR: unable to extract video title')
1398 video_title = urllib.unquote_plus(video_info['title'][0])
1399 video_title = video_title.decode('utf-8')
1400 video_title = sanitize_title(video_title)
1403 simple_title = _simplify_title(video_title)
# Thumbnail is optional; fall back to an empty string.
1406 if 'thumbnail_url' not in video_info:
1407 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1408 video_thumbnail = ''
1409 else: # don't panic if we can't find it
1410 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scrape the page and try several date layouts,
# normalizing to YYYYMMDD.
1414 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1415 if mobj is not None:
1416 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1417 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1418 for expression in format_expressions:
1420 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# Description: prefer the full text from the page DOM, falling back
# to the meta tag, then a placeholder.
1428 video_description = u'No description available.'
1429 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1430 if mobj is not None:
1431 video_description = mobj.group(1).decode('utf-8')
1433 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1434 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1435 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1436 # TODO use another parser
# Closed captions: list available languages, pick the requested
# language (or English, or the first listed), convert XML to SRT.
1439 video_subtitles = None
1440 if self._downloader.params.get('writesubtitles', False):
1441 self.report_video_subtitles_download(video_id)
1442 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1444 srt_list = urllib2.urlopen(request).read()
1445 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1446 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1448 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1450 if self._downloader.params.get('subtitleslang', False):
1451 srt_lang = self._downloader.params.get('subtitleslang')
1452 elif 'en' in srt_lang_list:
1455 srt_lang = srt_lang_list[0]
1456 if not srt_lang in srt_lang_list:
1457 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1459 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1461 srt_xml = urllib2.urlopen(request).read()
1462 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1463 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1465 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1467 self._downloader.trouble(u'WARNING: video has no closed captions')
1470 video_token = urllib.unquote_plus(video_info['token'][0])
1472 # Decide which formats to download
1473 req_format = self._downloader.params.get('format', None)
# RTMP streams carry the URL in 'conn'; HTTP streams come as a
# comma-separated itag/url map in 'url_encoded_fmt_stream_map'.
1475 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1476 self.report_rtmp_download()
1477 video_url_list = [(None, video_info['conn'][0])]
1478 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1479 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1480 url_data = [parse_qs(uds) for uds in url_data_strs]
1481 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1482 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
# Honor --max-quality by truncating the preference list.
1484 format_limit = self._downloader.params.get('format_limit', None)
1485 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1486 if format_limit is not None and format_limit in available_formats:
1487 format_list = available_formats[available_formats.index(format_limit):]
1489 format_list = available_formats
1490 existing_formats = [x for x in format_list if x in url_map]
1491 if len(existing_formats) == 0:
1492 self._downloader.trouble(u'ERROR: no known formats available for video')
1494 if self._downloader.params.get('listformats', None):
1495 self._print_formats(existing_formats)
1497 if req_format is None or req_format == 'best':
1498 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1499 elif req_format == 'worst':
1500 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1501 elif req_format in ('-1', 'all'):
1502 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1504 # Specific formats. We pick the first in a slash-delimited sequence.
1505 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1506 req_formats = req_format.split('/')
1507 video_url_list = None
1508 for rf in req_formats:
1510 video_url_list = [(rf, url_map[rf])]
1512 if video_url_list is None:
1513 self._downloader.trouble(u'ERROR: requested format not available')
1516 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Hand each selected (format, url) pair to the downloader.
1519 for format_param, video_real_url in video_url_list:
1520 # At this point we have a new video
1521 self._downloader.increment_downloads()
1524 video_extension = self._video_extensions.get(format_param, 'flv')
1527 # Process video information
1528 self._downloader.process_info({
1529 'id': video_id.decode('utf-8'),
1530 'url': video_real_url.decode('utf-8'),
1531 'uploader': video_uploader.decode('utf-8'),
1532 'upload_date': upload_date,
1533 'title': video_title,
1534 'stitle': simple_title,
1535 'ext': video_extension.decode('utf-8'),
# Pre-ternary Python 2 'and/or' idiom: u'NA' when format_param is None.
1536 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1537 'thumbnail': video_thumbnail.decode('utf-8'),
1538 'description': video_description,
1539 'player_url': player_url,
1540 'subtitles': video_subtitles
1542 except UnavailableVideoError, err:
1543 self._downloader.trouble(u'\nERROR: unable to download video')
1546 class MetacafeIE(InfoExtractor):
1547 """Information Extractor for metacafe.com."""
# group(1) is the numeric video id, group(2) the simplified title slug.
1549 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Endpoints for disabling the family filter before extraction.
1550 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1551 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1553 IE_NAME = u'metacafe'
def __init__(self, youtube_ie, downloader=None):
    """Constructor.

    youtube_ie: the YouTube extractor used when a Metacafe id turns
        out to be a 'yt-' prefixed YouTube video.
    downloader: optional FileDownloader, forwarded to the base class.
    """
    self._youtube_ie = youtube_ie
    InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers, prefixed '[metacafe]'.
1559 def report_disclaimer(self):
1560 """Report disclaimer retrieval."""
1561 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1563 def report_age_confirmation(self):
1564 """Report attempt to confirm age."""
1565 self._downloader.to_screen(u'[metacafe] Confirming age')
1567 def report_download_webpage(self, video_id):
1568 """Report webpage download."""
1569 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1571 def report_extraction(self, video_id):
1572 """Report information extraction."""
1573 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1575 def _real_initialize(self):
# Fetch the family-filter disclaimer page, then POST the
# confirmation form so filtered videos become reachable.
1576 # Retrieve disclaimer
1577 request = urllib2.Request(self._DISCLAIMER)
1579 self.report_disclaimer()
1580 disclaimer = urllib2.urlopen(request).read()
1581 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1582 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Confirm age (disables the family filter for this session).
1588 'submit': "Continue - I'm over 18",
1590 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1592 self.report_age_confirmation()
1593 disclaimer = urllib2.urlopen(request).read()
1594 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1595 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1598 def _real_extract(self, url):
1599 # Extract id and simplified title from URL
1600 mobj = re.match(self._VALID_URL, url)
1602 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1605 video_id = mobj.group(1)
1607 # Check if video comes from YouTube
# 'yt-<id>' ids are YouTube mirrors; delegate to the YouTube extractor.
1608 mobj2 = re.match(r'^yt-(.*)$', video_id)
1609 if mobj2 is not None:
1610 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1613 # At this point we have a new video
1614 self._downloader.increment_downloads()
1616 simple_title = mobj.group(2).decode('utf-8')
1618 # Retrieve video webpage to extract further information
1619 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1621 self.report_download_webpage(video_id)
1622 webpage = urllib2.urlopen(request).read()
1623 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1624 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1627 # Extract URL, uploader and title from webpage
1628 self.report_extraction(video_id)
# Old-style pages expose mediaURL directly; append gdaKey when present.
1629 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1630 if mobj is not None:
1631 mediaURL = urllib.unquote(mobj.group(1))
1632 video_extension = mediaURL[-3:]
1634 # Extract gdaKey if available
1635 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1637 video_url = mediaURL
1639 gdaKey = mobj.group(1)
1640 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Newer pages embed the media URL and key inside the flashvars blob.
1642 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1644 self._downloader.trouble(u'ERROR: unable to extract media URL')
1646 vardict = parse_qs(mobj.group(1))
1647 if 'mediaData' not in vardict:
1648 self._downloader.trouble(u'ERROR: unable to extract media URL')
1650 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1652 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Unescape JSON-escaped slashes in the URL.
1654 mediaURL = mobj.group(1).replace('\\/', '/')
1655 video_extension = mediaURL[-3:]
1656 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1658 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1660 self._downloader.trouble(u'ERROR: unable to extract title')
1662 video_title = mobj.group(1).decode('utf-8')
1663 video_title = sanitize_title(video_title)
1665 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1667 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1669 video_uploader = mobj.group(1)
1672 # Process video information
1673 self._downloader.process_info({
1674 'id': video_id.decode('utf-8'),
1675 'url': video_url.decode('utf-8'),
1676 'uploader': video_uploader.decode('utf-8'),
1677 'upload_date': u'NA',
1678 'title': video_title,
1679 'stitle': simple_title,
1680 'ext': video_extension.decode('utf-8'),
1684 except UnavailableVideoError:
1685 self._downloader.trouble(u'\nERROR: unable to download video')
1688 class DailymotionIE(InfoExtractor):
1689 """Information Extractor for Dailymotion"""
# group(1) is the video id (before the first underscore), group(2) the slug.
1691 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1692 IE_NAME = u'dailymotion'
1694 def __init__(self, downloader=None):
1695 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers, prefixed '[dailymotion]'.
1697 def report_download_webpage(self, video_id):
1698 """Report webpage download."""
1699 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1701 def report_extraction(self, video_id):
1702 """Report information extraction."""
1703 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1705 def _real_extract(self, url):
1706 # Extract id and simplified title from URL
1707 mobj = re.match(self._VALID_URL, url)
1709 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1712 # At this point we have a new video
1713 self._downloader.increment_downloads()
1714 video_id = mobj.group(1)
1716 video_extension = 'flv'
1718 # Retrieve video webpage to extract further information
1719 request = urllib2.Request(url)
# Disable the family filter so restricted videos are served.
1720 request.add_header('Cookie', 'family_filter=off')
1722 self.report_download_webpage(video_id)
1723 webpage = urllib2.urlopen(request).read()
1724 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1725 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1728 # Extract URL, uploader and title from webpage
1729 self.report_extraction(video_id)
# The player 'sequence' flashvar holds the stream URLs.
1730 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1732 self._downloader.trouble(u'ERROR: unable to extract media URL')
1734 sequence = urllib.unquote(mobj.group(1))
1735 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1737 self._downloader.trouble(u'ERROR: unable to extract media URL')
1739 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1741 # if needed add http://www.dailymotion.com/ if relative URL
1743 video_url = mediaURL
1745 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1747 self._downloader.trouble(u'ERROR: unable to extract title')
1749 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1750 video_title = sanitize_title(video_title)
1751 simple_title = _simplify_title(video_title)
1753 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1755 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1757 video_uploader = mobj.group(1)
1760 # Process video information
1761 self._downloader.process_info({
1762 'id': video_id.decode('utf-8'),
1763 'url': video_url.decode('utf-8'),
1764 'uploader': video_uploader.decode('utf-8'),
1765 'upload_date': u'NA',
1766 'title': video_title,
1767 'stitle': simple_title,
1768 'ext': video_extension.decode('utf-8'),
1772 except UnavailableVideoError:
1773 self._downloader.trouble(u'\nERROR: unable to download video')
1776 class GoogleIE(InfoExtractor):
1777 """Information extractor for video.google.com."""
# Matches the localized video.google TLDs; group(1) is the docid.
1779 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1780 IE_NAME = u'video.google'
1782 def __init__(self, downloader=None):
1783 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers, prefixed '[video.google]'.
1785 def report_download_webpage(self, video_id):
1786 """Report webpage download."""
1787 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1789 def report_extraction(self, video_id):
1790 """Report information extraction."""
1791 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1793 def _real_extract(self, url):
1794 # Extract id from URL
1795 mobj = re.match(self._VALID_URL, url)
1797 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1800 # At this point we have a new video
1801 self._downloader.increment_downloads()
1802 video_id = mobj.group(1)
1804 video_extension = 'mp4'
1806 # Retrieve video webpage to extract further information
1807 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1809 self.report_download_webpage(video_id)
1810 webpage = urllib2.urlopen(request).read()
1811 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1812 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1815 # Extract URL, uploader, and title from webpage
1816 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the flv videoUrl.
1817 mobj = re.search(r"download_url:'([^']+)'", webpage)
1819 video_extension = 'flv'
1820 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1822 self._downloader.trouble(u'ERROR: unable to extract media URL')
1824 mediaURL = urllib.unquote(mobj.group(1))
# Decode the JS hex escapes left in the URL (= and &).
1825 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1826 mediaURL = mediaURL.replace('\\x26', '\x26')
1828 video_url = mediaURL
1830 mobj = re.search(r'<title>(.*)</title>', webpage)
1832 self._downloader.trouble(u'ERROR: unable to extract title')
1834 video_title = mobj.group(1).decode('utf-8')
1835 video_title = sanitize_title(video_title)
1836 simple_title = _simplify_title(video_title)
1838 # Extract video description
1839 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1841 self._downloader.trouble(u'ERROR: unable to extract video description')
1843 video_description = mobj.group(1).decode('utf-8')
1844 if not video_description:
1845 video_description = 'No description available.'
1847 # Extract video thumbnail
# The thumbnail lives on the search results page, so only fetch it
# when --get-thumbnail explicitly asks for it.
1848 if self._downloader.params.get('forcethumbnail', False):
1849 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1851 webpage = urllib2.urlopen(request).read()
1852 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1853 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1855 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1857 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1859 video_thumbnail = mobj.group(1)
1860 else: # we need something to pass to process_info
1861 video_thumbnail = ''
1864 # Process video information
1865 self._downloader.process_info({
1866 'id': video_id.decode('utf-8'),
1867 'url': video_url.decode('utf-8'),
1869 'upload_date': u'NA',
1870 'title': video_title,
1871 'stitle': simple_title,
1872 'ext': video_extension.decode('utf-8'),
1876 except UnavailableVideoError:
1877 self._downloader.trouble(u'\nERROR: unable to download video')
1880 class PhotobucketIE(InfoExtractor):
1881 """Information extractor for photobucket.com."""
# group(1) is the .flv filename from the 'current' query parameter.
1883 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1884 IE_NAME = u'photobucket'
1886 def __init__(self, downloader=None):
1887 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers, prefixed '[photobucket]'.
1889 def report_download_webpage(self, video_id):
1890 """Report webpage download."""
1891 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1893 def report_extraction(self, video_id):
1894 """Report information extraction."""
1895 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1897 def _real_extract(self, url):
1898 # Extract id from URL
1899 mobj = re.match(self._VALID_URL, url)
1901 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1904 # At this point we have a new video
1905 self._downloader.increment_downloads()
1906 video_id = mobj.group(1)
1908 video_extension = 'flv'
1910 # Retrieve video webpage to extract further information
1911 request = urllib2.Request(url)
1913 self.report_download_webpage(video_id)
1914 webpage = urllib2.urlopen(request).read()
1915 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1916 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1919 # Extract URL, uploader, and title from webpage
1920 self.report_extraction(video_id)
1921 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1923 self._downloader.trouble(u'ERROR: unable to extract media URL')
1925 mediaURL = urllib.unquote(mobj.group(1))
1927 video_url = mediaURL
1929 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1931 self._downloader.trouble(u'ERROR: unable to extract title')
1933 video_title = mobj.group(1).decode('utf-8')
1934 video_title = sanitize_title(video_title)
1935 simple_title = _simplify_title(vide_title)
1937 video_uploader = mobj.group(2).decode('utf-8')
1940 # Process video information
1941 self._downloader.process_info({
1942 'id': video_id.decode('utf-8'),
1943 'url': video_url.decode('utf-8'),
1944 'uploader': video_uploader,
1945 'upload_date': u'NA',
1946 'title': video_title,
1947 'stitle': simple_title,
1948 'ext': video_extension.decode('utf-8'),
1952 except UnavailableVideoError:
1953 self._downloader.trouble(u'\nERROR: unable to download video')
1956 class YahooIE(InfoExtractor):
1957 """Information extractor for video.yahoo.com."""
1959 # _VALID_URL matches all Yahoo! Video URLs
1960 # _VPAGE_URL matches only the extractable '/watch/' URLs
1961 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1962 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1963 IE_NAME = u'video.yahoo'
1965 def __init__(self, downloader=None):
1966 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers, prefixed '[video.yahoo]'.
1968 def report_download_webpage(self, video_id):
1969 """Report webpage download."""
1970 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1972 def report_extraction(self, video_id):
1973 """Report information extraction."""
1974 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1976 def _real_extract(self, url, new_video=True):
# Non-/watch/ URLs are first rewritten to the canonical /watch/ form
# and re-dispatched once (new_video=False guards against looping).
1977 # Extract ID from URL
1978 mobj = re.match(self._VALID_URL, url)
1980 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1983 # At this point we have a new video
1984 self._downloader.increment_downloads()
1985 video_id = mobj.group(2)
1986 video_extension = 'flv'
1988 # Rewrite valid but non-extractable URLs as
1989 # extractable English language /watch/ URLs
1990 if re.match(self._VPAGE_URL, url) is None:
1991 request = urllib2.Request(url)
1993 webpage = urllib2.urlopen(request).read()
1994 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1995 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1998 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2000 self._downloader.trouble(u'ERROR: Unable to extract id field')
2002 yahoo_id = mobj.group(1)
2004 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2006 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2008 yahoo_vid = mobj.group(1)
2010 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2011 return self._real_extract(url, new_video=False)
2013 # Retrieve video webpage to extract further information
2014 request = urllib2.Request(url)
2016 self.report_download_webpage(video_id)
2017 webpage = urllib2.urlopen(request).read()
2018 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2019 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2022 # Extract uploader and title from webpage
2023 self.report_extraction(video_id)
2024 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2026 self._downloader.trouble(u'ERROR: unable to extract video title')
2028 video_title = mobj.group(1).decode('utf-8')
2029 simple_title = _simplify_title(video_title)
2031 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2033 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the 'people|profile' alternation, not the
# uploader name in group(2) -- looks off; confirm against a live page.
2035 video_uploader = mobj.group(1).decode('utf-8')
2037 # Extract video thumbnail
2038 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2040 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2042 video_thumbnail = mobj.group(1).decode('utf-8')
2044 # Extract video description
2045 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2047 self._downloader.trouble(u'ERROR: unable to extract video description')
2049 video_description = mobj.group(1).decode('utf-8')
2050 if not video_description:
2051 video_description = 'No description available.'
2053 # Extract video height and width
2054 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2056 self._downloader.trouble(u'ERROR: unable to extract video height')
2058 yv_video_height = mobj.group(1)
2060 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2062 self._downloader.trouble(u'ERROR: unable to extract video width')
2064 yv_video_width = mobj.group(1)
2066 # Retrieve video playlist to extract media URL
2067 # I'm not completely sure what all these options are, but we
2068 # seem to need most of them, otherwise the server sends a 401.
2069 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2070 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2071 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2072 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2073 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2075 self.report_download_webpage(video_id)
2076 webpage = urllib2.urlopen(request).read()
2077 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2078 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2081 # Extract media URL from playlist XML
2082 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2084 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2086 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2087 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2090 # Process video information
2091 self._downloader.process_info({
2092 'id': video_id.decode('utf-8'),
2094 'uploader': video_uploader,
2095 'upload_date': u'NA',
2096 'title': video_title,
2097 'stitle': simple_title,
2098 'ext': video_extension.decode('utf-8'),
2099 'thumbnail': video_thumbnail.decode('utf-8'),
2100 'description': video_description,
# NOTE(review): duplicate 'thumbnail' key -- this plain value silently
# overrides the decoded one two lines up; one of the two should go.
2101 'thumbnail': video_thumbnail,
2104 except UnavailableVideoError:
2105 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this region is a partially elided, line-numbered listing
# (leading integers are original source line numbers; `if mobj is None:`,
# `try:`, `return` and similar lines are missing). Comments document only
# what the visible lines show; the text is not runnable as-is.
#
# VimeoIE: extractor for vimeo.com pages. Downloads the video page, pulls the
# embedded `{config: ...}` JSON out of the HTML, picks the first available
# codec (h264 -> mp4, vp8/vp6 -> flv), and hands a play_redirect URL with the
# request signature/timestamp to the downloader.
2108 class VimeoIE(InfoExtractor):
2109 """Information extractor for vimeo.com."""
2111 # _VALID_URL matches Vimeo URLs
2112 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2115 def __init__(self, downloader=None):
2116 InfoExtractor.__init__(self, downloader)
# Progress reporters: write status lines through the shared FileDownloader.
2118 def report_download_webpage(self, video_id):
2119 """Report webpage download."""
2120 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2122 def report_extraction(self, video_id):
2123 """Report information extraction."""
2124 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Main extraction entry point. Visible steps: match the URL, fetch the page
# with the module-level std_headers, parse the config JSON, then collect
# title/uploader/thumbnail/description/upload date and codec/quality.
2126 def _real_extract(self, url, new_video=True):
2127 # Extract ID from URL
2128 mobj = re.match(self._VALID_URL, url)
2130 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2133 # At this point we have a new video
2134 self._downloader.increment_downloads()
2135 video_id = mobj.group(1)
2137 # Retrieve video webpage to extract further information
2138 request = urllib2.Request(url, None, std_headers)
2140 self.report_download_webpage(video_id)
2141 webpage = urllib2.urlopen(request).read()
2142 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2143 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2146 # Now we begin extracting as much information as we can from what we
2147 # retrieved. First we extract the information common to all extractors,
2148 # and latter we extract those that are Vimeo specific.
2149 self.report_extraction(video_id)
2151 # Extract the config JSON
# Crude string slicing: takes whatever sits between ' = {config:' and
# ',assets:' in the page source and feeds it to json.loads.
2152 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2154 config = json.loads(config)
2156 self._downloader.trouble(u'ERROR: unable to extract info section')
2160 video_title = config["video"]["title"]
2161 simple_title = _simplify_title(video_title)
2164 video_uploader = config["video"]["owner"]["name"]
2166 # Extract video thumbnail
2167 video_thumbnail = config["video"]["thumbnail"]
2169 # Extract video description
# Default first, then a meta-tag regex; the lxml branch below appears to be
# a fallback path -- presumably taken when the regex fails (guard elided).
2173 video_description = u'No description available.'
2174 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2175 if mobj is not None:
2176 video_description = mobj.group(1)
2178 html_parser = lxml.etree.HTMLParser()
2179 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2180 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2181 # TODO use another parser
2183 # Extract upload date
2184 video_upload_date = u'NA'
2185 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2186 if mobj is not None:
2187 video_upload_date = mobj.group(1)
2189 # Vimeo specific: extract request signature and timestamp
2190 sig = config['request']['signature']
2191 timestamp = config['request']['timestamp']
2193 # Vimeo specific: extract video codec and quality information
2194 # TODO bind to format param
# First codec present in config["video"]["files"] wins; an 'hd' entry under
# that codec selects HD, otherwise SD.
2195 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2196 for codec in codecs:
2197 if codec[0] in config["video"]["files"]:
2198 video_codec = codec[0]
2199 video_extension = codec[1]
2200 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2201 else: quality = 'sd'
2204 self._downloader.trouble(u'ERROR: no known codec found')
# Build the actual media URL from id + signature + timestamp + quality/codec.
2207 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2208 %(video_id, sig, timestamp, quality, video_codec.upper())
2211 # Process video information
2212 self._downloader.process_info({
2215 'uploader': video_uploader,
2216 'upload_date': video_upload_date,
2217 'title': video_title,
2218 'stitle': simple_title,
2219 'ext': video_extension,
2220 'thumbnail': video_thumbnail,
2221 'description': video_description,
2224 except UnavailableVideoError:
2225 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing). Comments
# describe only what the visible lines show.
#
# GenericIE: last-resort extractor for arbitrary pages. Scrapes a media URL
# out of JW Player flashvars (`file=...`) or a generic `file=`/`source=`
# parameter, derives id/extension from the URL's basename, and takes the page
# <title> as the video title and the host name as the uploader.
2228 class GenericIE(InfoExtractor):
2229 """Generic last-resort information extractor."""
2232 IE_NAME = u'generic'
2234 def __init__(self, downloader=None):
2235 InfoExtractor.__init__(self, downloader)
2237 def report_download_webpage(self, video_id):
2238 """Report webpage download."""
# Deliberately loud: warns the user that no site-specific extractor matched.
2239 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2240 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2242 def report_extraction(self, video_id):
2243 """Report information extraction."""
2244 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2246 def _real_extract(self, url):
2247 # At this point we have a new video
2248 self._downloader.increment_downloads()
# Provisional id: last path segment; replaced below once the real media URL
# is known.
2250 video_id = url.split('/')[-1]
2251 request = urllib2.Request(url)
2253 self.report_download_webpage(video_id)
2254 webpage = urllib2.urlopen(request).read()
2255 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2256 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2258 except ValueError, err:
2259 # since this is the last-resort InfoExtractor, if
2260 # this error is thrown, it'll be thrown here
2261 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2264 self.report_extraction(video_id)
2265 # Start with something easy: JW Player in SWFObject
2266 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2268 # Broaden the search a little bit
2269 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2271 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2274 # It's possible that one of the regexes
2275 # matched, but returned an empty group:
2276 if mobj.group(1) is None:
2277 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2280 video_url = urllib.unquote(mobj.group(1))
2281 video_id = os.path.basename(video_url)
2283 # here's a fun little line of code for you:
2284 video_extension = os.path.splitext(video_id)[1][1:]
2285 video_id = os.path.splitext(video_id)[0]
2287 # it's tempting to parse this further, but you would
2288 # have to take into account all the variations like
2289 # Video Title - Site Name
2290 # Site Name | Video Title
2291 # Video Title - Tagline | Site Name
2292 # and so on and so forth; it's just not practical
2293 mobj = re.search(r'<title>(.*)</title>', webpage)
2295 self._downloader.trouble(u'ERROR: unable to extract title')
2297 video_title = mobj.group(1).decode('utf-8')
2298 video_title = sanitize_title(video_title)
2299 simple_title = _simplify_title(video_title)
2301 # video uploader is domain name
2302 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error message says "title" but this branch is about the
# uploader/domain -- looks like a copy-paste slip in the original.
2304 self._downloader.trouble(u'ERROR: unable to extract title')
2306 video_uploader = mobj.group(1).decode('utf-8')
2309 # Process video information
2310 self._downloader.process_info({
2311 'id': video_id.decode('utf-8'),
2312 'url': video_url.decode('utf-8'),
2313 'uploader': video_uploader,
2314 'upload_date': u'NA',
2315 'title': video_title,
2316 'stitle': simple_title,
2317 'ext': video_extension.decode('utf-8'),
2321 except UnavailableVideoError, err:
2322 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# YoutubeSearchIE: handles `ytsearch:`, `ytsearchN:` and `ytsearchall:`
# pseudo-URLs. Queries the YouTube GData API 50 results per page and hands
# each collected video id to the wrapped YouTube extractor.
2326 class YoutubeSearchIE(InfoExtractor):
2327 """Information Extractor for YouTube search queries."""
2328 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2329 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2330 _max_youtube_results = 1000
2331 IE_NAME = u'youtube:search'
2333 def __init__(self, youtube_ie, downloader=None):
2334 InfoExtractor.__init__(self, downloader)
# Actual downloading is delegated to this per-video YouTube extractor.
2335 self._youtube_ie = youtube_ie
2337 def report_download_page(self, query, pagenum):
2338 """Report attempt to download playlist page with given number."""
2339 query = query.decode(preferredencoding())
2340 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2342 def _real_initialize(self):
2343 self._youtube_ie.initialize()
# Parses the prefix: bare prefix -> 1 result, 'all' -> _max_youtube_results,
# numeric -> that many (clamped to the maximum with a warning).
2345 def _real_extract(self, query):
2346 mobj = re.match(self._VALID_URL, query)
2348 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2351 prefix, query = query.split(':')
2353 query = query.encode('utf-8')
2355 self._download_n_results(query, 1)
2357 elif prefix == 'all':
2358 self._download_n_results(query, self._max_youtube_results)
2364 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2366 elif n > self._max_youtube_results:
2367 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2368 n = self._max_youtube_results
2369 self._download_n_results(query, n)
2371 except ValueError: # parsing prefix as integer fails
2372 self._download_n_results(query, 1)
2375 def _download_n_results(self, query, n):
2376 """Downloads a specified number of results for a query"""
# Pages the API (50 ids per request) until `limit` -- the lesser of n and
# the API's totalItems -- is covered, then dispatches each id.
2382 while (50 * pagenum) < limit:
2383 self.report_download_page(query, pagenum+1)
2384 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2385 request = urllib2.Request(result_url)
2387 data = urllib2.urlopen(request).read()
2388 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2389 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2391 api_response = json.loads(data)['data']
2393 new_ids = list(video['id'] for video in api_response['items'])
2394 video_ids += new_ids
2396 limit = min(n, api_response['totalItems'])
2399 if len(video_ids) > n:
2400 video_ids = video_ids[:n]
2401 for id in video_ids:
2402 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# GoogleSearchIE: handles `gvsearch:`/`gvsearchN:`/`gvsearchall:` queries.
# Unlike the YouTube search extractor it scrapes HTML result pages (10 per
# page), collecting docids until n results are found or there is no
# "next page" link, then delegates each to the Google Video extractor.
2406 class GoogleSearchIE(InfoExtractor):
2407 """Information Extractor for Google Video search queries."""
2408 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2409 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2410 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2411 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2413 _max_google_results = 1000
2414 IE_NAME = u'video.google:search'
2416 def __init__(self, google_ie, downloader=None):
2417 InfoExtractor.__init__(self, downloader)
2418 self._google_ie = google_ie
2420 def report_download_page(self, query, pagenum):
2421 """Report attempt to download playlist page with given number."""
2422 query = query.decode(preferredencoding())
2423 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2425 def _real_initialize(self):
2426 self._google_ie.initialize()
# Prefix parsing mirrors YoutubeSearchIE: bare -> 1, 'all' -> max, numeric ->
# clamped to _max_google_results.
2428 def _real_extract(self, query):
2429 mobj = re.match(self._VALID_URL, query)
2431 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2434 prefix, query = query.split(':')
2436 query = query.encode('utf-8')
2438 self._download_n_results(query, 1)
2440 elif prefix == 'all':
2441 self._download_n_results(query, self._max_google_results)
2447 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2449 elif n > self._max_google_results:
2450 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2451 n = self._max_google_results
2452 self._download_n_results(query, n)
2454 except ValueError: # parsing prefix as integer fails
2455 self._download_n_results(query, 1)
2458 def _download_n_results(self, query, n):
2459 """Downloads a specified number of results for a query"""
2465 self.report_download_page(query, pagenum)
2466 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2467 request = urllib2.Request(result_url)
2469 page = urllib2.urlopen(request).read()
2470 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2471 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2474 # Extract video identifiers
2475 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2476 video_id = mobj.group(1)
2477 if video_id not in video_ids:
2478 video_ids.append(video_id)
2479 if len(video_ids) == n:
2480 # Specified n videos reached
2481 for id in video_ids:
2482 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "next" link on the page means the results are exhausted -- dispatch
# whatever was collected so far.
2485 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2486 for id in video_ids:
2487 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2490 pagenum = pagenum + 1
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# YahooSearchIE: handles `yvsearch:`/`yvsearchN:`/`yvsearchall:` queries.
# Scrapes Yahoo! Video HTML result pages; structure parallels GoogleSearchIE
# except that an explicit `already_seen` set deduplicates ids.
2493 class YahooSearchIE(InfoExtractor):
2494 """Information Extractor for Yahoo! Video search queries."""
2495 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2496 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2497 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2498 _MORE_PAGES_INDICATOR = r'\s*Next'
2500 _max_yahoo_results = 1000
2501 IE_NAME = u'video.yahoo:search'
2503 def __init__(self, yahoo_ie, downloader=None):
2504 InfoExtractor.__init__(self, downloader)
2505 self._yahoo_ie = yahoo_ie
2507 def report_download_page(self, query, pagenum):
2508 """Report attempt to download playlist page with given number."""
2509 query = query.decode(preferredencoding())
2510 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2512 def _real_initialize(self):
2513 self._yahoo_ie.initialize()
# Same prefix handling as the other search IEs: bare -> 1, 'all' -> max,
# numeric -> clamped to _max_yahoo_results.
2515 def _real_extract(self, query):
2516 mobj = re.match(self._VALID_URL, query)
2518 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2521 prefix, query = query.split(':')
2523 query = query.encode('utf-8')
2525 self._download_n_results(query, 1)
2527 elif prefix == 'all':
2528 self._download_n_results(query, self._max_yahoo_results)
2534 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2536 elif n > self._max_yahoo_results:
2537 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2538 n = self._max_yahoo_results
2539 self._download_n_results(query, n)
2541 except ValueError: # parsing prefix as integer fails
2542 self._download_n_results(query, 1)
2545 def _download_n_results(self, query, n):
2546 """Downloads a specified number of results for a query"""
2549 already_seen = set()
2553 self.report_download_page(query, pagenum)
2554 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2555 request = urllib2.Request(result_url)
2557 page = urllib2.urlopen(request).read()
2558 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2559 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2562 # Extract video identifiers
2563 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2564 video_id = mobj.group(1)
2565 if video_id not in already_seen:
2566 video_ids.append(video_id)
2567 already_seen.add(video_id)
2568 if len(video_ids) == n:
2569 # Specified n videos reached
2570 for id in video_ids:
2571 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link -> results exhausted; dispatch what we have.
2574 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2575 for id in video_ids:
2576 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2579 pagenum = pagenum + 1
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# YoutubePlaylistIE: handles playlist/artist/course/user URLs. Pages through
# the playlist HTML, collects watch?v= ids, applies the downloader's
# playliststart/playlistend window, and delegates each id to the wrapped
# YouTube extractor.
2582 class YoutubePlaylistIE(InfoExtractor):
2583 """Information Extractor for YouTube playlists."""
2585 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2586 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2587 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2588 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2590 IE_NAME = u'youtube:playlist'
2592 def __init__(self, youtube_ie, downloader=None):
2593 InfoExtractor.__init__(self, downloader)
2594 self._youtube_ie = youtube_ie
2596 def report_download_page(self, playlist_id, pagenum):
2597 """Report attempt to download playlist page with given number."""
2598 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2600 def _real_initialize(self):
2601 self._youtube_ie.initialize()
2603 def _real_extract(self, url):
2604 # Extract playlist id
2605 mobj = re.match(self._VALID_URL, url)
2607 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 matching means the URL pointed at a single video inside the
# playlist -- delegate that one video directly.
2611 if mobj.group(3) is not None:
2612 self._youtube_ie.extract(mobj.group(3))
2615 # Download playlist pages
2616 # prefix is 'p' as default for playlists but there are other types that need extra care
2617 playlist_prefix = mobj.group(1)
2618 if playlist_prefix == 'a':
2619 playlist_access = 'artist'
2621 playlist_prefix = 'p'
2622 playlist_access = 'view_play_list'
2623 playlist_id = mobj.group(2)
2628 self.report_download_page(playlist_id, pagenum)
2629 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2630 request = urllib2.Request(url)
2632 page = urllib2.urlopen(request).read()
2633 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2634 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2637 # Extract video identifiers
2639 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2640 if mobj.group(1) not in ids_in_page:
2641 ids_in_page.append(mobj.group(1))
2642 video_ids.extend(ids_in_page)
2644 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2646 pagenum = pagenum + 1
# Window the collected ids by the user's --playlist-start/--playlist-end
# options (playliststart is 1-based in params, hence the -1).
2648 playliststart = self._downloader.params.get('playliststart', 1) - 1
2649 playlistend = self._downloader.params.get('playlistend', -1)
2650 if playlistend == -1:
2651 video_ids = video_ids[playliststart:]
2653 video_ids = video_ids[playliststart:playlistend]
2655 for id in video_ids:
2656 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# YoutubeUserIE: handles youtube.com/user/NAME and ytuser:NAME. Pages the
# GData uploads feed in chunks of _GDATA_PAGE_SIZE, stops early when a page
# comes back less than full, applies playliststart/playlistend, and delegates
# each id to the wrapped YouTube extractor.
2660 class YoutubeUserIE(InfoExtractor):
2661 """Information Extractor for YouTube users."""
2663 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2664 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2665 _GDATA_PAGE_SIZE = 50
2666 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2667 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2669 IE_NAME = u'youtube:user'
2671 def __init__(self, youtube_ie, downloader=None):
2672 InfoExtractor.__init__(self, downloader)
2673 self._youtube_ie = youtube_ie
2675 def report_download_page(self, username, start_index):
2676 """Report attempt to download user page."""
2677 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2678 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2680 def _real_initialize(self):
2681 self._youtube_ie.initialize()
2683 def _real_extract(self, url):
2685 mobj = re.match(self._VALID_URL, url)
2687 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2690 username = mobj.group(1)
2692 # Download video ids using YouTube Data API. Result size per
2693 # query is limited (currently to 50 videos) so we need to query
2694 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2701 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2702 self.report_download_page(username, start_index)
2704 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2707 page = urllib2.urlopen(request).read()
2708 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2709 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2712 # Extract video identifiers
2715 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2716 if mobj.group(1) not in ids_in_page:
2717 ids_in_page.append(mobj.group(1))
2719 video_ids.extend(ids_in_page)
2721 # A little optimization - if current page is not
2722 # "full", ie. does not contain PAGE_SIZE video ids then
2723 # we can assume that this page is the last one - there
2724 # are no more ids on further pages - no need to query
2727 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Same 1-based playliststart / playlistend windowing as YoutubePlaylistIE.
2732 all_ids_count = len(video_ids)
2733 playliststart = self._downloader.params.get('playliststart', 1) - 1
2734 playlistend = self._downloader.params.get('playlistend', -1)
2736 if playlistend == -1:
2737 video_ids = video_ids[playliststart:]
2739 video_ids = video_ids[playliststart:playlistend]
2741 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2742 (username, all_ids_count, len(video_ids)))
2744 for video_id in video_ids:
2745 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# DepositFilesIE: extractor for depositfiles.com file pages. Rebuilds the URL
# in the English locale, simulates pressing the "Free download" button by
# POSTing gateway_result=1, then scrapes the real fileshare URL out of the
# response (or surfaces the site's restriction message on failure).
2748 class DepositFilesIE(InfoExtractor):
2749 """Information extractor for depositfiles.com"""
2751 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2752 IE_NAME = u'DepositFiles'
2754 def __init__(self, downloader=None):
2755 InfoExtractor.__init__(self, downloader)
2757 def report_download_webpage(self, file_id):
2758 """Report webpage download."""
2759 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2761 def report_extraction(self, file_id):
2762 """Report information extraction."""
2763 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2765 def _real_extract(self, url):
2766 # At this point we have a new file
2767 self._downloader.increment_downloads()
2769 file_id = url.split('/')[-1]
2770 # Rebuild url in english locale
2771 url = 'http://depositfiles.com/en/files/' + file_id
2773 # Retrieve file webpage with 'Free download' button pressed
# Passing urlencoded data to urllib2.Request makes this a POST.
2774 free_download_indication = { 'gateway_result' : '1' }
2775 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2777 self.report_download_webpage(file_id)
2778 webpage = urllib2.urlopen(request).read()
2779 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2780 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2783 # Search for the real file URL
2784 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2785 if (mobj is None) or (mobj.group(1) is None):
2786 # Try to figure out reason of the error.
2787 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2788 if (mobj is not None) and (mobj.group(1) is not None):
2789 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2790 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2792 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2795 file_url = mobj.group(1)
2796 file_extension = os.path.splitext(file_url)[1][1:]
2798 # Search for file title
2799 mobj = re.search(r'<b title="(.*?)">', webpage)
2801 self._downloader.trouble(u'ERROR: unable to extract title')
2803 file_title = mobj.group(1).decode('utf-8')
2806 # Process file information
2807 self._downloader.process_info({
2808 'id': file_id.decode('utf-8'),
2809 'url': file_url.decode('utf-8'),
2811 'upload_date': u'NA',
2812 'title': file_title,
2813 'stitle': file_title,
2814 'ext': file_extension.decode('utf-8'),
2818 except UnavailableVideoError, err:
2819 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# FacebookIE: extractor for Facebook videos. Logs in during initialization
# (credentials from CLI options or .netrc under machine 'facebook'), scrapes
# video metadata and per-format URLs out of JavaScript segments in the video
# page, then selects formats the same way the YouTube extractor does.
2822 class FacebookIE(InfoExtractor):
2823 """Information Extractor for Facebook"""
2825 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2826 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2827 _NETRC_MACHINE = 'facebook'
# Format names ordered best-first; used for format selection below.
2828 _available_formats = ['video', 'highqual', 'lowqual']
2829 _video_extensions = {
2834 IE_NAME = u'facebook'
2836 def __init__(self, downloader=None):
2837 InfoExtractor.__init__(self, downloader)
2839 def _reporter(self, message):
2840 """Add header and report message."""
2841 self._downloader.to_screen(u'[facebook] %s' % message)
2843 def report_login(self):
2844 """Report attempt to log in."""
2845 self._reporter(u'Logging in')
2847 def report_video_webpage_download(self, video_id):
2848 """Report attempt to download video webpage."""
2849 self._reporter(u'%s: Downloading video webpage' % video_id)
2851 def report_information_extraction(self, video_id):
2852 """Report attempt to extract video information."""
2853 self._reporter(u'%s: Extracting video information' % video_id)
# Scrapes metadata and per-format media URLs from JS calls embedded in the
# page; values are unicode-escaped inside the (utf-8) page, hence the
# unquote_plus + decode("unicode_escape") dance.
2855 def _parse_page(self, video_webpage):
2856 """Extract video information from page"""
2858 data = {'title': r'\("video_title", "(.*?)"\)',
2859 'description': r'<div class="datawrap">(.*?)</div>',
2860 'owner': r'\("video_owner_name", "(.*?)"\)',
2861 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2864 for piece in data.keys():
2865 mobj = re.search(data[piece], video_webpage)
2866 if mobj is not None:
2867 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2871 for fmt in self._available_formats:
2872 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2873 if mobj is not None:
2874 # URL is in a Javascript segment inside an escaped Unicode format within
2875 # the generally utf-8 page
2876 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2877 video_info['video_urls'] = video_urls
# Login happens once, at initialization: credentials come from the
# downloader's username/password params, or from .netrc when --netrc is set.
2881 def _real_initialize(self):
2882 if self._downloader is None:
2887 downloader_params = self._downloader.params
2889 # Attempt to use provided username and password or .netrc data
2890 if downloader_params.get('username', None) is not None:
2891 useremail = downloader_params['username']
2892 password = downloader_params['password']
2893 elif downloader_params.get('usenetrc', False):
2895 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2896 if info is not None:
2900 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2901 except (IOError, netrc.NetrcParseError), err:
2902 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2905 if useremail is None:
2914 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2917 login_results = urllib2.urlopen(request).read()
# A login form in the response means the login failed (bad credentials or
# rate limit); failure is a warning, not fatal.
2918 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2919 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2921 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2922 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2925 def _real_extract(self, url):
2926 mobj = re.match(self._VALID_URL, url)
2928 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2930 video_id = mobj.group('ID')
2933 self.report_video_webpage_download(video_id)
2934 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2936 page = urllib2.urlopen(request)
2937 video_webpage = page.read()
2938 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2939 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2942 # Start extracting information
2943 self.report_information_extraction(video_id)
2945 # Extract information
2946 video_info = self._parse_page(video_webpage)
2949 if 'owner' not in video_info:
2950 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2952 video_uploader = video_info['owner']
2955 if 'title' not in video_info:
2956 self._downloader.trouble(u'ERROR: unable to extract video title')
2958 video_title = video_info['title']
2959 video_title = video_title.decode('utf-8')
2960 video_title = sanitize_title(video_title)
2962 simple_title = _simplify_title(video_title)
# Missing thumbnail is non-fatal: warn and fall back to the empty string.
2965 if 'thumbnail' not in video_info:
2966 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2967 video_thumbnail = ''
2969 video_thumbnail = video_info['thumbnail']
2973 if 'upload_date' in video_info:
2974 upload_time = video_info['upload_date']
2975 timetuple = email.utils.parsedate_tz(upload_time)
2976 if timetuple is not None:
2978 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2983 video_description = video_info.get('description', 'No description available.')
# Format selection mirrors the YouTube extractor: optionally limit the
# candidate list with format_limit, then honor -f (None -> best, 'worst' ->
# last, '-1' -> all formats, otherwise the named format).
2985 url_map = video_info['video_urls']
2986 if len(url_map.keys()) > 0:
2987 # Decide which formats to download
2988 req_format = self._downloader.params.get('format', None)
2989 format_limit = self._downloader.params.get('format_limit', None)
2991 if format_limit is not None and format_limit in self._available_formats:
2992 format_list = self._available_formats[self._available_formats.index(format_limit):]
2994 format_list = self._available_formats
2995 existing_formats = [x for x in format_list if x in url_map]
2996 if len(existing_formats) == 0:
2997 self._downloader.trouble(u'ERROR: no known formats available for video')
2999 if req_format is None:
3000 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3001 elif req_format == 'worst':
3002 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3003 elif req_format == '-1':
3004 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3007 if req_format not in url_map:
3008 self._downloader.trouble(u'ERROR: requested format not available')
3010 video_url_list = [(req_format, url_map[req_format])] # Specific format
3012 for format_param, video_real_url in video_url_list:
3014 # At this point we have a new video
3015 self._downloader.increment_downloads()
3018 video_extension = self._video_extensions.get(format_param, 'mp4')
3021 # Process video information
3022 self._downloader.process_info({
3023 'id': video_id.decode('utf-8'),
3024 'url': video_real_url.decode('utf-8'),
3025 'uploader': video_uploader.decode('utf-8'),
3026 'upload_date': upload_date,
3027 'title': video_title,
3028 'stitle': simple_title,
3029 'ext': video_extension.decode('utf-8'),
3030 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3031 'thumbnail': video_thumbnail.decode('utf-8'),
3032 'description': video_description.decode('utf-8'),
3035 except UnavailableVideoError, err:
3036 self._downloader.trouble(u'\nERROR: unable to download video')
3038 class BlipTVIE(InfoExtractor):
3039 """Information extractor for blip.tv"""
# NOTE(review): this excerpt is elided (original line numbers jump) and
# indentation was stripped; guard lines such as `if mobj is None:` / `try:`
# are missing from view — confirm against the full file.
3041 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
3042 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3043 IE_NAME = u'blip.tv'
3045 def report_extraction(self, file_id):
3046 """Report information extraction."""
3047 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3049 def report_direct_download(self, title):
3050 """Report information extraction."""
3051 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3053 def _real_extract(self, url):
# Fetch the page as JSON (skin=json); if the server answers with a video/*
# Content-Type instead, treat it as a direct media download.
3054 mobj = re.match(self._VALID_URL, url)
3056 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3063 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3064 request = urllib2.Request(json_url)
3065 self.report_extraction(mobj.group(1))
3068 urlh = urllib2.urlopen(request)
3069 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3070 basename = url.split('/')[-1]
3071 title,ext = os.path.splitext(basename)
3072 title = title.decode('UTF-8')
3073 ext = ext.replace('.', '')
3074 self.report_direct_download(title)
3079 'stitle': _simplify_title(title),
3083 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3084 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3086 if info is None: # Regular URL
# Parse the JSON payload; the useful record lives under 'Post' when present.
3088 json_code = urlh.read()
3089 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3090 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3094 json_data = json.loads(json_code)
3095 if 'Post' in json_data:
3096 data = json_data['Post']
# blip.tv timestamps look like '08-25-11 12:30PM'; normalized to YYYYMMDD.
3100 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3101 video_url = data['media']['url']
# The file extension is taken from the media URL itself.
3102 umobj = re.match(self._URL_EXT, video_url)
3104 raise ValueError('Can not determine filename extension')
3105 ext = umobj.group(1)
3108 'id': data['item_id'],
3110 'uploader': data['display_name'],
3111 'upload_date': upload_date,
3112 'title': data['title'],
3113 'stitle': _simplify_title(data['title']),
3115 'format': data['media']['mimeType'],
3116 'thumbnail': data['thumbnailUrl'],
3117 'description': data['description'],
3118 'player_url': data['embedUrl']
3120 except (ValueError,KeyError), err:
3121 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3124 self._downloader.increment_downloads()
3127 self._downloader.process_info(info)
3128 except UnavailableVideoError, err:
3129 self._downloader.trouble(u'\nERROR: unable to download video')
3132 class MyVideoIE(InfoExtractor):
3133 """Information Extractor for myvideo.de."""
3135 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3136 IE_NAME = u'myvideo'
3138 def __init__(self, downloader=None):
3139 InfoExtractor.__init__(self, downloader)
3141 def report_download_webpage(self, video_id):
3142 """Report webpage download."""
3143 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3145 def report_extraction(self, video_id):
3146 """Report information extraction."""
3147 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3149 def _real_extract(self,url):
3150 mobj = re.match(self._VALID_URL, url)
3152 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3155 video_id = mobj.group(1)
3158 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3160 self.report_download_webpage(video_id)
3161 webpage = urllib2.urlopen(request).read()
3162 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3163 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3166 self.report_extraction(video_id)
3167 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3170 self._downloader.trouble(u'ERROR: unable to extract media URL')
3172 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3174 mobj = re.search('<title>([^<]+)</title>', webpage)
3176 self._downloader.trouble(u'ERROR: unable to extract title')
3179 video_title = mobj.group(1)
3180 video_title = sanitize_title(video_title)
3182 simple_title = _simplify_title(video_title)
3185 self._downloader.process_info({
3189 'upload_date': u'NA',
3190 'title': video_title,
3191 'stitle': simple_title,
3196 except UnavailableVideoError:
3197 self._downloader.trouble(u'\nERROR: Unable to download video')
3199 class ComedyCentralIE(InfoExtractor):
3200 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped; several guard/`try:` lines are missing from this view.
# Accepts shortcut forms ':tds', ':thedailyshow', ':cr', ':colbert', etc.,
# as well as full-episode URLs on thedailyshow.com / colbertnation.com.
3202 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3203 IE_NAME = u'comedycentral'
3205 def report_extraction(self, episode_id):
3206 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3208 def report_config_download(self, episode_id):
3209 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3211 def report_index_download(self, episode_id):
3212 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3214 def report_player_url(self, episode_id):
3215 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3217 def _real_extract(self, url):
3218 mobj = re.match(self._VALID_URL, url)
3220 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortcut names are rewritten to the show's full-episodes landing page,
# which then redirects to the newest episode.
3223 if mobj.group('shortname'):
3224 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3225 url = u'http://www.thedailyshow.com/full-episodes/'
3227 url = u'http://www.colbertnation.com/full-episodes/'
3228 mobj = re.match(self._VALID_URL, url)
3229 assert mobj is not None
3231 dlNewest = not mobj.group('episode')
3233 epTitle = mobj.group('showname')
3235 epTitle = mobj.group('episode')
3237 req = urllib2.Request(url)
3238 self.report_extraction(epTitle)
3240 htmlHandle = urllib2.urlopen(req)
3241 html = htmlHandle.read()
3242 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3243 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# After following redirects, re-match so the final URL names a concrete episode.
3246 url = htmlHandle.geturl()
3247 mobj = re.match(self._VALID_URL, url)
3249 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3251 if mobj.group('episode') == '':
3252 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3254 epTitle = mobj.group('episode')
# The Flash player URL embedded in the page carries the mtvnservices media URI.
3256 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3257 if len(mMovieParams) == 0:
3258 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3261 playerUrl_raw = mMovieParams[0][0]
3262 self.report_player_url(epTitle)
3264 urlHandle = urllib2.urlopen(playerUrl_raw)
3265 playerUrl = urlHandle.geturl()
3266 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3267 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# The MRSS feed lists one <item> per media segment of the episode.
3270 uri = mMovieParams[0][1]
3271 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3272 self.report_index_download(epTitle)
3274 indexXml = urllib2.urlopen(indexUrl).read()
3275 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3276 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3279 idoc = xml.etree.ElementTree.fromstring(indexXml)
3280 itemEls = idoc.findall('.//item')
3281 for itemEl in itemEls:
3282 mediaId = itemEl.findall('./guid')[0].text
3283 shortMediaId = mediaId.split(':')[-1]
3284 showId = mediaId.split(':')[-2].replace('.com', '')
3285 officialTitle = itemEl.findall('./title')[0].text
3286 officialDate = itemEl.findall('./pubDate')[0].text
# A second (mediaGen) XML request yields the per-bitrate rendition URLs.
3288 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3289 urllib.urlencode({'uri': mediaId}))
3290 configReq = urllib2.Request(configUrl)
3291 self.report_config_download(epTitle)
3293 configXml = urllib2.urlopen(configReq).read()
3294 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3295 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3298 cdoc = xml.etree.ElementTree.fromstring(configXml)
3300 for rendition in cdoc.findall('.//rendition'):
3301 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3305 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3308 # For now, just pick the highest bitrate
3309 format,video_url = turls[-1]
3311 self._downloader.increment_downloads()
3313 effTitle = showId + u'-' + epTitle
3318 'upload_date': officialDate,
3320 'stitle': _simplify_title(effTitle),
3324 'description': officialTitle,
3325 'player_url': playerUrl
3329 self._downloader.process_info(info)
3330 except UnavailableVideoError, err:
3331 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3335 class EscapistIE(InfoExtractor):
3336 """Information extractor for The Escapist """
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped; `try:`/guard lines are missing from this view.
3338 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3339 IE_NAME = u'escapist'
3341 def report_extraction(self, showName):
3342 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3344 def report_config_download(self, showName):
3345 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3347 def _real_extract(self, url):
3348 htmlParser = HTMLParser.HTMLParser()
3350 mobj = re.match(self._VALID_URL, url)
3352 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3354 showName = mobj.group('showname')
3355 videoId = mobj.group('episode')
3357 self.report_extraction(showName)
3359 webPage = urllib2.urlopen(url).read()
3360 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3361 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Description, thumbnail and player URL come from <meta> tags; the player URL
# carries a 'config=' query parameter pointing at the JSON configuration.
3364 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3365 description = htmlParser.unescape(descMatch.group(1))
3366 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3367 imgUrl = htmlParser.unescape(imgMatch.group(1))
3368 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3369 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3370 configUrlMatch = re.search('config=(.*)$', playerUrl)
3371 configUrl = urllib2.unquote(configUrlMatch.group(1))
3373 self.report_config_download(showName)
3375 configJSON = urllib2.urlopen(configUrl).read()
3376 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3377 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3380 # Technically, it's JavaScript, not JSON
# Crude normalization: single quotes swapped for double quotes so json.loads
# accepts it. Breaks if the payload contains apostrophes inside strings.
3381 configJSON = configJSON.replace("'", '"')
3384 config = json.loads(configJSON)
3385 except (ValueError,), err:
3386 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The second playlist entry holds the actual video URL.
3389 playlist = config['playlist']
3390 videoUrl = playlist[1]['url']
3392 self._downloader.increment_downloads()
3396 'uploader': showName,
3397 'upload_date': None,
3399 'stitle': _simplify_title(showName),
3402 'thumbnail': imgUrl,
3403 'description': description,
3404 'player_url': playerUrl,
3408 self._downloader.process_info(info)
3409 except UnavailableVideoError, err:
3410 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3413 class CollegeHumorIE(InfoExtractor):
3414 """Information extractor for collegehumor.com"""
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped; guard/`try:` lines are missing from this view.
3416 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3417 IE_NAME = u'collegehumor'
3419 def report_webpage(self, video_id):
3420 """Report information extraction."""
3421 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3423 def report_extraction(self, video_id):
3424 """Report information extraction."""
3425 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3427 def _real_extract(self, url):
3428 htmlParser = HTMLParser.HTMLParser()
3430 mobj = re.match(self._VALID_URL, url)
3432 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3434 video_id = mobj.group('videoid')
3436 self.report_webpage(video_id)
3437 request = urllib2.Request(url)
3439 webpage = urllib2.urlopen(request).read()
3440 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3441 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an internal id ('video:NNN') distinct from the URL's id;
# the moogaloop XML endpoint is keyed on that internal id.
3444 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3446 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3448 internal_video_id = m.group('internalvideoid')
3452 'internal_id': internal_video_id,
3455 self.report_extraction(video_id)
3456 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3458 metaXml = urllib2.urlopen(xmlUrl).read()
3459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3460 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Fill in title/description/url/thumbnail from the <video> node of the XML.
3463 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3465 videoNode = mdoc.findall('./video')[0]
3466 info['description'] = videoNode.findall('./description')[0].text
3467 info['title'] = videoNode.findall('./caption')[0].text
3468 info['stitle'] = _simplify_title(info['title'])
3469 info['url'] = videoNode.findall('./file')[0].text
3470 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is whatever follows the last '.' of the media URL.
3471 info['ext'] = info['url'].rpartition('.')[2]
3472 info['format'] = info['ext']
3474 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3477 self._downloader.increment_downloads()
3480 self._downloader.process_info(info)
3481 except UnavailableVideoError, err:
3482 self._downloader.trouble(u'\nERROR: unable to download video')
3485 class XVideosIE(InfoExtractor):
3486 """Information extractor for xvideos.com"""
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped; guard/`try:` lines are missing from this view.
3488 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3489 IE_NAME = u'xvideos'
3491 def report_webpage(self, video_id):
3492 """Report information extraction."""
3493 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3495 def report_extraction(self, video_id):
3496 """Report information extraction."""
3497 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3499 def _real_extract(self, url):
3500 htmlParser = HTMLParser.HTMLParser()
3502 mobj = re.match(self._VALID_URL, url)
3504 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3506 video_id = mobj.group(1).decode('utf-8')
3508 self.report_webpage(video_id)
3510 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3512 webpage = urllib2.urlopen(request).read()
3513 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3514 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3517 self.report_extraction(video_id)
# The FLV URL is passed percent-encoded in the 'flv_url' query parameter.
3521 mobj = re.search(r'flv_url=(.+?)&', webpage)
3523 self._downloader.trouble(u'ERROR: unable to extract video url')
3525 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is taken from <title>, dropping the trailing ' - XVID...' suffix.
3529 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3531 self._downloader.trouble(u'ERROR: unable to extract video title')
3533 video_title = mobj.group(1).decode('utf-8')
3536 # Extract video thumbnail
3537 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3539 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3541 video_thumbnail = mobj.group(1).decode('utf-8')
3545 self._downloader.increment_downloads()
3550 'upload_date': None,
3551 'title': video_title,
3552 'stitle': _simplify_title(video_title),
3555 'thumbnail': video_thumbnail,
3556 'description': None,
3561 self._downloader.process_info(info)
3562 except UnavailableVideoError, err:
3563 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3566 class SoundcloudIE(InfoExtractor):
3567 """Information extractor for soundcloud.com
3568 To access the media, the uid of the song and a stream token
3569 must be extracted from the page source and the script must make
3570 a request to media.soundcloud.com/crossdomain.xml. Then
3571 the media can be grabbed by requesting from an url composed
3572 of the stream token and uid
3575 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3576 IE_NAME = u'soundcloud'
3578 def __init__(self, downloader=None):
3579 InfoExtractor.__init__(self, downloader)
3581 def report_webpage(self, video_id):
3582 """Report information extraction."""
3583 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3585 def report_extraction(self, video_id):
3586 """Report information extraction."""
3587 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3589 def _real_extract(self, url):
3590 htmlParser = HTMLParser.HTMLParser()
3592 mobj = re.match(self._VALID_URL, url)
3594 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3597 # extract uploader (which is in the url)
3598 uploader = mobj.group(1).decode('utf-8')
3599 # extract simple title (uploader + slug of song title)
3600 slug_title = mobj.group(2).decode('utf-8')
3601 simple_title = uploader + '-' + slug_title
3603 self.report_webpage('%s/%s' % (uploader, slug_title))
3605 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3607 webpage = urllib2.urlopen(request).read()
3608 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3609 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3612 self.report_extraction('%s/%s' % (uploader, slug_title))
3614 # extract uid and stream token that soundcloud hands out for access
3615 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3617 video_id = mobj.group(1)
3618 stream_token = mobj.group(2)
3620 # extract unsimplified title
3621 mobj = re.search('"title":"(.*?)",', webpage)
3623 title = mobj.group(1)
3625 # construct media url (with uid/token)
3626 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3627 mediaURL = mediaURL % (video_id, stream_token)
3630 description = u'No description available'
3631 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3633 description = mobj.group(1)
3637 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3640 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3641 except Exception, e:
3644 # for soundcloud, a request to a cross domain is required for cookies
3645 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3648 self._downloader.process_info({
3649 'id': video_id.decode('utf-8'),
3651 'uploader': uploader.decode('utf-8'),
3652 'upload_date': upload_date,
3653 'title': simple_title.decode('utf-8'),
3654 'stitle': simple_title.decode('utf-8'),
3658 'description': description.decode('utf-8')
3660 except UnavailableVideoError:
3661 self._downloader.trouble(u'\nERROR: unable to download video')
3664 class InfoQIE(InfoExtractor):
3665 """Information extractor for infoq.com"""
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped; guard/`try:` lines are missing from this view.
3667 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3670 def report_webpage(self, video_id):
3671 """Report information extraction."""
3672 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3674 def report_extraction(self, video_id):
3675 """Report information extraction."""
3676 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3678 def _real_extract(self, url):
3679 htmlParser = HTMLParser.HTMLParser()
3681 mobj = re.match(self._VALID_URL, url)
3683 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3686 self.report_webpage(url)
3688 request = urllib2.Request(url)
3690 webpage = urllib2.urlopen(request).read()
3691 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3692 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3695 self.report_extraction(url)
# The RTMP path is base64-encoded in the page's jsclassref attribute.
3699 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3701 self._downloader.trouble(u'ERROR: unable to extract video url')
3703 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3707 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3709 self._downloader.trouble(u'ERROR: unable to extract video title')
3711 video_title = mobj.group(1).decode('utf-8')
3713 # Extract description
3714 video_description = u'No description available.'
3715 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3716 if mobj is not None:
3717 video_description = mobj.group(1).decode('utf-8')
# id and extension are derived from the last path component of the RTMP URL.
3719 video_filename = video_url.split('/')[-1]
3720 video_id, extension = video_filename.split('.')
3722 self._downloader.increment_downloads()
3727 'upload_date': None,
3728 'title': video_title,
3729 'stitle': _simplify_title(video_title),
3731 'format': extension, # Extension is always(?) mp4, but seems to be flv
3733 'description': video_description,
3738 self._downloader.process_info(info)
3739 except UnavailableVideoError, err:
3740 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3742 class MixcloudIE(InfoExtractor):
3743 """Information extractor for www.mixcloud.com"""
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped; guard/`try:`/`return` lines are missing from this view.
3744 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3745 IE_NAME = u'mixcloud'
3747 def __init__(self, downloader=None):
3748 InfoExtractor.__init__(self, downloader)
3750 def report_download_json(self, file_id):
3751 """Report JSON download."""
3752 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3754 def report_extraction(self, file_id):
3755 """Report information extraction."""
3756 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3758 def get_urls(self, jsonData, fmt, bitrate='best'):
3759 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a {bitrate: [urls]} dict or a plain url list;
# the TypeError branch handles the no-bitrate-info case.
3762 bitrate_list = jsonData[fmt]
3763 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3764 bitrate = max(bitrate_list) # select highest
3766 url_list = jsonData[fmt][bitrate]
3767 except TypeError: # we have no bitrate info.
3768 url_list = jsonData[fmt]
3771 def check_urls(self, url_list):
3772 """Returns 1st active url from list"""
# Probes each candidate with a GET; network failures fall through to the next.
3773 for url in url_list:
3775 urllib2.urlopen(url)
3777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3782 def _print_formats(self, formats):
3783 print 'Available formats:'
3784 for fmt in formats.keys():
3785 for b in formats[fmt]:
3787 ext = formats[fmt][b][0]
3788 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3789 except TypeError: # we have no bitrate info
3790 ext = formats[fmt][0]
3791 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3794 def _real_extract(self, url):
3795 mobj = re.match(self._VALID_URL, url)
3797 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3799 # extract uploader & filename from url
3800 uploader = mobj.group(1).decode('utf-8')
3801 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3803 # construct API request
# API path reuses the last two URL segments: /api/1/cloudcast/<user>/<slug>.json
3804 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3805 # retrieve .json file with links to files
3806 request = urllib2.Request(file_url)
3808 self.report_download_json(file_url)
3809 jsonData = urllib2.urlopen(request).read()
3810 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3811 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3815 json_data = json.loads(jsonData)
3816 player_url = json_data['player_swf_url']
3817 formats = dict(json_data['audio_formats'])
3819 req_format = self._downloader.params.get('format', None)
3822 if self._downloader.params.get('listformats', None):
3823 self._print_formats(formats)
# Default/'best': first format whose candidate URLs include a live one.
3826 if req_format is None or req_format == 'best':
3827 for format_param in formats.keys():
3828 url_list = self.get_urls(formats, format_param)
3830 file_url = self.check_urls(url_list)
3831 if file_url is not None:
3834 if req_format not in formats.keys():
3835 self._downloader.trouble(u'ERROR: format is not available')
3838 url_list = self.get_urls(formats, req_format)
3839 file_url = self.check_urls(url_list)
3840 format_param = req_format
3843 self._downloader.increment_downloads()
3845 # Process file information
3846 self._downloader.process_info({
3847 'id': file_id.decode('utf-8'),
3848 'url': file_url.decode('utf-8'),
3849 'uploader': uploader.decode('utf-8'),
3850 'upload_date': u'NA',
3851 'title': json_data['name'],
3852 'stitle': _simplify_title(json_data['name']),
3853 'ext': file_url.split('.')[-1].decode('utf-8'),
3854 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3855 'thumbnail': json_data['thumbnail_url'],
3856 'description': json_data['description'],
3857 'player_url': player_url.decode('utf-8'),
3859 except UnavailableVideoError, err:
3860 self._downloader.trouble(u'ERROR: unable to download file')
3862 class StanfordOpenClassroomIE(InfoExtractor):
3863 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped. Three cases: a specific video, a course page (recurses into
# its VideoPages), or the root page (recurses into all CoursePages).
3865 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3866 IE_NAME = u'stanfordoc'
3868 def report_download_webpage(self, objid):
3869 """Report information extraction."""
3870 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3872 def report_extraction(self, video_id):
3873 """Report information extraction."""
3874 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3876 def _real_extract(self, url):
3877 mobj = re.match(self._VALID_URL, url)
3879 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3882 if mobj.group('course') and mobj.group('video'): # A specific video
3883 course = mobj.group('course')
3884 video = mobj.group('video')
3886 'id': _simplify_title(course + '_' + video),
3889 self.report_extraction(info['id'])
# Video metadata lives in a per-video XML file next to the course's videos.
3890 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3891 xmlUrl = baseUrl + video + '.xml'
3893 metaXml = urllib2.urlopen(xmlUrl).read()
3894 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3895 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3897 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3899 info['title'] = mdoc.findall('./title')[0].text
3900 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3902 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3904 info['stitle'] = _simplify_title(info['title'])
3905 info['ext'] = info['url'].rpartition('.')[2]
3906 info['format'] = info['ext']
3907 self._downloader.increment_downloads()
3909 self._downloader.process_info(info)
3910 except UnavailableVideoError, err:
3911 self._downloader.trouble(u'\nERROR: unable to download video')
3912 elif mobj.group('course'): # A course page
3913 unescapeHTML = HTMLParser.HTMLParser().unescape
3915 course = mobj.group('course')
3917 'id': _simplify_title(course),
3921 self.report_download_webpage(info['id'])
3923 coursepage = urllib2.urlopen(url).read()
3924 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3925 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3928 m = re.search('<h1>([^<]+)</h1>', coursepage)
3930 info['title'] = unescapeHTML(m.group(1))
3932 info['title'] = info['id']
3933 info['stitle'] = _simplify_title(info['title'])
3935 m = re.search('<description>([^<]+)</description>', coursepage)
3937 info['description'] = unescapeHTML(m.group(1))
# Each linked VideoPage becomes a 'reference' entry, extracted recursively below.
3939 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3942 'type': 'reference',
3943 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3947 for entry in info['list']:
3948 assert entry['type'] == 'reference'
3949 self.extract(entry['url'])
3951 unescapeHTML = HTMLParser.HTMLParser().unescape
3954 'id': 'Stanford OpenClassroom',
3958 self.report_download_webpage(info['id'])
3959 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3961 rootpage = urllib2.urlopen(rootURL).read()
3962 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3963 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3966 info['title'] = info['id']
3967 info['stitle'] = _simplify_title(info['title'])
# Root page: fan out over every CoursePage link, again via recursive extract().
3969 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3972 'type': 'reference',
3973 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3977 for entry in info['list']:
3978 assert entry['type'] == 'reference'
3979 self.extract(entry['url'])
3981 class MTVIE(InfoExtractor):
3982 """Information extractor for MTV.com"""
3984 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3987 def report_webpage(self, video_id):
3988 """Report information extraction."""
3989 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3991 def report_extraction(self, video_id):
3992 """Report information extraction."""
3993 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3995 def _real_extract(self, url):
3996 mobj = re.match(self._VALID_URL, url)
3998 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4000 if not mobj.group('proto'):
4001 url = 'http://' + url
4002 video_id = mobj.group('videoid')
4003 self.report_webpage(video_id)
4005 request = urllib2.Request(url)
4007 webpage = urllib2.urlopen(request).read()
4008 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4009 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4012 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4014 self._downloader.trouble(u'ERROR: unable to extract song name')
4016 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4017 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4019 self._downloader.trouble(u'ERROR: unable to extract performer')
4021 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4022 video_title = performer + ' - ' + song_name
4024 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4026 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4028 mtvn_uri = mobj.group(1)
4030 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4032 self._downloader.trouble(u'ERROR: unable to extract content id')
4034 content_id = mobj.group(1)
4036 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4037 self.report_extraction(video_id)
4038 request = urllib2.Request(videogen_url)
4040 metadataXml = urllib2.urlopen(request).read()
4041 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4042 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4045 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4046 renditions = mdoc.findall('.//rendition')
4048 # For now, always pick the highest quality.
4049 rendition = renditions[-1]
4052 _,_,ext = rendition.attrib['type'].partition('/')
4053 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4054 video_url = rendition.find('./src').text
4056 self._downloader.trouble('Invalid rendition field.')
4059 self._downloader.increment_downloads()
4063 'uploader': performer,
4064 'title': video_title,
4065 'stitle': _simplify_title(video_title),
4071 self._downloader.process_info(info)
4072 except UnavailableVideoError, err:
4073 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Base class for all post-processing steps.

    A PostProcessor is registered with a downloader through its
    add_post_processor() method (a "mutual registration" scheme similar
    to the one InfoExtractor objects use).  After every successful
    download the downloader walks its chain of PostProcessors, calling
    run() on each: the first receives the initial info dictionary and
    every later one receives whatever the previous run() returned.
    The chain stops when a run() returns None or the end is reached.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the downloader this post-processor reports to."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        ``information`` is an InfoExtractor-style dictionary carrying
        one extra key, "filepath", which points at the downloaded file.
        Return a (possibly modified) dictionary to pass along the
        chain, or None to stop postprocessing.  Implementations may
        also raise a PostProcessingError, which the downloader handles.

        The base implementation is a no-op that keeps the chain going.
        """
        return information
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe fails while extracting audio.

    Fix: the original subclassed BaseException, which Python reserves
    for exit-style exceptions (SystemExit, KeyboardInterrupt); ordinary
    errors should derive from Exception so generic handlers can see
    them.  The base initializer is now called too, so str(err) carries
    the message.  The ``message`` attribute is preserved because
    callers (e.g. FFmpegExtractAudioPP.run) read it directly.
    """
    def __init__(self, message):
        Exception.__init__(self, message)  # makes str(err) meaningful
        self.message = message
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that converts a downloaded video into an
    audio-only file by shelling out to ffmpeg (and ffprobe for codec
    detection).

    NOTE(review): a number of source lines are elided in this view
    (marked inline below); every code token shown is untouched.
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'  # 'best' = keep the source codec when possible
        self._preferredcodec = preferredcodec
        self._preferredquality = preferredquality  # forwarded to ffmpeg as -ab (bitrate)
        self._keepvideo = keepvideo  # when True, the source video is not deleted

    # ...[decorator line elided -- presumably @staticmethod, since `path` is the only parameter; confirm]...
    def get_audio_codec(path):
        # Probe `path` with ffprobe and report its audio codec name;
        # the elided branches presumably return None on failure.
        # ...[`try:` header elided]...
        cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
        handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
        output = handle.communicate()[0]
        if handle.wait() != 0:
            # ...[failure return elided]...
        except (IOError, OSError):
            # ...[failure return elided]...
        # Scan ffprobe's key=value output; a codec_name= line precedes
        # the codec_type= line of the same stream.
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                # ...[return of audio_codec, final fallthrough return, and next decorator elided]...

    def run_ffmpeg(path, out_path, codec, more_opts):
        # Transcode `path` into `out_path` with the given codec and
        # extra ffmpeg options; raises AudioConversionError on failure.
        # ...[branch for `codec is None` elided before this line]...
        acodec_opts = ['-acodec', codec]
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
        # ...[`try:` header elided]...
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout,stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            if isinstance(e, OSError) and e.errno == 2:  # ENOENT: the ffmpeg binary is missing
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
            # ...[re-raise of other errors elided]...
        if p.returncode != 0:
            # Surface ffmpeg's last stderr line as the error message.
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        # Convert information['filepath'] to the preferred audio format,
        # fix up its mtime, optionally delete the source video, and
        # point 'filepath' at the new audio file.
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            # ...[early-return elided]...

        # ...[initialization of more_opts elided]...
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                # ...[codec assignment elided -- presumably a stream copy; confirm]...
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']  # repack raw ADTS AAC into MP4
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                # ...[codec assignment elided]...
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    # ...[extension override elided]...
                # ...[else-branch header elided: non-copyable source, re-encode to MP3]...
                acodec = 'libmp3lame'
                # ...[extension / more_opts assignments elided]...
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        # ...[else-branch header elided]...
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            # ...[more_opts initialization elided]...
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
                # ...[extension override elided]...
            if self._preferredcodec == 'wav':
                # ...[extension override elided]...
                more_opts += ['-f', 'wav']

        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
        # ...[`try:` header elided]...
        self.run_ffmpeg(path, new_path, acodec, more_opts)
        # ...[`except:` header elided]...
        etype,e,tb = sys.exc_info()
        if isinstance(e, AudioConversionError):
            self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
        # ...[else-branch header elided]...
            self._downloader.to_stderr(u'ERROR: error running ffmpeg')
        # ...[early-return elided]...

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            # ...[`try:` header elided]...
            os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
            # ...[`except:` header elided]...
            self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            # ...[`try:` header elided]...
            os.remove(_encodeFilename(path))
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

        information['filepath'] = new_path
        # ...[return statement elided]...
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository '''
    # Note: downloader only used for options
    # NOTE(review): try/except scaffolding, an early return, and the
    # close() calls are elided in this view; code tokens are untouched.
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen(u'Updating to latest version...')

    # ...[`try:` header elided]...
    urlh = urllib.urlopen(UPDATE_URL)
    newcontent = urlh.read()
    # Compare the embedded version string to avoid a pointless rewrite.
    vmatch = re.search("__version__ = '([^']+)'", newcontent)
    if vmatch is not None and vmatch.group(1) == __version__:
        downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
        # ...[return elided]...
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to download latest version')

    # ...[`try:` header elided]...
    outf = open(filename, 'wb')
    # ...[`try:` header elided]...
    outf.write(newcontent)
    # ...[close/cleanup lines elided]...
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4286 def _readOptions(filename_bytes):
4288 optionf = open(filename_bytes)
4290 return [] # silently skip if file is not present
4294 res += shlex.split(l, comments=True)
4299 def _format_option_string(option):
4300 ''' ('-o', '--option') -> -o, --format METAVAR'''
4304 if option._short_opts: opts.append(option._short_opts[0])
4305 if option._long_opts: opts.append(option._long_opts[0])
4306 if len(opts) > 1: opts.insert(1, ', ')
4308 if option.takes_value(): opts.append(' %s' % option.metavar)
4310 return "".join(opts)
4312 def _find_term_columns():
4313 columns = os.environ.get('COLUMNS', None)
4318 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4319 out,err = sp.communicate()
4320 return int(out.split()[1])
# NOTE(review): the enclosing `def parseOpts():` header is elided from
# this view, as are a handful of interior lines (marked below); every
# code token shown is untouched.  This builds the optparse command-line
# interface and returns (parser, opts, args) after merging config-file
# options with sys.argv.
max_help_position = 80

# No need to wrap help messages if we're on a wide console
columns = _find_term_columns()
if columns: max_width = columns  # NOTE(review): max_width's default is assigned on an elided line

fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
fmt.format_option_strings = _format_option_string  # compact "-o, --option METAVAR" rendering

# Keyword arguments for the OptionParser; the dict's opening/closing
# braces (and presumably a 'formatter' entry) are on elided lines.
    'version' : __version__,
    'usage' : '%prog [options] url [url...]',
    'conflict_handler' : 'resolve',  # lets -v be redefined below (--version first, --verbose later)
parser = optparse.OptionParser(**kw)

# Option groups keep --help output organized by topic.
general = optparse.OptionGroup(parser, 'General Options')
selection = optparse.OptionGroup(parser, 'Video Selection')
authentication = optparse.OptionGroup(parser, 'Authentication Options')
video_format = optparse.OptionGroup(parser, 'Video Format Options')
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

general.add_option('-h', '--help',
        action='help', help='print this help text and exit')
general.add_option('-v', '--version',
        action='version', help='print program version and exit')
general.add_option('-U', '--update',
        action='store_true', dest='update_self', help='update this program to latest version')
general.add_option('-i', '--ignore-errors',
        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
general.add_option('-r', '--rate-limit',
        dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
general.add_option('-R', '--retries',
        dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
general.add_option('--dump-user-agent',
        action='store_true', dest='dump_user_agent',
        help='display the current browser identification', default=False)
general.add_option('--list-extractors',
        action='store_true', dest='list_extractors',
        help='List all supported extractors and the URLs they would handle', default=False)

selection.add_option('--playlist-start',
        dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
selection.add_option('--playlist-end',
        dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

authentication.add_option('-u', '--username',
        dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
        dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

video_format.add_option('-f', '--format',
        action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
        action='store_const', dest='format', help='download all available video formats', const='all')
video_format.add_option('--prefer-free-formats',
        action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
video_format.add_option('--max-quality',
        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
video_format.add_option('-F', '--list-formats',
        action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
video_format.add_option('--write-srt',
        action='store_true', dest='writesubtitles',
        help='write video closed captions to a .srt file (currently youtube only)', default=False)
video_format.add_option('--srt-lang',
        action='store', dest='subtitleslang', metavar='LANG',
        help='language of the closed captions to download (optional) use IETF language tags like \'en\'')

# NOTE(review): -v is intentionally re-bound from --version to --verbose
# here; the parser's conflict_handler='resolve' makes the later binding win.
verbosity.add_option('-q', '--quiet',
        action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
        action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
verbosity.add_option('--skip-download',
        action='store_true', dest='skip_download', help='do not download the video', default=False)
verbosity.add_option('-g', '--get-url',
        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
        action='store_true', dest='getthumbnail',
        help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
        action='store_true', dest='getdescription',
        help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
        action='store_true', dest='getfilename',
        help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--get-format',
        action='store_true', dest='getformat',
        help='simulate, quiet but print output format', default=False)
verbosity.add_option('--no-progress',
        action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
        action='store_true', dest='consoletitle',
        help='display progress in console titlebar', default=False)
verbosity.add_option('-v', '--verbose',
        action='store_true', dest='verbose', help='print various debugging information', default=False)

filesystem.add_option('-t', '--title',
        action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
        action='store_true', dest='autonumber',
        help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
        dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
filesystem.add_option('-a', '--batch-file',
        dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
        action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
        action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
filesystem.add_option('--no-continue',
        action='store_false', dest='continue_dl',
        help='do not resume partially downloaded files (restart from beginning)')
filesystem.add_option('--cookies',
        dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
filesystem.add_option('--no-part',
        action='store_true', dest='nopart', help='do not use .part files', default=False)
filesystem.add_option('--no-mtime',
        action='store_false', dest='updatetime',
        help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
        action='store_true', dest='writedescription',
        help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
        action='store_true', dest='writeinfojson',
        help='write video metadata to a .info.json file', default=False)

postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
        help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
        help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
        help='ffmpeg audio bitrate specification, 128k by default')
postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
        help='keeps the video file on disk after the post-processing; the video is erased by default')

parser.add_option_group(general)
parser.add_option_group(selection)
parser.add_option_group(filesystem)
parser.add_option_group(verbosity)
parser.add_option_group(video_format)
parser.add_option_group(authentication)
parser.add_option_group(postproc)

# Config-file options come first so command-line flags override them.
# NOTE(review): the if/else lines selecting between the two userConf
# assignments (XDG dir vs. ~/.config fallback) are elided here.
xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
    userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
    userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
opts, args = parser.parse_args(argv)

return parser, opts, args
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the `return [` line, most list entries, and the
    # closing `]` are elided in this view; code tokens are untouched.
    # The three IEs built up front are shared by dependent extractors.
    youtube_ie = YoutubeIE()
    google_ie = GoogleIE()
    yahoo_ie = YahooIE()
    # ...[`return [` elided]...
        YoutubePlaylistIE(youtube_ie),  # playlist/user/search delegate single videos to youtube_ie
        YoutubeUserIE(youtube_ie),
        YoutubeSearchIE(youtube_ie),
        MetacafeIE(youtube_ie),  # Metacafe also hosts YouTube-backed videos
        GoogleSearchIE(google_ie),
        YahooSearchIE(yahoo_ie),
        StanfordOpenClassroomIE(),
    # ...[remaining extractor entries and closing `]` elided]...
# NOTE(review): the enclosing `def _real_main():` header is elided from
# this view, as are many interior lines (marked below).  Near the end of
# this span the header of the separate `def main():` wrapper is elided
# too; the final except-clauses belong to main()'s call of _real_main().
# Every code token shown is untouched.
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
    jar = cookielib.CookieJar()  # in-memory only; nothing persisted
# ...[else/try lines elided]...
    jar = cookielib.MozillaCookieJar(opts.cookiefile)
    if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
        # ...[jar.load() elided]...
    except (IOError, OSError), err:
        sys.exit(u'ERROR: unable to open cookie file')

# Dump user agent
if opts.dump_user_agent:
    print std_headers['User-Agent']
    # ...[exit elided]...

# Batch file verification
# ...[batchurls default initialization elided]...
if opts.batchfile is not None:
    # ...[`try:` header elided]...
    if opts.batchfile == '-':
        # ...[stdin branch elided]...
        batchfd = open(opts.batchfile, 'r')
        batchurls = batchfd.readlines()
        batchurls = [x.strip() for x in batchurls]
        # Drop blank lines and lines starting with '#', '/' or ';' (comments).
        batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
    # ...[except clause header elided]...
        sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
proxy_handler = urllib2.ProxyHandler()  # picks up *_proxy environment variables
opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
urllib2.install_opener(opener)
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

# ...[verbose-mode guard elided]...
print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

extractors = gen_extractors()

if opts.list_extractors:
    for ie in extractors:
        # ...[IE name print elided]...
        matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
        all_urls = filter(lambda url: url not in matchedUrls, all_urls)
        for mu in matchedUrls:
            # ...[matched-URL print and final exit elided]...

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
    parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
    parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
    parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
    parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
    opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
    numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
    if numeric_limit is None:
        parser.error(u'invalid rate limit specified')
    opts.ratelimit = numeric_limit
if opts.retries is not None:
    # ...[`try:` header elided]...
    opts.retries = long(opts.retries)
    except (TypeError, ValueError), err:
        parser.error(u'invalid retry count specified')
# ...[`try:` header elided]...
opts.playliststart = int(opts.playliststart)
if opts.playliststart <= 0:
    raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError), err:
    parser.error(u'invalid playlist start number specified')
# ...[`try:` header elided]...
opts.playlistend = int(opts.playlistend)
if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
    raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError), err:
    parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
    if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
        parser.error(u'invalid audio format specified')

# File downloader: maps parsed CLI flags to FileDownloader parameters.
# The get* flags force quiet/skip_download since they only print info.
# (The dict's closing brace line is elided below.)
fd = FileDownloader({
    'usenetrc': opts.usenetrc,
    'username': opts.username,
    'password': opts.password,
    'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
    'forceurl': opts.geturl,
    'forcetitle': opts.gettitle,
    'forcethumbnail': opts.getthumbnail,
    'forcedescription': opts.getdescription,
    'forcefilename': opts.getfilename,
    'forceformat': opts.getformat,
    'simulate': opts.simulate,
    'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
    'format': opts.format,
    'format_limit': opts.format_limit,
    'listformats': opts.listformats,
    # First matching template wins; falls back to plain '%(id)s.%(ext)s'.
    'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
        or u'%(id)s.%(ext)s'),
    'ignoreerrors': opts.ignoreerrors,
    'ratelimit': opts.ratelimit,
    'nooverwrites': opts.nooverwrites,
    'retries': opts.retries,
    'continuedl': opts.continue_dl,
    'noprogress': opts.noprogress,
    'playliststart': opts.playliststart,
    'playlistend': opts.playlistend,
    'logtostderr': opts.outtmpl == '-',  # stdout is the video stream when outputting to '-'
    'consoletitle': opts.consoletitle,
    'nopart': opts.nopart,
    'updatetime': opts.updatetime,
    'writedescription': opts.writedescription,
    'writeinfojson': opts.writeinfojson,
    'writesubtitles': opts.writesubtitles,
    'subtitleslang': opts.subtitleslang,
    'matchtitle': opts.matchtitle,
    'rejecttitle': opts.rejecttitle,
    'max_downloads': opts.max_downloads,
    'prefer_free_formats': opts.prefer_free_formats,
    'verbose': opts.verbose,
for extractor in extractors:
    fd.add_info_extractor(extractor)

# PostProcessors
if opts.extractaudio:
    fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

# Update version
if opts.update_self:
    updateSelf(fd, sys.argv[0])

# Maybe do nothing
if len(all_urls) < 1:
    if not opts.update_self:
        parser.error(u'you must provide at least one URL')
    # ...[clean-exit branch and download `try:` elided]...
retcode = fd.download(all_urls)
except MaxDownloadsReached:
    fd.to_screen(u'--max-download limit reached, aborting.')
    # ...[retcode assignment elided]...

# Dump cookie jar if requested
if opts.cookiefile is not None:
    # ...[try/jar.save() elided]...
    except (IOError, OSError), err:
        sys.exit(u'ERROR: unable to save cookie jar')

# ...[final sys.exit(retcode), the `def main():` header, and its `try:`
# around _real_main() are elided; the clauses below are main()'s]...
except DownloadError:
    # ...[non-zero exit elided]...
except SameFileError:
    sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
    sys.exit(u'\nERROR: Interrupted by user')
if __name__ == '__main__':
    # ...[body elided -- presumably invokes the main() wrapper; confirm]...

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: