removed the undocumented HTMLParser.unescape, replaced with _unescapeHTML; fixed...
[youtube-dl] / youtube_dl / __init__.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# People who have contributed code to this script.
__authors__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        'Kevin Ngo',
        'Ori Avtalion',
        'shizeeg',
        'Filippo Valsorda',
        )

__license__ = 'Public Domain'
__version__ = '2012.02.27'

# Location the self-update mechanism downloads the latest release from.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
25
26
27 import cookielib
28 import datetime
29 import getpass
30 import gzip
31 import htmlentitydefs
32 import HTMLParser
33 import httplib
34 import locale
35 import math
36 import netrc
37 import optparse
38 import os
39 import os.path
40 import re
41 import shlex
42 import socket
43 import string
44 import subprocess
45 import sys
46 import time
47 import urllib
48 import urllib2
49 import warnings
50 import zlib
51
52 if os.name == 'nt':
53         import ctypes
54
55 try:
56         import email.utils
57 except ImportError: # Python 2.4
58         import email.Utils
59 try:
60         import cStringIO as StringIO
61 except ImportError:
62         import StringIO
63
64 # parse_qs was moved from the cgi module to the urlparse module recently.
65 try:
66         from urlparse import parse_qs
67 except ImportError:
68         from cgi import parse_qs
69
70 try:
71         import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default headers attached to every HTTP request (see YoutubeDLHandler):
# they imitate a desktop Firefox so servers return ordinary browser pages.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
82
# json entered the standard library in Python 2.6; on older interpreters
# fall back to trivialjson, a minimal pure-Python parser that exposes the
# same json.loads() entry point used elsewhere in this file.
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                @staticmethod
                def loads(s):
                        """Parse the JSON document in the UTF-8 byte string s and return its value."""
                        s = s.decode('UTF-8')
                        # Raise ValueError annotated with the offending input position.
                        def raiseError(msg, i):
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        # Advance i past whitespace; with expectMore, fail at end of input.
                        def skipSpace(i, expectMore=True):
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        # re.sub callback: translate one backslash escape sequence.
                        def decodeEscape(match):
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # \uXXXX\uXXXX UTF-16 surrogate pair:
                                                # combine into a single code point.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        # Each parseX helper takes an index and returns (next_index, value).
                        def parseString(i):
                                i += 1
                                e = i
                                # Find the closing quote, skipping any quote preceded
                                # by an odd number of backslashes (i.e. an escaped one).
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        # The three literal values: true / false / null.
                        def parseDiscrete(i):
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; numbers are the fallback.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
195
196
class IDParser(HTMLParser.HTMLParser):
        """Modified HTMLParser that isolates a tag with the specified id.

        After loads(), get_result() returns the text between the opening tag
        carrying the wanted id attribute and its matching closing tag.
        """
        def __init__(self, id):
                # id: the value of the id attribute to search for.
                self.id = id
                # Accumulates [tag_name, start_pos, end_pos] as parsing proceeds.
                self.result = None
                # True while we are inside the wanted tag.
                self.started = False
                # Per-tag open counters, used to find the matching close tag.
                self.depth = {}
                self.html = None
                # Set right after the wanted tag opens; the very next parser
                # event then records where the tag's content begins.
                self.watch_startpos = False
                HTMLParser.HTMLParser.__init__(self)

        def loads(self, html):
                """Feed the whole document to the parser."""
                self.html = html
                self.feed(html)
                self.close()

        def handle_starttag(self, tag, attrs):
                attrs = dict(attrs)
                if self.started:
                        # Any event after the opening tag pins the content start.
                        self.find_startpos(None)
                if 'id' in attrs and attrs['id'] == self.id:
                        self.result = [tag]
                        self.started = True
                        self.watch_startpos = True
                if self.started:
                        if not tag in self.depth: self.depth[tag] = 0
                        self.depth[tag] += 1

        def handle_endtag(self, tag):
                if self.started:
                        if tag in self.depth: self.depth[tag] -= 1
                        # When the tag we matched on is balanced again, we are done.
                        if self.depth[self.result[0]] == 0:
                                self.started = False
                                self.result.append(self.getpos())

        def find_startpos(self, x):
                """Needed to put the start position of the result (self.result[1])
                after the opening tag with the requested id"""
                if self.watch_startpos:
                        self.watch_startpos = False
                        self.result.append(self.getpos())
        # Every kind of parser event may mark the start of the tag's content.
        handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

        def get_result(self):
                """Return the matched tag's content, or None if no complete match."""
                if self.result == None: return None
                if len(self.result) != 3: return None
                # getpos() coordinates are (1-based line, 0-based column).
                lines = self.html.split('\n')
                lines = lines[self.result[1][0]-1:self.result[2][0]]
                lines[0] = lines[0][self.result[1][1]:]
                if len(lines) == 1:
                        # Start and end on the same line: shift the end column by
                        # the number of columns just removed from the front.
                        lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
                # NOTE(review): for the single-line case this second slice is a
                # no-op (end column >= end-start), so behavior is unchanged.
                lines[-1] = lines[-1][:self.result[2][1]]
                return '\n'.join(lines).strip()
251
def get_element_by_id(id, html):
        """Return the content of the tag with the specified id in the passed HTML document"""
        # Delegate the actual work to the single-purpose IDParser above.
        finder = IDParser(id)
        finder.loads(html)
        return finder.get_result()
257
258
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        try:
                pref = locale.getpreferredencoding()
                # Sanity-check that the reported name is actually usable by the
                # codec machinery before trusting it.
                u'TEST'.encode(pref)
        except Exception:
                # Broken or unset locale: fall back to a safe default.
                # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
                pref = 'UTF-8'
        return pref
274
275
276 def htmlentity_transform(matchobj):
277         """Transforms an HTML entity to a Unicode character.
278
279         This function receives a match object and is intended to be used with
280         the re.sub() function.
281         """
282         entity = matchobj.group(1)
283
284         # Known non-numeric HTML entity
285         if entity in htmlentitydefs.name2codepoint:
286                 return unichr(htmlentitydefs.name2codepoint[entity])
287
288         # Unicode character
289         mobj = re.match(ur'(?u)#(x?\d+)', entity)
290         if mobj is not None:
291                 numstr = mobj.group(1)
292                 if numstr.startswith(u'x'):
293                         base = 16
294                         numstr = u'0%s' % numstr
295                 else:
296                         base = 10
297                 return unichr(long(numstr, base))
298
299         # Unknown entity in name, return its literal representation
300         return (u'&%s;' % entity)
301
302
def clean_html(html):
        """Clean an HTML snippet into a readable string"""
        # Flatten the source's own newlines, then turn <br> tags into newlines.
        flattened = html.replace('\n', ' ')
        with_breaks = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', flattened)
        # Drop every remaining tag, then decode HTML entities.
        text_only = re.sub('<.*?>', '', with_breaks)
        return _unescapeHTML(text_only)
313
314
def sanitize_title(utitle):
        """Sanitizes a video title so it could be used as part of a filename."""
        decoded = _unescapeHTML(utitle)
        # A path separator cannot appear inside a single path component.
        return decoded.replace(unicode(os.sep), u'%')
319
320
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        # '-' means "write to standard output".
                        if sys.platform == 'win32':
                                import msvcrt
                                # Keep Windows from translating \n to \r\n in binary data.
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                stream = open(_encodeFilename(filename), open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(_encodeFilename(filename), open_mode)
                return (stream, filename)
346
347
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        timetuple = email.utils.parsedate_tz(timestr)
        if timetuple is None:
                # Unparseable date string: signal with None, as before.
                return None
        return email.utils.mktime_tz(timetuple)
355
def _simplify_title(title):
        # Collapse every run of characters outside [word, digit, '_', '-']
        # into one '_', then trim any separators left at the edges.
        expr = re.compile(u'[^\\w\\d_\\-]+', re.UNICODE)
        return expr.sub(u'_', title).strip(u'_')
359
def _orderedSet(iterable):
        """ Remove all duplicates from the input iterable """
        # Linear membership test on the result list keeps support for
        # unhashable elements while preserving first-seen order.
        unique = []
        for item in iterable:
                if item in unique:
                        continue
                unique.append(item)
        return unique
367
368 def _unescapeHTML(s):
369         """
370         @param s a string (of type unicode)
371         """
372         assert type(s) == type(u'')
373
374         result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
375         return result
376
def _encodeFilename(s):
        """
        @param s The name of the file (of type unicode)
        """

        assert type(s) == type(u'')

        # Note the short-circuit: getwindowsversion() only exists on Windows,
        # so it must not be evaluated on other platforms.
        if sys.platform != 'win32' or sys.getwindowsversion().major < 5:
                # Encode for the filesystem, dropping unrepresentable characters.
                return s.encode(sys.getfilesystemencoding(), 'ignore')
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
391
class DownloadError(Exception):
        """Download Error exception.

        This exception may be thrown by FileDownloader objects if they are not
        configured to continue on errors. They will contain the appropriate
        error message.
        """
        # Raised by FileDownloader.trouble() unless 'ignoreerrors' is set.
        pass
400
401
class SameFileError(Exception):
        """Same File exception.

        This exception will be thrown by FileDownloader objects if they detect
        multiple files would have to be downloaded to the same file on disk.
        """
        # Carries no extra state; the exception type itself is the signal.
        pass
409
410
class PostProcessingError(Exception):
        """Post Processing exception.

        This exception may be raised by PostProcessor's .run() method to
        indicate an error in the postprocessing task.
        """
        # Carries no extra state; the exception type itself is the signal.
        pass
418
class MaxDownloadsReached(Exception):
        """ --max-downloads limit has been reached.

        Used as a control-flow signal to stop processing further videos.
        """
        pass
422
423
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        This exception will be thrown when a video is requested
        in a format that is not available for that video.
        """
        # Carries no extra state; the exception type itself is the signal.
        pass
431
432
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.
        """
        # Both in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # downloaded: number of bytes actually received.
                # expected: number of bytes the server announced.
                self.downloaded = downloaded
                self.expected = expected
447
448
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Try raw deflate first (some servers send it without the zlib
                # header); fall back to a standard zlib-wrapped stream.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Build an addinfourl that carries the HTTP status code; older
                # Python versions take no code argument, so set it afterwards.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force our standard headers, replacing any urllib2 defaults.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # Honour the internal opt-out marker: drop it together with the
                # Accept-encoding header so the server sends uncompressed data.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                # Transparently decompress the body while preserving the original
                # headers, URL, status code and message.
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
506
507
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:         Username for authentication purposes.
        password:         Password for authentication purposes.
        usenetrc:         Use netrc for authentication instead.
        quiet:            Do not print messages to stdout.
        forceurl:         Force printing final URL.
        forcetitle:       Force printing title.
        forcethumbnail:   Force printing thumbnail URL.
        forcedescription: Force printing description.
        forcefilename:    Force printing final filename.
        simulate:         Do not download the video files.
        format:           Video format code.
        format_limit:     Highest quality format to try.
        outtmpl:          Template for output names.
        ignoreerrors:     Do not stop on download errors.
        ratelimit:        Download speed limit, in bytes/sec.
        nooverwrites:     Prevent overwriting files.
        retries:          Number of times to retry for HTTP error 5xx
        continuedl:       Try to continue downloads if possible.
        noprogress:       Do not print the progress bar.
        playliststart:    Playlist item to start at.
        playlistend:      Playlist item to end at.
        matchtitle:       Download only matching titles.
        rejecttitle:      Reject downloads for matching titles.
        logtostderr:      Log messages to stderr instead of stdout.
        consoletitle:     Display progress in console window's titlebar.
        nopart:           Do not use temporary .part files.
        updatetime:       Use the Last-modified header to set output file timestamps.
        writedescription: Write the video description to a .description file
        writeinfojson:    Write the video description to a .info.json file
        writesubtitles:   Write the video subtitles to a .srt file
        subtitleslang:    Language of the subtitles to download
        """

        # Options dictionary (see docstring above); set in __init__.
        params = None
        # Registered InfoExtractor instances, tried in order.
        _ies = []
        # Registered PostProcessor chain.
        _pps = []
        # Process return code: 0 until a download error is recorded.
        _download_retcode = None
        # Number of downloads performed by this instance.
        _num_downloads = None
        # Stream used by to_screen(): stdout, or stderr with 'logtostderr'.
        _screen_file = None
574
575         def __init__(self, params):
576                 """Create a FileDownloader object with the given options."""
577                 self._ies = []
578                 self._pps = []
579                 self._download_retcode = 0
580                 self._num_downloads = 0
581                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
582                 self.params = params
583
584         @staticmethod
585         def format_bytes(bytes):
586                 if bytes is None:
587                         return 'N/A'
588                 if type(bytes) is str:
589                         bytes = float(bytes)
590                 if bytes == 0.0:
591                         exponent = 0
592                 else:
593                         exponent = long(math.log(bytes, 1024.0))
594                 suffix = 'bkMGTPEZY'[exponent]
595                 converted = float(bytes) / float(1024 ** exponent)
596                 return '%.2f%s' % (converted, suffix)
597
598         @staticmethod
599         def calc_percent(byte_counter, data_len):
600                 if data_len is None:
601                         return '---.-%'
602                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
603
604         @staticmethod
605         def calc_eta(start, now, total, current):
606                 if total is None:
607                         return '--:--'
608                 dif = now - start
609                 if current == 0 or dif < 0.001: # One millisecond
610                         return '--:--'
611                 rate = float(current) / dif
612                 eta = long((float(total) - float(current)) / rate)
613                 (eta_mins, eta_secs) = divmod(eta, 60)
614                 if eta_mins > 99:
615                         return '--:--'
616                 return '%02d:%02d' % (eta_mins, eta_secs)
617
618         @staticmethod
619         def calc_speed(start, now, bytes):
620                 dif = now - start
621                 if bytes == 0 or dif < 0.001: # One millisecond
622                         return '%10s' % '---b/s'
623                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
624
625         @staticmethod
626         def best_block_size(elapsed_time, bytes):
627                 new_min = max(bytes / 2.0, 1.0)
628                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
629                 if elapsed_time < 0.001:
630                         return long(new_max)
631                 rate = bytes / elapsed_time
632                 if rate > new_max:
633                         return long(new_max)
634                 if rate < new_min:
635                         return long(new_min)
636                 return long(rate)
637
638         @staticmethod
639         def parse_bytes(bytestr):
640                 """Parse a string indicating a byte quantity into a long integer."""
641                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
642                 if matchobj is None:
643                         return None
644                 number = float(matchobj.group(1))
645                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
646                 return long(round(number * multiplier))
647
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                # Mutual registration: the extractor keeps a reference back to us.
                ie.set_downloader(self)
652
        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                # Mutual registration: the post-processor keeps a reference back to us.
                pp.set_downloader(self)
657
        def to_screen(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode.

                message must be a unicode string; it is encoded with the
                preferred encoding before being written to a byte stream.
                """
                assert type(message) == type(u'')
                if not self.params.get('quiet', False):
                        # skip_eol selects the terminator: newline by default.
                        terminator = [u'\n', u''][skip_eol]
                        output = message + terminator

                        if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                                output = output.encode(preferredencoding(), 'ignore')
                        self._screen_file.write(output)
                        self._screen_file.flush()
669
        def to_stderr(self, message):
                """Print message to stderr."""
                # Encode explicitly: stderr may not accept unicode directly.
                print >>sys.stderr, message.encode(preferredencoding())
673
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-compatible escape sequence to set the window title.
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
684
685         def fixed_template(self):
686                 """Checks if the output template is fixed."""
687                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
688
        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                # Only reached with 'ignoreerrors': remember something failed.
                self._download_retcode = 1
701
702         def slow_down(self, start_time, byte_counter):
703                 """Sleep if the download speed is over the rate limit."""
704                 rate_limit = self.params.get('ratelimit', None)
705                 if rate_limit is None or byte_counter == 0:
706                         return
707                 now = time.time()
708                 elapsed = now - start_time
709                 if elapsed <= 0.0:
710                         return
711                 speed = float(byte_counter) / elapsed
712                 if speed > rate_limit:
713                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
714
715         def temp_name(self, filename):
716                 """Returns a temporary filename for the given filename."""
717                 if self.params.get('nopart', False) or filename == u'-' or \
718                                 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
719                         return filename
720                 return filename + u'.part'
721
722         def undo_temp_name(self, filename):
723                 if filename.endswith(u'.part'):
724                         return filename[:-len(u'.part')]
725                 return filename
726
727         def try_rename(self, old_filename, new_filename):
728                 try:
729                         if old_filename == new_filename:
730                                 return
731                         os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
732                 except (IOError, OSError), err:
733                         self.trouble(u'ERROR: unable to rename file')
734
735         def try_utime(self, filename, last_modified_hdr):
736                 """Try to set the last-modified time of the given file."""
737                 if last_modified_hdr is None:
738                         return
739                 if not os.path.isfile(_encodeFilename(filename)):
740                         return
741                 timestr = last_modified_hdr
742                 if timestr is None:
743                         return
744                 filetime = timeconvert(timestr)
745                 if filetime is None:
746                         return filetime
747                 try:
748                         os.utime(filename, (time.time(), filetime))
749                 except:
750                         pass
751                 return filetime
752
753         def report_writedescription(self, descfn):
754                 """ Report that the description file is being written """
755                 self.to_screen(u'[info] Writing video description to: ' + descfn)
756
757         def report_writesubtitles(self, srtfn):
758                 """ Report that the subtitles file is being written """
759                 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
760
761         def report_writeinfojson(self, infofn):
762                 """ Report that the metadata file has been written """
763                 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
764
765         def report_destination(self, filename):
766                 """Report destination filename."""
767                 self.to_screen(u'[download] Destination: ' + filename)
768
769         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
770                 """Report download progress."""
771                 if self.params.get('noprogress', False):
772                         return
773                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
774                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
775                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
776                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
777
778         def report_resuming_byte(self, resume_len):
779                 """Report attempt to resume at given byte."""
780                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
781
782         def report_retry(self, count, retries):
783                 """Report retry in case of HTTP error 5xx"""
784                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
785
786         def report_file_already_downloaded(self, file_name):
787                 """Report file has already been fully downloaded."""
788                 try:
789                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
790                 except (UnicodeEncodeError), err:
791                         self.to_screen(u'[download] The file has already been downloaded')
792
793         def report_unable_to_resume(self):
794                 """Report it was impossible to resume download."""
795                 self.to_screen(u'[download] Unable to resume')
796
797         def report_finish(self):
798                 """Report download finished."""
799                 if self.params.get('noprogress', False):
800                         self.to_screen(u'[download] Download completed')
801                 else:
802                         self.to_screen(u'')
803
804         def increment_downloads(self):
805                 """Increment the ordinal that assigns a number to each file."""
806                 self._num_downloads += 1
807
808         def prepare_filename(self, info_dict):
809                 """Generate the output filename."""
810                 try:
811                         template_dict = dict(info_dict)
812                         template_dict['epoch'] = unicode(long(time.time()))
813                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
814                         filename = self.params['outtmpl'] % template_dict
815                         return filename
816                 except (ValueError, KeyError), err:
817                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
818                         return None
819
820         def _match_entry(self, info_dict):
821                 """ Returns None iff the file should be downloaded """
822
823                 title = info_dict['title']
824                 matchtitle = self.params.get('matchtitle', False)
825                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
826                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
827                 rejecttitle = self.params.get('rejecttitle', False)
828                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
829                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
830                 return None
831
832         def process_info(self, info_dict):
833                 """Process a single dictionary returned by an InfoExtractor."""
834
835                 reason = self._match_entry(info_dict)
836                 if reason is not None:
837                         self.to_screen(u'[download] ' + reason)
838                         return
839
840                 max_downloads = self.params.get('max_downloads')
841                 if max_downloads is not None:
842                         if self._num_downloads > int(max_downloads):
843                                 raise MaxDownloadsReached()
844
845                 filename = self.prepare_filename(info_dict)
846                 
847                 # Forced printings
848                 if self.params.get('forcetitle', False):
849                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
850                 if self.params.get('forceurl', False):
851                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
852                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
853                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
854                 if self.params.get('forcedescription', False) and 'description' in info_dict:
855                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
856                 if self.params.get('forcefilename', False) and filename is not None:
857                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
858                 if self.params.get('forceformat', False):
859                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
860
861                 # Do nothing else if in simulate mode
862                 if self.params.get('simulate', False):
863                         return
864
865                 if filename is None:
866                         return
867
868                 try:
869                         dn = os.path.dirname(_encodeFilename(filename))
870                         if dn != '' and not os.path.exists(dn): # dn is already encoded
871                                 os.makedirs(dn)
872                 except (OSError, IOError), err:
873                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
874                         return
875
876                 if self.params.get('writedescription', False):
877                         try:
878                                 descfn = filename + u'.description'
879                                 self.report_writedescription(descfn)
880                                 descfile = open(_encodeFilename(descfn), 'wb')
881                                 try:
882                                         descfile.write(info_dict['description'].encode('utf-8'))
883                                 finally:
884                                         descfile.close()
885                         except (OSError, IOError):
886                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
887                                 return
888                                 
889                 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
890                         # subtitles download errors are already managed as troubles in relevant IE
891                         # that way it will silently go on when used with unsupporting IE 
892                         try:
893                                 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
894                                 self.report_writesubtitles(srtfn)
895                                 srtfile = open(_encodeFilename(srtfn), 'wb')
896                                 try:
897                                         srtfile.write(info_dict['subtitles'].encode('utf-8'))
898                                 finally:
899                                         srtfile.close()
900                         except (OSError, IOError):
901                                 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
902                                 return
903
904                 if self.params.get('writeinfojson', False):
905                         infofn = filename + u'.info.json'
906                         self.report_writeinfojson(infofn)
907                         try:
908                                 json.dump
909                         except (NameError,AttributeError):
910                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
911                                 return
912                         try:
913                                 infof = open(_encodeFilename(infofn), 'wb')
914                                 try:
915                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
916                                         json.dump(json_info_dict, infof)
917                                 finally:
918                                         infof.close()
919                         except (OSError, IOError):
920                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
921                                 return
922
923                 if not self.params.get('skip_download', False):
924                         if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
925                                 success = True
926                         else:
927                                 try:
928                                         success = self._do_download(filename, info_dict)
929                                 except (OSError, IOError), err:
930                                         raise UnavailableVideoError
931                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
932                                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
933                                         return
934                                 except (ContentTooShortError, ), err:
935                                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
936                                         return
937         
938                         if success:
939                                 try:
940                                         self.post_process(filename, info_dict)
941                                 except (PostProcessingError), err:
942                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
943                                         return
944
945         def download(self, url_list):
946                 """Download a given list of URLs."""
947                 if len(url_list) > 1 and self.fixed_template():
948                         raise SameFileError(self.params['outtmpl'])
949
950                 for url in url_list:
951                         suitable_found = False
952                         for ie in self._ies:
953                                 # Go to next InfoExtractor if not suitable
954                                 if not ie.suitable(url):
955                                         continue
956
957                                 # Suitable InfoExtractor found
958                                 suitable_found = True
959
960                                 # Extract information from URL and process it
961                                 ie.extract(url)
962
963                                 # Suitable InfoExtractor had been found; go to next URL
964                                 break
965
966                         if not suitable_found:
967                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
968
969                 return self._download_retcode
970
971         def post_process(self, filename, ie_info):
972                 """Run the postprocessing chain on the given file."""
973                 info = dict(ie_info)
974                 info['filepath'] = filename
975                 for pp in self._pps:
976                         info = pp.run(info)
977                         if info is None:
978                                 break
979
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the external
		rtmpdump tool, re-invoking it with resume flags while the
		partial file keeps growing.

		Returns True on success, False on failure (after reporting the
		problem through trouble()).
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# ([[], [extra]][cond] selects the extra arguments only when
		# the boolean condition holds -- True indexes element 1.)
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
		if self.params.get('verbose', False):
			try:
				import pipes
				shell_quote = lambda args: ' '.join(map(pipes.quote, args))
			except ImportError:
				shell_quote = repr
			self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
		retval = subprocess.call(args)
		# Exit codes 1 and 2 signal an incomplete download; retry with
		# the resume ('-e') flag until the file size stops changing.
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(_encodeFilename(tmpfilename))
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(_encodeFilename(tmpfilename))
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
1024
	def _do_download(self, filename, info_dict):
		"""Download info_dict['url'] to filename over HTTP (or delegate
		to rtmpdump for rtmp* URLs).

		Handles resuming a .part file, HTTP 5xx retries, rate limiting
		and progress reporting. Returns True on success, False after a
		reported failure; raises ContentTooShortError when fewer bytes
		than announced were received.
		"""
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept range-free as a fallback for HTTP 416
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(_encodeFilename(tmpfilename)):
			resume_len = os.path.getsize(_encodeFilename(tmpfilename))
		else:
			resume_len = 0

		open_mode = 'wb'
		if resume_len != 0:
			if self.params.get('continuedl', False):
				self.report_resuming_byte(resume_len)
				request.add_header('Range','bytes=%d-' % resume_len)
				open_mode = 'ab'
			else:
				# Not resuming: overwrite the partial file from scratch
				resume_len = 0

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				if count == 0 and 'urlhandle' in info_dict:
					data = info_dict['urlhandle']
				# NOTE(review): the next line unconditionally overwrites
				# `data`, so the cached urlhandle above is never actually
				# used -- looks like a missing `else`; confirm intent
				# before changing.
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry (only HTTP 5xx falls through to here)
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# Content-length covers only the remaining range; add the
			# already-downloaded part to get the total size
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					# sanitize_open may alter the name (e.g. invalid chars)
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					assert stream is not None
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the measured throughput
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
			else:
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		if stream is None:
			self.trouble(u'\nERROR: Did not get any data blocks')
			return False
		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

		return True
1170
1171
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor (IE) takes a URL and extracts information
	about the video (or videos) it refers to: the real video URL, the
	title and simplified title, the uploader and so on. The result is a
	dictionary handed to the FileDownloader, which acts on it (e.g. by
	downloading the video to the file system). Mandatory fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only by the forced-printing options (they let
	youtube-dl serve as the backend for a video search function, such
	as the one in youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should redefine _real_initialize() and _real_extract()
	and define a _VALID_URL regexp; they should probably also be added
	to the list of extractors.
	"""

	# Whether _real_initialize() has run for this instance
	_ready = False
	# The FileDownloader this IE reports to (set via set_downloader)
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc) at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1240
1241
1242 class YoutubeIE(InfoExtractor):
1243         """Information extractor for youtube.com."""
1244
1245         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1246         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1247         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1248         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1249         _NETRC_MACHINE = 'youtube'
1250         # Listed in order of quality
1251         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1252         _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1253         _video_extensions = {
1254                 '13': '3gp',
1255                 '17': 'mp4',
1256                 '18': 'mp4',
1257                 '22': 'mp4',
1258                 '37': 'mp4',
1259                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1260                 '43': 'webm',
1261                 '44': 'webm',
1262                 '45': 'webm',
1263         }
1264         _video_dimensions = {
1265                 '5': '240x400',
1266                 '6': '???',
1267                 '13': '???',
1268                 '17': '144x176',
1269                 '18': '360x640',
1270                 '22': '720x1280',
1271                 '34': '360x640',
1272                 '35': '480x854',
1273                 '37': '1080x1920',
1274                 '38': '3072x4096',
1275                 '43': '360x640',
1276                 '44': '480x854',
1277                 '45': '720x1280',
1278         }       
1279         IE_NAME = u'youtube'
1280
1281         def report_lang(self):
1282                 """Report attempt to set language."""
1283                 self._downloader.to_screen(u'[youtube] Setting language')
1284
1285         def report_login(self):
1286                 """Report attempt to log in."""
1287                 self._downloader.to_screen(u'[youtube] Logging in')
1288
1289         def report_age_confirmation(self):
1290                 """Report attempt to confirm age."""
1291                 self._downloader.to_screen(u'[youtube] Confirming age')
1292
1293         def report_video_webpage_download(self, video_id):
1294                 """Report attempt to download video webpage."""
1295                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1296
1297         def report_video_info_webpage_download(self, video_id):
1298                 """Report attempt to download video info webpage."""
1299                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1300
1301         def report_video_subtitles_download(self, video_id):
1302                 """Report attempt to download video info webpage."""
1303                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1304
1305         def report_information_extraction(self, video_id):
1306                 """Report attempt to extract video information."""
1307                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1308
1309         def report_unavailable_format(self, video_id, format):
1310                 """Report extracted video URL."""
1311                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1312
1313         def report_rtmp_download(self):
1314                 """Indicate the download will use the RTMP protocol."""
1315                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1316
1317         def _closed_captions_xml_to_srt(self, xml_string):
1318                 srt = ''
1319                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1320                 # TODO parse xml instead of regex
1321                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1322                         if not dur: dur = '4'
1323                         start = float(start)
1324                         end = start + float(dur)
1325                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1326                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1327                         caption = _unescapeHTML(caption)
1328                         caption = _unescapeHTML(caption) # double cycle, inentional
1329                         srt += str(n) + '\n'
1330                         srt += start + ' --> ' + end + '\n'
1331                         srt += caption + '\n\n'
1332                 return srt
1333
1334         def _print_formats(self, formats):
1335                 print 'Available formats:'
1336                 for x in formats:
1337                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1338
1339         def _real_initialize(self):
1340                 if self._downloader is None:
1341                         return
1342
1343                 username = None
1344                 password = None
1345                 downloader_params = self._downloader.params
1346
1347                 # Attempt to use provided username and password or .netrc data
1348                 if downloader_params.get('username', None) is not None:
1349                         username = downloader_params['username']
1350                         password = downloader_params['password']
1351                 elif downloader_params.get('usenetrc', False):
1352                         try:
1353                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1354                                 if info is not None:
1355                                         username = info[0]
1356                                         password = info[2]
1357                                 else:
1358                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1359                         except (IOError, netrc.NetrcParseError), err:
1360                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1361                                 return
1362
1363                 # Set language
1364                 request = urllib2.Request(self._LANG_URL)
1365                 try:
1366                         self.report_lang()
1367                         urllib2.urlopen(request).read()
1368                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1369                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1370                         return
1371
1372                 # No authentication to be performed
1373                 if username is None:
1374                         return
1375
1376                 # Log in
1377                 login_form = {
1378                                 'current_form': 'loginForm',
1379                                 'next':         '/',
1380                                 'action_login': 'Log In',
1381                                 'username':     username,
1382                                 'password':     password,
1383                                 }
1384                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1385                 try:
1386                         self.report_login()
1387                         login_results = urllib2.urlopen(request).read()
1388                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1389                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1390                                 return
1391                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1392                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1393                         return
1394
1395                 # Confirm age
1396                 age_form = {
1397                                 'next_url':             '/',
1398                                 'action_confirm':       'Confirm',
1399                                 }
1400                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1401                 try:
1402                         self.report_age_confirmation()
1403                         age_results = urllib2.urlopen(request).read()
1404                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1405                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1406                         return
1407
1408         def _real_extract(self, url):
1409                 # Extract video id from URL
1410                 mobj = re.match(self._VALID_URL, url)
1411                 if mobj is None:
1412                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1413                         return
1414                 video_id = mobj.group(2)
1415
1416                 # Get video webpage
1417                 self.report_video_webpage_download(video_id)
1418                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1419                 try:
1420                         video_webpage = urllib2.urlopen(request).read()
1421                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1422                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1423                         return
1424
1425                 # Attempt to extract SWF player URL
1426                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1427                 if mobj is not None:
1428                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1429                 else:
1430                         player_url = None
1431
1432                 # Get video info
1433                 self.report_video_info_webpage_download(video_id)
1434                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1435                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1436                                         % (video_id, el_type))
1437                         request = urllib2.Request(video_info_url)
1438                         try:
1439                                 video_info_webpage = urllib2.urlopen(request).read()
1440                                 video_info = parse_qs(video_info_webpage)
1441                                 if 'token' in video_info:
1442                                         break
1443                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1444                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1445                                 return
1446                 if 'token' not in video_info:
1447                         if 'reason' in video_info:
1448                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1449                         else:
1450                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1451                         return
1452
1453                 # Start extracting information
1454                 self.report_information_extraction(video_id)
1455
1456                 # uploader
1457                 if 'author' not in video_info:
1458                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1459                         return
1460                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1461
1462                 # title
1463                 if 'title' not in video_info:
1464                         self._downloader.trouble(u'ERROR: unable to extract video title')
1465                         return
1466                 video_title = urllib.unquote_plus(video_info['title'][0])
1467                 video_title = video_title.decode('utf-8')
1468                 video_title = sanitize_title(video_title)
1469
1470                 # simplified title
1471                 simple_title = _simplify_title(video_title)
1472
1473                 # thumbnail image
1474                 if 'thumbnail_url' not in video_info:
1475                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1476                         video_thumbnail = ''
1477                 else:   # don't panic if we can't find it
1478                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1479
1480                 # upload date
1481                 upload_date = u'NA'
1482                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1483                 if mobj is not None:
1484                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1485                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1486                         for expression in format_expressions:
1487                                 try:
1488                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1489                                 except:
1490                                         pass
1491
1492                 # description
1493                 video_description = get_element_by_id("eow-description", video_webpage)
1494                 if video_description: video_description = clean_html(video_description.decode('utf8'))
1495                 else: video_description = ''
1496                         
1497                 # closed captions
1498                 video_subtitles = None
1499                 if self._downloader.params.get('writesubtitles', False):
1500                         self.report_video_subtitles_download(video_id)
1501                         request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1502                         try:
1503                                 srt_list = urllib2.urlopen(request).read()
1504                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1505                                 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1506                         else:
1507                                 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1508                                 if srt_lang_list:
1509                                         if self._downloader.params.get('subtitleslang', False):
1510                                                 srt_lang = self._downloader.params.get('subtitleslang')
1511                                         elif 'en' in srt_lang_list:
1512                                                 srt_lang = 'en'
1513                                         else:
1514                                                 srt_lang = srt_lang_list[0]
1515                                         if not srt_lang in srt_lang_list:
1516                                                 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1517                                         else:
1518                                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1519                                                 try:
1520                                                         srt_xml = urllib2.urlopen(request).read()
1521                                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1522                                                         self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1523                                                 else:
1524                                                         video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1525                                 else:
1526                                         self._downloader.trouble(u'WARNING: video has no closed captions')
1527
1528                 # token
1529                 video_token = urllib.unquote_plus(video_info['token'][0])
1530
1531                 # Decide which formats to download
1532                 req_format = self._downloader.params.get('format', None)
1533
1534                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1535                         self.report_rtmp_download()
1536                         video_url_list = [(None, video_info['conn'][0])]
1537                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1538                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1539                         url_data = [parse_qs(uds) for uds in url_data_strs]
1540                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1541                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1542
1543                         format_limit = self._downloader.params.get('format_limit', None)
1544                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1545                         if format_limit is not None and format_limit in available_formats:
1546                                 format_list = available_formats[available_formats.index(format_limit):]
1547                         else:
1548                                 format_list = available_formats
1549                         existing_formats = [x for x in format_list if x in url_map]
1550                         if len(existing_formats) == 0:
1551                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1552                                 return
1553                         if self._downloader.params.get('listformats', None):
1554                                 self._print_formats(existing_formats)
1555                                 return
1556                         if req_format is None or req_format == 'best':
1557                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1558                         elif req_format == 'worst':
1559                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1560                         elif req_format in ('-1', 'all'):
1561                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1562                         else:
1563                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1564                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1565                                 req_formats = req_format.split('/')
1566                                 video_url_list = None
1567                                 for rf in req_formats:
1568                                         if rf in url_map:
1569                                                 video_url_list = [(rf, url_map[rf])]
1570                                                 break
1571                                 if video_url_list is None:
1572                                         self._downloader.trouble(u'ERROR: requested format not available')
1573                                         return
1574                 else:
1575                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1576                         return
1577
1578                 for format_param, video_real_url in video_url_list:
1579                         # At this point we have a new video
1580                         self._downloader.increment_downloads()
1581
1582                         # Extension
1583                         video_extension = self._video_extensions.get(format_param, 'flv')
1584
1585                         try:
1586                                 # Process video information
1587                                 self._downloader.process_info({
1588                                         'id':           video_id.decode('utf-8'),
1589                                         'url':          video_real_url.decode('utf-8'),
1590                                         'uploader':     video_uploader.decode('utf-8'),
1591                                         'upload_date':  upload_date,
1592                                         'title':        video_title,
1593                                         'stitle':       simple_title,
1594                                         'ext':          video_extension.decode('utf-8'),
1595                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1596                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1597                                         'description':  video_description,
1598                                         'player_url':   player_url,
1599                                         'subtitles':    video_subtitles
1600                                 })
1601                         except UnavailableVideoError, err:
1602                                 self._downloader.trouble(u'\nERROR: unable to download video')
1603
1604
1605 class MetacafeIE(InfoExtractor):
1606         """Information Extractor for metacafe.com."""
1607
1608         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1609         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1610         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1611         _youtube_ie = None
1612         IE_NAME = u'metacafe'
1613
1614         def __init__(self, youtube_ie, downloader=None):
1615                 InfoExtractor.__init__(self, downloader)
1616                 self._youtube_ie = youtube_ie
1617
1618         def report_disclaimer(self):
1619                 """Report disclaimer retrieval."""
1620                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1621
1622         def report_age_confirmation(self):
1623                 """Report attempt to confirm age."""
1624                 self._downloader.to_screen(u'[metacafe] Confirming age')
1625
1626         def report_download_webpage(self, video_id):
1627                 """Report webpage download."""
1628                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1629
1630         def report_extraction(self, video_id):
1631                 """Report information extraction."""
1632                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1633
1634         def _real_initialize(self):
1635                 # Retrieve disclaimer
1636                 request = urllib2.Request(self._DISCLAIMER)
1637                 try:
1638                         self.report_disclaimer()
1639                         disclaimer = urllib2.urlopen(request).read()
1640                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1641                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1642                         return
1643
1644                 # Confirm age
1645                 disclaimer_form = {
1646                         'filters': '0',
1647                         'submit': "Continue - I'm over 18",
1648                         }
1649                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1650                 try:
1651                         self.report_age_confirmation()
1652                         disclaimer = urllib2.urlopen(request).read()
1653                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1654                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1655                         return
1656
1657         def _real_extract(self, url):
1658                 # Extract id and simplified title from URL
1659                 mobj = re.match(self._VALID_URL, url)
1660                 if mobj is None:
1661                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1662                         return
1663
1664                 video_id = mobj.group(1)
1665
1666                 # Check if video comes from YouTube
1667                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1668                 if mobj2 is not None:
1669                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1670                         return
1671
1672                 # At this point we have a new video
1673                 self._downloader.increment_downloads()
1674
1675                 simple_title = mobj.group(2).decode('utf-8')
1676
1677                 # Retrieve video webpage to extract further information
1678                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1679                 try:
1680                         self.report_download_webpage(video_id)
1681                         webpage = urllib2.urlopen(request).read()
1682                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1683                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1684                         return
1685
1686                 # Extract URL, uploader and title from webpage
1687                 self.report_extraction(video_id)
1688                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1689                 if mobj is not None:
1690                         mediaURL = urllib.unquote(mobj.group(1))
1691                         video_extension = mediaURL[-3:]
1692
1693                         # Extract gdaKey if available
1694                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1695                         if mobj is None:
1696                                 video_url = mediaURL
1697                         else:
1698                                 gdaKey = mobj.group(1)
1699                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1700                 else:
1701                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1702                         if mobj is None:
1703                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1704                                 return
1705                         vardict = parse_qs(mobj.group(1))
1706                         if 'mediaData' not in vardict:
1707                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1708                                 return
1709                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1710                         if mobj is None:
1711                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1712                                 return
1713                         mediaURL = mobj.group(1).replace('\\/', '/')
1714                         video_extension = mediaURL[-3:]
1715                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1716
1717                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1718                 if mobj is None:
1719                         self._downloader.trouble(u'ERROR: unable to extract title')
1720                         return
1721                 video_title = mobj.group(1).decode('utf-8')
1722                 video_title = sanitize_title(video_title)
1723
1724                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1725                 if mobj is None:
1726                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1727                         return
1728                 video_uploader = mobj.group(1)
1729
1730                 try:
1731                         # Process video information
1732                         self._downloader.process_info({
1733                                 'id':           video_id.decode('utf-8'),
1734                                 'url':          video_url.decode('utf-8'),
1735                                 'uploader':     video_uploader.decode('utf-8'),
1736                                 'upload_date':  u'NA',
1737                                 'title':        video_title,
1738                                 'stitle':       simple_title,
1739                                 'ext':          video_extension.decode('utf-8'),
1740                                 'format':       u'NA',
1741                                 'player_url':   None,
1742                         })
1743                 except UnavailableVideoError:
1744                         self._downloader.trouble(u'\nERROR: unable to download video')
1745
1746
1747 class DailymotionIE(InfoExtractor):
1748         """Information Extractor for Dailymotion"""
1749
1750         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1751         IE_NAME = u'dailymotion'
1752
1753         def __init__(self, downloader=None):
1754                 InfoExtractor.__init__(self, downloader)
1755
1756         def report_download_webpage(self, video_id):
1757                 """Report webpage download."""
1758                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1759
1760         def report_extraction(self, video_id):
1761                 """Report information extraction."""
1762                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1763
1764         def _real_extract(self, url):
1765                 # Extract id and simplified title from URL
1766                 mobj = re.match(self._VALID_URL, url)
1767                 if mobj is None:
1768                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1769                         return
1770
1771                 # At this point we have a new video
1772                 self._downloader.increment_downloads()
1773                 video_id = mobj.group(1)
1774
1775                 video_extension = 'flv'
1776
1777                 # Retrieve video webpage to extract further information
1778                 request = urllib2.Request(url)
1779                 request.add_header('Cookie', 'family_filter=off')
1780                 try:
1781                         self.report_download_webpage(video_id)
1782                         webpage = urllib2.urlopen(request).read()
1783                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1784                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1785                         return
1786
1787                 # Extract URL, uploader and title from webpage
1788                 self.report_extraction(video_id)
1789                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1790                 if mobj is None:
1791                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1792                         return
1793                 sequence = urllib.unquote(mobj.group(1))
1794                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1795                 if mobj is None:
1796                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1797                         return
1798                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1799
1800                 # if needed add http://www.dailymotion.com/ if relative URL
1801
1802                 video_url = mediaURL
1803
1804                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1805                 if mobj is None:
1806                         self._downloader.trouble(u'ERROR: unable to extract title')
1807                         return
1808                 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1809                 video_title = sanitize_title(video_title)
1810                 simple_title = _simplify_title(video_title)
1811
1812                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1813                 if mobj is None:
1814                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1815                         return
1816                 video_uploader = mobj.group(1)
1817
1818                 try:
1819                         # Process video information
1820                         self._downloader.process_info({
1821                                 'id':           video_id.decode('utf-8'),
1822                                 'url':          video_url.decode('utf-8'),
1823                                 'uploader':     video_uploader.decode('utf-8'),
1824                                 'upload_date':  u'NA',
1825                                 'title':        video_title,
1826                                 'stitle':       simple_title,
1827                                 'ext':          video_extension.decode('utf-8'),
1828                                 'format':       u'NA',
1829                                 'player_url':   None,
1830                         })
1831                 except UnavailableVideoError:
1832                         self._downloader.trouble(u'\nERROR: unable to download video')
1833
1834
1835 class GoogleIE(InfoExtractor):
1836         """Information extractor for video.google.com."""
1837
1838         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1839         IE_NAME = u'video.google'
1840
1841         def __init__(self, downloader=None):
1842                 InfoExtractor.__init__(self, downloader)
1843
1844         def report_download_webpage(self, video_id):
1845                 """Report webpage download."""
1846                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1847
1848         def report_extraction(self, video_id):
1849                 """Report information extraction."""
1850                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1851
1852         def _real_extract(self, url):
1853                 # Extract id from URL
1854                 mobj = re.match(self._VALID_URL, url)
1855                 if mobj is None:
1856                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1857                         return
1858
1859                 # At this point we have a new video
1860                 self._downloader.increment_downloads()
1861                 video_id = mobj.group(1)
1862
1863                 video_extension = 'mp4'
1864
1865                 # Retrieve video webpage to extract further information
1866                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1867                 try:
1868                         self.report_download_webpage(video_id)
1869                         webpage = urllib2.urlopen(request).read()
1870                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1871                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1872                         return
1873
1874                 # Extract URL, uploader, and title from webpage
1875                 self.report_extraction(video_id)
1876                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1877                 if mobj is None:
1878                         video_extension = 'flv'
1879                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1880                 if mobj is None:
1881                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1882                         return
1883                 mediaURL = urllib.unquote(mobj.group(1))
1884                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1885                 mediaURL = mediaURL.replace('\\x26', '\x26')
1886
1887                 video_url = mediaURL
1888
1889                 mobj = re.search(r'<title>(.*)</title>', webpage)
1890                 if mobj is None:
1891                         self._downloader.trouble(u'ERROR: unable to extract title')
1892                         return
1893                 video_title = mobj.group(1).decode('utf-8')
1894                 video_title = sanitize_title(video_title)
1895                 simple_title = _simplify_title(video_title)
1896
1897                 # Extract video description
1898                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1899                 if mobj is None:
1900                         self._downloader.trouble(u'ERROR: unable to extract video description')
1901                         return
1902                 video_description = mobj.group(1).decode('utf-8')
1903                 if not video_description:
1904                         video_description = 'No description available.'
1905
1906                 # Extract video thumbnail
1907                 if self._downloader.params.get('forcethumbnail', False):
1908                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1909                         try:
1910                                 webpage = urllib2.urlopen(request).read()
1911                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1912                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1913                                 return
1914                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1915                         if mobj is None:
1916                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1917                                 return
1918                         video_thumbnail = mobj.group(1)
1919                 else:   # we need something to pass to process_info
1920                         video_thumbnail = ''
1921
1922                 try:
1923                         # Process video information
1924                         self._downloader.process_info({
1925                                 'id':           video_id.decode('utf-8'),
1926                                 'url':          video_url.decode('utf-8'),
1927                                 'uploader':     u'NA',
1928                                 'upload_date':  u'NA',
1929                                 'title':        video_title,
1930                                 'stitle':       simple_title,
1931                                 'ext':          video_extension.decode('utf-8'),
1932                                 'format':       u'NA',
1933                                 'player_url':   None,
1934                         })
1935                 except UnavailableVideoError:
1936                         self._downloader.trouble(u'\nERROR: unable to download video')
1937
1938
1939 class PhotobucketIE(InfoExtractor):
1940         """Information extractor for photobucket.com."""
1941
1942         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1943         IE_NAME = u'photobucket'
1944
1945         def __init__(self, downloader=None):
1946                 InfoExtractor.__init__(self, downloader)
1947
1948         def report_download_webpage(self, video_id):
1949                 """Report webpage download."""
1950                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1951
1952         def report_extraction(self, video_id):
1953                 """Report information extraction."""
1954                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1955
1956         def _real_extract(self, url):
1957                 # Extract id from URL
1958                 mobj = re.match(self._VALID_URL, url)
1959                 if mobj is None:
1960                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1961                         return
1962
1963                 # At this point we have a new video
1964                 self._downloader.increment_downloads()
1965                 video_id = mobj.group(1)
1966
1967                 video_extension = 'flv'
1968
1969                 # Retrieve video webpage to extract further information
1970                 request = urllib2.Request(url)
1971                 try:
1972                         self.report_download_webpage(video_id)
1973                         webpage = urllib2.urlopen(request).read()
1974                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1975                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1976                         return
1977
1978                 # Extract URL, uploader, and title from webpage
1979                 self.report_extraction(video_id)
1980                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1981                 if mobj is None:
1982                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1983                         return
1984                 mediaURL = urllib.unquote(mobj.group(1))
1985
1986                 video_url = mediaURL
1987
1988                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1989                 if mobj is None:
1990                         self._downloader.trouble(u'ERROR: unable to extract title')
1991                         return
1992                 video_title = mobj.group(1).decode('utf-8')
1993                 video_title = sanitize_title(video_title)
1994                 simple_title = _simplify_title(vide_title)
1995
1996                 video_uploader = mobj.group(2).decode('utf-8')
1997
1998                 try:
1999                         # Process video information
2000                         self._downloader.process_info({
2001                                 'id':           video_id.decode('utf-8'),
2002                                 'url':          video_url.decode('utf-8'),
2003                                 'uploader':     video_uploader,
2004                                 'upload_date':  u'NA',
2005                                 'title':        video_title,
2006                                 'stitle':       simple_title,
2007                                 'ext':          video_extension.decode('utf-8'),
2008                                 'format':       u'NA',
2009                                 'player_url':   None,
2010                         })
2011                 except UnavailableVideoError:
2012                         self._downloader.trouble(u'\nERROR: unable to download video')
2013
2014
2015 class YahooIE(InfoExtractor):
2016         """Information extractor for video.yahoo.com."""
2017
2018         # _VALID_URL matches all Yahoo! Video URLs
2019         # _VPAGE_URL matches only the extractable '/watch/' URLs
2020         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
2021         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
2022         IE_NAME = u'video.yahoo'
2023
2024         def __init__(self, downloader=None):
2025                 InfoExtractor.__init__(self, downloader)
2026
2027         def report_download_webpage(self, video_id):
2028                 """Report webpage download."""
2029                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
2030
2031         def report_extraction(self, video_id):
2032                 """Report information extraction."""
2033                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
2034
2035         def _real_extract(self, url, new_video=True):
2036                 # Extract ID from URL
2037                 mobj = re.match(self._VALID_URL, url)
2038                 if mobj is None:
2039                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2040                         return
2041
2042                 # At this point we have a new video
2043                 self._downloader.increment_downloads()
2044                 video_id = mobj.group(2)
2045                 video_extension = 'flv'
2046
2047                 # Rewrite valid but non-extractable URLs as
2048                 # extractable English language /watch/ URLs
2049                 if re.match(self._VPAGE_URL, url) is None:
2050                         request = urllib2.Request(url)
2051                         try:
2052                                 webpage = urllib2.urlopen(request).read()
2053                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2054                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2055                                 return
2056
2057                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2058                         if mobj is None:
2059                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
2060                                 return
2061                         yahoo_id = mobj.group(1)
2062
2063                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2064                         if mobj is None:
2065                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2066                                 return
2067                         yahoo_vid = mobj.group(1)
2068
2069                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2070                         return self._real_extract(url, new_video=False)
2071
2072                 # Retrieve video webpage to extract further information
2073                 request = urllib2.Request(url)
2074                 try:
2075                         self.report_download_webpage(video_id)
2076                         webpage = urllib2.urlopen(request).read()
2077                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2078                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2079                         return
2080
2081                 # Extract uploader and title from webpage
2082                 self.report_extraction(video_id)
2083                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2084                 if mobj is None:
2085                         self._downloader.trouble(u'ERROR: unable to extract video title')
2086                         return
2087                 video_title = mobj.group(1).decode('utf-8')
2088                 simple_title = _simplify_title(video_title)
2089
2090                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2091                 if mobj is None:
2092                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2093                         return
2094                 video_uploader = mobj.group(1).decode('utf-8')
2095
2096                 # Extract video thumbnail
2097                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2098                 if mobj is None:
2099                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2100                         return
2101                 video_thumbnail = mobj.group(1).decode('utf-8')
2102
2103                 # Extract video description
2104                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2105                 if mobj is None:
2106                         self._downloader.trouble(u'ERROR: unable to extract video description')
2107                         return
2108                 video_description = mobj.group(1).decode('utf-8')
2109                 if not video_description:
2110                         video_description = 'No description available.'
2111
2112                 # Extract video height and width
2113                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2114                 if mobj is None:
2115                         self._downloader.trouble(u'ERROR: unable to extract video height')
2116                         return
2117                 yv_video_height = mobj.group(1)
2118
2119                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2120                 if mobj is None:
2121                         self._downloader.trouble(u'ERROR: unable to extract video width')
2122                         return
2123                 yv_video_width = mobj.group(1)
2124
2125                 # Retrieve video playlist to extract media URL
2126                 # I'm not completely sure what all these options are, but we
2127                 # seem to need most of them, otherwise the server sends a 401.
2128                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
2129                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
2130                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2131                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2132                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2133                 try:
2134                         self.report_download_webpage(video_id)
2135                         webpage = urllib2.urlopen(request).read()
2136                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2137                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2138                         return
2139
2140                 # Extract media URL from playlist XML
2141                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2142                 if mobj is None:
2143                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
2144                         return
2145                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2146                 video_url = _unescapeHTML(video_url)
2147
2148                 try:
2149                         # Process video information
2150                         self._downloader.process_info({
2151                                 'id':           video_id.decode('utf-8'),
2152                                 'url':          video_url,
2153                                 'uploader':     video_uploader,
2154                                 'upload_date':  u'NA',
2155                                 'title':        video_title,
2156                                 'stitle':       simple_title,
2157                                 'ext':          video_extension.decode('utf-8'),
2158                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2159                                 'description':  video_description,
2160                                 'thumbnail':    video_thumbnail,
2161                                 'player_url':   None,
2162                         })
2163                 except UnavailableVideoError:
2164                         self._downloader.trouble(u'\nERROR: unable to download video')
2165
2166
class VimeoIE(InfoExtractor):
	"""Information extractor for vimeo.com."""

	# _VALID_URL matches Vimeo URLs
	# NOTE(review): the dot in '(?:(?:www|player).)?' is unescaped, so it
	# matches any character there — harmless but presumably meant as '\.'.
	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
	IE_NAME = u'vimeo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Now we begin extracting as much information as we can from what we
		# retrieved. First we extract the information common to all extractors,
		# and latter we extract those that are Vimeo specific.
		self.report_extraction(video_id)

		# Extract the config JSON by slicing the page between the
		# ' = {config:' and ',assets:' markers
		config = webpage.split(' = {config:')[1].split(',assets:')[0]
		try:
			# NOTE(review): `json` does not appear in the import block at
			# the top of this file — confirm it is imported elsewhere,
			# otherwise this raises NameError (swallowed by the bare
			# except below as "unable to extract info section").
			config = json.loads(config)
		except:
			# NOTE(review): bare except hides the real error; narrowing to
			# (ValueError, NameError) would aid debugging.
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return
		
		# Extract title
		video_title = config["video"]["title"]
		simple_title = _simplify_title(video_title)

		# Extract uploader
		video_uploader = config["video"]["owner"]["name"]

		# Extract video thumbnail
		video_thumbnail = config["video"]["thumbnail"]

		# Extract video description from the page HTML (not the config JSON)
		video_description = get_element_by_id("description", webpage)
		if video_description: video_description = clean_html(video_description.decode('utf8'))
		else: video_description = ''

		# Extract upload date; optional, so no trouble() on failure
		video_upload_date = u'NA'
		mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
		if mobj is not None:
			video_upload_date = mobj.group(1)

		# Vimeo specific: extract request signature and timestamp
		sig = config['request']['signature']
		timestamp = config['request']['timestamp']

		# Vimeo specific: extract video codec and quality information
		# TODO bind to format param
		# First codec in this preference order that appears in the config
		# wins; the for/else reports failure only when none matched.
		codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
		for codec in codecs:
			if codec[0] in config["video"]["files"]:
				video_codec = codec[0]
				video_extension = codec[1]
				if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
				else: quality = 'sd'
				break
		else:
			self._downloader.trouble(u'ERROR: no known codec found')
			return

		video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
					%(video_id, sig, timestamp, quality, video_codec.upper())

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id,
				'url':		video_url,
				'uploader':	video_uploader,
				'upload_date':	video_upload_date,
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension,
				'thumbnail':	video_thumbnail,
				'description':	video_description,
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
2276
2277
2278 class GenericIE(InfoExtractor):
2279         """Generic last-resort information extractor."""
2280
2281         _VALID_URL = r'.*'
2282         IE_NAME = u'generic'
2283
2284         def __init__(self, downloader=None):
2285                 InfoExtractor.__init__(self, downloader)
2286
2287         def report_download_webpage(self, video_id):
2288                 """Report webpage download."""
2289                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2290                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2291
2292         def report_extraction(self, video_id):
2293                 """Report information extraction."""
2294                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2295
2296         def _real_extract(self, url):
2297                 # At this point we have a new video
2298                 self._downloader.increment_downloads()
2299
2300                 video_id = url.split('/')[-1]
2301                 request = urllib2.Request(url)
2302                 try:
2303                         self.report_download_webpage(video_id)
2304                         webpage = urllib2.urlopen(request).read()
2305                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2306                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2307                         return
2308                 except ValueError, err:
2309                         # since this is the last-resort InfoExtractor, if
2310                         # this error is thrown, it'll be thrown here
2311                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2312                         return
2313
2314                 self.report_extraction(video_id)
2315                 # Start with something easy: JW Player in SWFObject
2316                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2317                 if mobj is None:
2318                         # Broaden the search a little bit
2319                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2320                 if mobj is None:
2321                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2322                         return
2323
2324                 # It's possible that one of the regexes
2325                 # matched, but returned an empty group:
2326                 if mobj.group(1) is None:
2327                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2328                         return
2329
2330                 video_url = urllib.unquote(mobj.group(1))
2331                 video_id = os.path.basename(video_url)
2332
2333                 # here's a fun little line of code for you:
2334                 video_extension = os.path.splitext(video_id)[1][1:]
2335                 video_id = os.path.splitext(video_id)[0]
2336
2337                 # it's tempting to parse this further, but you would
2338                 # have to take into account all the variations like
2339                 #   Video Title - Site Name
2340                 #   Site Name | Video Title
2341                 #   Video Title - Tagline | Site Name
2342                 # and so on and so forth; it's just not practical
2343                 mobj = re.search(r'<title>(.*)</title>', webpage)
2344                 if mobj is None:
2345                         self._downloader.trouble(u'ERROR: unable to extract title')
2346                         return
2347                 video_title = mobj.group(1).decode('utf-8')
2348                 video_title = sanitize_title(video_title)
2349                 simple_title = _simplify_title(video_title)
2350
2351                 # video uploader is domain name
2352                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2353                 if mobj is None:
2354                         self._downloader.trouble(u'ERROR: unable to extract title')
2355                         return
2356                 video_uploader = mobj.group(1).decode('utf-8')
2357
2358                 try:
2359                         # Process video information
2360                         self._downloader.process_info({
2361                                 'id':           video_id.decode('utf-8'),
2362                                 'url':          video_url.decode('utf-8'),
2363                                 'uploader':     video_uploader,
2364                                 'upload_date':  u'NA',
2365                                 'title':        video_title,
2366                                 'stitle':       simple_title,
2367                                 'ext':          video_extension.decode('utf-8'),
2368                                 'format':       u'NA',
2369                                 'player_url':   None,
2370                         })
2371                 except UnavailableVideoError, err:
2372                         self._downloader.trouble(u'\nERROR: unable to download video')
2373
2374
2375 class YoutubeSearchIE(InfoExtractor):
2376         """Information Extractor for YouTube search queries."""
2377         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2378         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2379         _youtube_ie = None
2380         _max_youtube_results = 1000
2381         IE_NAME = u'youtube:search'
2382
	def __init__(self, youtube_ie, downloader=None):
		# Store the YoutubeIE instance that performs the per-video
		# extraction for each search result.
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
2386
2387         def report_download_page(self, query, pagenum):
2388                 """Report attempt to download playlist page with given number."""
2389                 query = query.decode(preferredencoding())
2390                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2391
2392         def _real_initialize(self):
2393                 self._youtube_ie.initialize()
2394
2395         def _real_extract(self, query):
2396                 mobj = re.match(self._VALID_URL, query)
2397                 if mobj is None:
2398                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2399                         return
2400
2401                 prefix, query = query.split(':')
2402                 prefix = prefix[8:]
2403                 query = query.encode('utf-8')
2404                 if prefix == '':
2405                         self._download_n_results(query, 1)
2406                         return
2407                 elif prefix == 'all':
2408                         self._download_n_results(query, self._max_youtube_results)
2409                         return
2410                 else:
2411                         try:
2412                                 n = long(prefix)
2413                                 if n <= 0:
2414                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2415                                         return
2416                                 elif n > self._max_youtube_results:
2417                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2418                                         n = self._max_youtube_results
2419                                 self._download_n_results(query, n)
2420                                 return
2421                         except ValueError: # parsing prefix as integer fails
2422                                 self._download_n_results(query, 1)
2423                                 return
2424
2425         def _download_n_results(self, query, n):
2426                 """Downloads a specified number of results for a query"""
2427
2428                 video_ids = []
2429                 pagenum = 0
2430                 limit = n
2431
2432                 while (50 * pagenum) < limit:
2433                         self.report_download_page(query, pagenum+1)
2434                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2435                         request = urllib2.Request(result_url)
2436                         try:
2437                                 data = urllib2.urlopen(request).read()
2438                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2439                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2440                                 return
2441                         api_response = json.loads(data)['data']
2442
2443                         new_ids = list(video['id'] for video in api_response['items'])
2444                         video_ids += new_ids
2445
2446                         limit = min(n, api_response['totalItems'])
2447                         pagenum += 1
2448
2449                 if len(video_ids) > n:
2450                         video_ids = video_ids[:n]
2451                 for id in video_ids:
2452                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2453                 return
2454
2455
2456 class GoogleSearchIE(InfoExtractor):
2457         """Information Extractor for Google Video search queries."""
2458         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2459         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2460         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2461         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2462         _google_ie = None
2463         _max_google_results = 1000
2464         IE_NAME = u'video.google:search'
2465
2466         def __init__(self, google_ie, downloader=None):
2467                 InfoExtractor.__init__(self, downloader)
2468                 self._google_ie = google_ie
2469
2470         def report_download_page(self, query, pagenum):
2471                 """Report attempt to download playlist page with given number."""
2472                 query = query.decode(preferredencoding())
2473                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2474
2475         def _real_initialize(self):
2476                 self._google_ie.initialize()
2477
2478         def _real_extract(self, query):
2479                 mobj = re.match(self._VALID_URL, query)
2480                 if mobj is None:
2481                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2482                         return
2483
2484                 prefix, query = query.split(':')
2485                 prefix = prefix[8:]
2486                 query = query.encode('utf-8')
2487                 if prefix == '':
2488                         self._download_n_results(query, 1)
2489                         return
2490                 elif prefix == 'all':
2491                         self._download_n_results(query, self._max_google_results)
2492                         return
2493                 else:
2494                         try:
2495                                 n = long(prefix)
2496                                 if n <= 0:
2497                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2498                                         return
2499                                 elif n > self._max_google_results:
2500                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2501                                         n = self._max_google_results
2502                                 self._download_n_results(query, n)
2503                                 return
2504                         except ValueError: # parsing prefix as integer fails
2505                                 self._download_n_results(query, 1)
2506                                 return
2507
2508         def _download_n_results(self, query, n):
2509                 """Downloads a specified number of results for a query"""
2510
2511                 video_ids = []
2512                 pagenum = 0
2513
2514                 while True:
2515                         self.report_download_page(query, pagenum)
2516                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2517                         request = urllib2.Request(result_url)
2518                         try:
2519                                 page = urllib2.urlopen(request).read()
2520                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2521                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2522                                 return
2523
2524                         # Extract video identifiers
2525                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2526                                 video_id = mobj.group(1)
2527                                 if video_id not in video_ids:
2528                                         video_ids.append(video_id)
2529                                         if len(video_ids) == n:
2530                                                 # Specified n videos reached
2531                                                 for id in video_ids:
2532                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2533                                                 return
2534
2535                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2536                                 for id in video_ids:
2537                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2538                                 return
2539
2540                         pagenum = pagenum + 1
2541
2542
2543 class YahooSearchIE(InfoExtractor):
2544         """Information Extractor for Yahoo! Video search queries."""
2545         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2546         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2547         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2548         _MORE_PAGES_INDICATOR = r'\s*Next'
2549         _yahoo_ie = None
2550         _max_yahoo_results = 1000
2551         IE_NAME = u'video.yahoo:search'
2552
2553         def __init__(self, yahoo_ie, downloader=None):
2554                 InfoExtractor.__init__(self, downloader)
2555                 self._yahoo_ie = yahoo_ie
2556
2557         def report_download_page(self, query, pagenum):
2558                 """Report attempt to download playlist page with given number."""
2559                 query = query.decode(preferredencoding())
2560                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2561
2562         def _real_initialize(self):
2563                 self._yahoo_ie.initialize()
2564
2565         def _real_extract(self, query):
2566                 mobj = re.match(self._VALID_URL, query)
2567                 if mobj is None:
2568                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2569                         return
2570
2571                 prefix, query = query.split(':')
2572                 prefix = prefix[8:]
2573                 query = query.encode('utf-8')
2574                 if prefix == '':
2575                         self._download_n_results(query, 1)
2576                         return
2577                 elif prefix == 'all':
2578                         self._download_n_results(query, self._max_yahoo_results)
2579                         return
2580                 else:
2581                         try:
2582                                 n = long(prefix)
2583                                 if n <= 0:
2584                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2585                                         return
2586                                 elif n > self._max_yahoo_results:
2587                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2588                                         n = self._max_yahoo_results
2589                                 self._download_n_results(query, n)
2590                                 return
2591                         except ValueError: # parsing prefix as integer fails
2592                                 self._download_n_results(query, 1)
2593                                 return
2594
2595         def _download_n_results(self, query, n):
2596                 """Downloads a specified number of results for a query"""
2597
2598                 video_ids = []
2599                 already_seen = set()
2600                 pagenum = 1
2601
2602                 while True:
2603                         self.report_download_page(query, pagenum)
2604                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2605                         request = urllib2.Request(result_url)
2606                         try:
2607                                 page = urllib2.urlopen(request).read()
2608                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2609                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2610                                 return
2611
2612                         # Extract video identifiers
2613                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2614                                 video_id = mobj.group(1)
2615                                 if video_id not in already_seen:
2616                                         video_ids.append(video_id)
2617                                         already_seen.add(video_id)
2618                                         if len(video_ids) == n:
2619                                                 # Specified n videos reached
2620                                                 for id in video_ids:
2621                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2622                                                 return
2623
2624                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2625                                 for id in video_ids:
2626                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2627                                 return
2628
2629                         pagenum = pagenum + 1
2630
2631
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists.

	Matches playlist/course/artist/user-upload URLs, walks the paginated
	playlist pages and hands each collected video id to the wrapped
	YoutubeIE. A URL that names a single video inside the playlist
	(third regex group) is forwarded to YoutubeIE directly.
	"""

	# Group 1: playlist-type prefix (p/a/list), group 2: playlist id,
	# group 3: optional single-video id embedded in the URL.
	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	# Filled with (access page, prefix, playlist id, page number).
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	# %s is the playlist id; matches watch links belonging to that list.
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
	# Presence of a "Next" link means more pages follow.
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	_youtube_ie = None
	IE_NAME = u'youtube:playlist'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_initialize(self):
		"""Initialize the wrapped YoutubeIE (e.g. login) before extraction."""
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Collect all video ids of the playlist and extract each one."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case
		if mobj.group(3) is not None:
			self._youtube_ie.extract(mobj.group(3))
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			# Artist pages use a different query parameter and endpoint.
			playlist_access = 'artist'
		else:
			# Everything else is fetched via view_play_list with 'p='.
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)
		video_ids = []
		pagenum = 1

		while True:
			self.report_download_page(playlist_id, pagenum)
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			request = urllib2.Request(url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated per page)
			ids_in_page = []
			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			# Stop when there is no "Next" link on the page.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum = pagenum + 1

		# playliststart is 1-based in the options; convert to 0-based index.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
		return
2708
2709
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Resolves a user page (or ytuser:NAME pseudo-URL) to the list of the
	user's uploaded videos via the YouTube GData API and hands each video
	id to the wrapped YoutubeIE.
	"""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# Maximum number of results the GData API returns per request.
	_GDATA_PAGE_SIZE = 50
	# Filled with (username, page size, 1-based start index).
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	_youtube_ie = None
	IE_NAME = u'youtube:user'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		"""Initialize the wrapped YoutubeIE (e.g. login) before extraction."""
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Collect all upload ids for the user and extract each video."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated per page)
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		# playliststart is 1-based in the options; convert to 0-based index.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2796
2797
2798 class DepositFilesIE(InfoExtractor):
2799         """Information extractor for depositfiles.com"""
2800
2801         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2802         IE_NAME = u'DepositFiles'
2803
2804         def __init__(self, downloader=None):
2805                 InfoExtractor.__init__(self, downloader)
2806
2807         def report_download_webpage(self, file_id):
2808                 """Report webpage download."""
2809                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2810
2811         def report_extraction(self, file_id):
2812                 """Report information extraction."""
2813                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2814
2815         def _real_extract(self, url):
2816                 # At this point we have a new file
2817                 self._downloader.increment_downloads()
2818
2819                 file_id = url.split('/')[-1]
2820                 # Rebuild url in english locale
2821                 url = 'http://depositfiles.com/en/files/' + file_id
2822
2823                 # Retrieve file webpage with 'Free download' button pressed
2824                 free_download_indication = { 'gateway_result' : '1' }
2825                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2826                 try:
2827                         self.report_download_webpage(file_id)
2828                         webpage = urllib2.urlopen(request).read()
2829                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2830                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2831                         return
2832
2833                 # Search for the real file URL
2834                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2835                 if (mobj is None) or (mobj.group(1) is None):
2836                         # Try to figure out reason of the error.
2837                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2838                         if (mobj is not None) and (mobj.group(1) is not None):
2839                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2840                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2841                         else:
2842                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2843                         return
2844
2845                 file_url = mobj.group(1)
2846                 file_extension = os.path.splitext(file_url)[1][1:]
2847
2848                 # Search for file title
2849                 mobj = re.search(r'<b title="(.*?)">', webpage)
2850                 if mobj is None:
2851                         self._downloader.trouble(u'ERROR: unable to extract title')
2852                         return
2853                 file_title = mobj.group(1).decode('utf-8')
2854
2855                 try:
2856                         # Process file information
2857                         self._downloader.process_info({
2858                                 'id':           file_id.decode('utf-8'),
2859                                 'url':          file_url.decode('utf-8'),
2860                                 'uploader':     u'NA',
2861                                 'upload_date':  u'NA',
2862                                 'title':        file_title,
2863                                 'stitle':       file_title,
2864                                 'ext':          file_extension.decode('utf-8'),
2865                                 'format':       u'NA',
2866                                 'player_url':   None,
2867                         })
2868                 except UnavailableVideoError, err:
2869                         self._downloader.trouble(u'ERROR: unable to download file')
2870
2871
2872 class FacebookIE(InfoExtractor):
2873         """Information Extractor for Facebook"""
2874
2875         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2876         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2877         _NETRC_MACHINE = 'facebook'
2878         _available_formats = ['video', 'highqual', 'lowqual']
2879         _video_extensions = {
2880                 'video': 'mp4',
2881                 'highqual': 'mp4',
2882                 'lowqual': 'mp4',
2883         }
2884         IE_NAME = u'facebook'
2885
	def __init__(self, downloader=None):
		"""Create the extractor and register the optional downloader."""
		InfoExtractor.__init__(self, downloader)
2888
	def _reporter(self, message):
		"""Add the '[facebook]' header to *message* and report it."""
		self._downloader.to_screen(u'[facebook] %s' % message)
2892
	def report_login(self):
		"""Report that a Facebook login attempt is starting."""
		self._reporter(u'Logging in')
2896
	def report_video_webpage_download(self, video_id):
		"""Report attempt to download the webpage for *video_id*."""
		self._reporter(u'%s: Downloading video webpage' % video_id)
2900
	def report_information_extraction(self, video_id):
		"""Report attempt to extract information for *video_id*."""
		self._reporter(u'%s: Extracting video information' % video_id)
2904
2905         def _parse_page(self, video_webpage):
2906                 """Extract video information from page"""
2907                 # General data
2908                 data = {'title': r'\("video_title", "(.*?)"\)',
2909                         'description': r'<div class="datawrap">(.*?)</div>',
2910                         'owner': r'\("video_owner_name", "(.*?)"\)',
2911                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2912                         }
2913                 video_info = {}
2914                 for piece in data.keys():
2915                         mobj = re.search(data[piece], video_webpage)
2916                         if mobj is not None:
2917                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2918
2919                 # Video urls
2920                 video_urls = {}
2921                 for fmt in self._available_formats:
2922                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2923                         if mobj is not None:
2924                                 # URL is in a Javascript segment inside an escaped Unicode format within
2925                                 # the generally utf-8 page
2926                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2927                 video_info['video_urls'] = video_urls
2928
2929                 return video_info
2930
	def _real_initialize(self):
		"""Log in to Facebook using supplied credentials or .netrc data.

		Login is best-effort: all failures are reported as warnings and
		extraction proceeds without an authenticated session.
		"""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# No credentials available: continue without logging in.
		if useremail is None:
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# A login form in the response means we are still logged out.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return
2974
2975         def _real_extract(self, url):
2976                 mobj = re.match(self._VALID_URL, url)
2977                 if mobj is None:
2978                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2979                         return
2980                 video_id = mobj.group('ID')
2981
2982                 # Get video webpage
2983                 self.report_video_webpage_download(video_id)
2984                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2985                 try:
2986                         page = urllib2.urlopen(request)
2987                         video_webpage = page.read()
2988                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2989                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2990                         return
2991
2992                 # Start extracting information
2993                 self.report_information_extraction(video_id)
2994
2995                 # Extract information
2996                 video_info = self._parse_page(video_webpage)
2997
2998                 # uploader
2999                 if 'owner' not in video_info:
3000                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
3001                         return
3002                 video_uploader = video_info['owner']
3003
3004                 # title
3005                 if 'title' not in video_info:
3006                         self._downloader.trouble(u'ERROR: unable to extract video title')
3007                         return
3008                 video_title = video_info['title']
3009                 video_title = video_title.decode('utf-8')
3010                 video_title = sanitize_title(video_title)
3011
3012                 simple_title = _simplify_title(video_title)
3013
3014                 # thumbnail image
3015                 if 'thumbnail' not in video_info:
3016                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
3017                         video_thumbnail = ''
3018                 else:
3019                         video_thumbnail = video_info['thumbnail']
3020
3021                 # upload date
3022                 upload_date = u'NA'
3023                 if 'upload_date' in video_info:
3024                         upload_time = video_info['upload_date']
3025                         timetuple = email.utils.parsedate_tz(upload_time)
3026                         if timetuple is not None:
3027                                 try:
3028                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
3029                                 except:
3030                                         pass
3031
3032                 # description
3033                 video_description = video_info.get('description', 'No description available.')
3034
3035                 url_map = video_info['video_urls']
3036                 if len(url_map.keys()) > 0:
3037                         # Decide which formats to download
3038                         req_format = self._downloader.params.get('format', None)
3039                         format_limit = self._downloader.params.get('format_limit', None)
3040
3041                         if format_limit is not None and format_limit in self._available_formats:
3042                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
3043                         else:
3044                                 format_list = self._available_formats
3045                         existing_formats = [x for x in format_list if x in url_map]
3046                         if len(existing_formats) == 0:
3047                                 self._downloader.trouble(u'ERROR: no known formats available for video')
3048                                 return
3049                         if req_format is None:
3050                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3051                         elif req_format == 'worst':
3052                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3053                         elif req_format == '-1':
3054                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3055                         else:
3056                                 # Specific format
3057                                 if req_format not in url_map:
3058                                         self._downloader.trouble(u'ERROR: requested format not available')
3059                                         return
3060                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
3061
3062                 for format_param, video_real_url in video_url_list:
3063
3064                         # At this point we have a new video
3065                         self._downloader.increment_downloads()
3066
3067                         # Extension
3068                         video_extension = self._video_extensions.get(format_param, 'mp4')
3069
3070                         try:
3071                                 # Process video information
3072                                 self._downloader.process_info({
3073                                         'id':           video_id.decode('utf-8'),
3074                                         'url':          video_real_url.decode('utf-8'),
3075                                         'uploader':     video_uploader.decode('utf-8'),
3076                                         'upload_date':  upload_date,
3077                                         'title':        video_title,
3078                                         'stitle':       simple_title,
3079                                         'ext':          video_extension.decode('utf-8'),
3080                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3081                                         'thumbnail':    video_thumbnail.decode('utf-8'),
3082                                         'description':  video_description.decode('utf-8'),
3083                                         'player_url':   None,
3084                                 })
3085                         except UnavailableVideoError, err:
3086                                 self._downloader.trouble(u'\nERROR: unable to download video')
3087
3088 class BlipTVIE(InfoExtractor):
3089         """Information extractor for blip.tv"""
3090
3091         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
3092         _URL_EXT = r'^.*\.([a-z0-9]+)$'
3093         IE_NAME = u'blip.tv'
3094
3095         def report_extraction(self, file_id):
3096                 """Report information extraction."""
3097                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3098
3099         def report_direct_download(self, title):
3100                 """Report information extraction."""
3101                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3102
3103         def _real_extract(self, url):
3104                 mobj = re.match(self._VALID_URL, url)
3105                 if mobj is None:
3106                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3107                         return
3108
3109                 if '?' in url:
3110                         cchar = '&'
3111                 else:
3112                         cchar = '?'
3113                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3114                 request = urllib2.Request(json_url)
3115                 self.report_extraction(mobj.group(1))
3116                 info = None
3117                 try:
3118                         urlh = urllib2.urlopen(request)
3119                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3120                                 basename = url.split('/')[-1]
3121                                 title,ext = os.path.splitext(basename)
3122                                 title = title.decode('UTF-8')
3123                                 ext = ext.replace('.', '')
3124                                 self.report_direct_download(title)
3125                                 info = {
3126                                         'id': title,
3127                                         'url': url,
3128                                         'title': title,
3129                                         'stitle': _simplify_title(title),
3130                                         'ext': ext,
3131                                         'urlhandle': urlh
3132                                 }
3133                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3134                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3135                         return
3136                 if info is None: # Regular URL
3137                         try:
3138                                 json_code = urlh.read()
3139                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3140                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3141                                 return
3142
3143                         try:
3144                                 json_data = json.loads(json_code)
3145                                 if 'Post' in json_data:
3146                                         data = json_data['Post']
3147                                 else:
3148                                         data = json_data
3149         
3150                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3151                                 video_url = data['media']['url']
3152                                 umobj = re.match(self._URL_EXT, video_url)
3153                                 if umobj is None:
3154                                         raise ValueError('Can not determine filename extension')
3155                                 ext = umobj.group(1)
3156         
3157                                 info = {
3158                                         'id': data['item_id'],
3159                                         'url': video_url,
3160                                         'uploader': data['display_name'],
3161                                         'upload_date': upload_date,
3162                                         'title': data['title'],
3163                                         'stitle': _simplify_title(data['title']),
3164                                         'ext': ext,
3165                                         'format': data['media']['mimeType'],
3166                                         'thumbnail': data['thumbnailUrl'],
3167                                         'description': data['description'],
3168                                         'player_url': data['embedUrl']
3169                                 }
3170                         except (ValueError,KeyError), err:
3171                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3172                                 return
3173
3174                 self._downloader.increment_downloads()
3175
3176                 try:
3177                         self._downloader.process_info(info)
3178                 except UnavailableVideoError, err:
3179                         self._downloader.trouble(u'\nERROR: unable to download video')
3180
3181
3182 class MyVideoIE(InfoExtractor):
3183         """Information Extractor for myvideo.de."""
3184
3185         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3186         IE_NAME = u'myvideo'
3187
3188         def __init__(self, downloader=None):
3189                 InfoExtractor.__init__(self, downloader)
3190         
3191         def report_download_webpage(self, video_id):
3192                 """Report webpage download."""
3193                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3194
3195         def report_extraction(self, video_id):
3196                 """Report information extraction."""
3197                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3198
3199         def _real_extract(self,url):
3200                 mobj = re.match(self._VALID_URL, url)
3201                 if mobj is None:
3202                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3203                         return
3204
3205                 video_id = mobj.group(1)
3206
3207                 # Get video webpage
3208                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3209                 try:
3210                         self.report_download_webpage(video_id)
3211                         webpage = urllib2.urlopen(request).read()
3212                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3213                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3214                         return
3215
3216                 self.report_extraction(video_id)
3217                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3218                                  webpage)
3219                 if mobj is None:
3220                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3221                         return
3222                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3223
3224                 mobj = re.search('<title>([^<]+)</title>', webpage)
3225                 if mobj is None:
3226                         self._downloader.trouble(u'ERROR: unable to extract title')
3227                         return
3228
3229                 video_title = mobj.group(1)
3230                 video_title = sanitize_title(video_title)
3231
3232                 simple_title = _simplify_title(video_title)
3233
3234                 try:
3235                         self._downloader.process_info({
3236                                 'id':           video_id,
3237                                 'url':          video_url,
3238                                 'uploader':     u'NA',
3239                                 'upload_date':  u'NA',
3240                                 'title':        video_title,
3241                                 'stitle':       simple_title,
3242                                 'ext':          u'flv',
3243                                 'format':       u'NA',
3244                                 'player_url':   None,
3245                         })
3246                 except UnavailableVideoError:
3247                         self._downloader.trouble(u'\nERROR: Unable to download video')
3248
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a shortname alias (":tds", ":colbert", ...) that
	# resolves to the newest full episode, or a full-episodes URL on
	# thedailyshow.com / colbertnation.com.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report the download of a media-item configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report the download of the episode's media index (MRSS feed)."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report the resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Download every media item of the selected full episode.

		Resolves shortname aliases via the site redirect, locates the
		Flash URL in the episode page, fetches the MRSS index of media
		items and downloads the highest-bitrate rendition of each one.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Map shortname aliases to the corresponding full-episodes page.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty 'episode' group means "download the newest episode":
		# the site will redirect us to its URL.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Follow the redirect to discover the concrete episode URL.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The page embeds the Flash player URL (with the mtvnservices
		# media URI) either as a <param> or in a JS variable.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve redirects on the raw player URL to obtain the final one.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		# Fetch the MRSS index that lists the episode's media items.
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# The <guid> has the form "...:<showId>.com:...:<shortMediaId>".
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Per-item configuration XML lists the available renditions.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3383
3384
3385 class EscapistIE(InfoExtractor):
3386         """Information extractor for The Escapist """
3387
3388         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3389         IE_NAME = u'escapist'
3390
3391         def report_extraction(self, showName):
3392                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3393
3394         def report_config_download(self, showName):
3395                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3396
3397         def _real_extract(self, url):
3398                 mobj = re.match(self._VALID_URL, url)
3399                 if mobj is None:
3400                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3401                         return
3402                 showName = mobj.group('showname')
3403                 videoId = mobj.group('episode')
3404
3405                 self.report_extraction(showName)
3406                 try:
3407                         webPage = urllib2.urlopen(url).read()
3408                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3409                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3410                         return
3411
3412                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3413                 description = _unescapeHTML(descMatch.group(1))
3414                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3415                 imgUrl = _unescapeHTML(imgMatch.group(1))
3416                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3417                 playerUrl = _unescapeHTML(playerUrlMatch.group(1))
3418                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3419                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3420
3421                 self.report_config_download(showName)
3422                 try:
3423                         configJSON = urllib2.urlopen(configUrl).read()
3424                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3425                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3426                         return
3427
3428                 # Technically, it's JavaScript, not JSON
3429                 configJSON = configJSON.replace("'", '"')
3430
3431                 try:
3432                         config = json.loads(configJSON)
3433                 except (ValueError,), err:
3434                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3435                         return
3436
3437                 playlist = config['playlist']
3438                 videoUrl = playlist[1]['url']
3439
3440                 self._downloader.increment_downloads()
3441                 info = {
3442                         'id': videoId,
3443                         'url': videoUrl,
3444                         'uploader': showName,
3445                         'upload_date': None,
3446                         'title': showName,
3447                         'stitle': _simplify_title(showName),
3448                         'ext': 'flv',
3449                         'format': 'flv',
3450                         'thumbnail': imgUrl,
3451                         'description': description,
3452                         'player_url': playerUrl,
3453                 }
3454
3455                 try:
3456                         self._downloader.process_info(info)
3457                 except UnavailableVideoError, err:
3458                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3459
3460
3461 class CollegeHumorIE(InfoExtractor):
3462         """Information extractor for collegehumor.com"""
3463
3464         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3465         IE_NAME = u'collegehumor'
3466
3467         def report_webpage(self, video_id):
3468                 """Report information extraction."""
3469                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3470
3471         def report_extraction(self, video_id):
3472                 """Report information extraction."""
3473                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3474
3475         def _real_extract(self, url):
3476                 mobj = re.match(self._VALID_URL, url)
3477                 if mobj is None:
3478                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3479                         return
3480                 video_id = mobj.group('videoid')
3481
3482                 self.report_webpage(video_id)
3483                 request = urllib2.Request(url)
3484                 try:
3485                         webpage = urllib2.urlopen(request).read()
3486                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3487                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3488                         return
3489
3490                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3491                 if m is None:
3492                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3493                         return
3494                 internal_video_id = m.group('internalvideoid')
3495
3496                 info = {
3497                         'id': video_id,
3498                         'internal_id': internal_video_id,
3499                 }
3500
3501                 self.report_extraction(video_id)
3502                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3503                 try:
3504                         metaXml = urllib2.urlopen(xmlUrl).read()
3505                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3506                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3507                         return
3508
3509                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3510                 try:
3511                         videoNode = mdoc.findall('./video')[0]
3512                         info['description'] = videoNode.findall('./description')[0].text
3513                         info['title'] = videoNode.findall('./caption')[0].text
3514                         info['stitle'] = _simplify_title(info['title'])
3515                         info['url'] = videoNode.findall('./file')[0].text
3516                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3517                         info['ext'] = info['url'].rpartition('.')[2]
3518                         info['format'] = info['ext']
3519                 except IndexError:
3520                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3521                         return
3522
3523                 self._downloader.increment_downloads()
3524
3525                 try:
3526                         self._downloader.process_info(info)
3527                 except UnavailableVideoError, err:
3528                         self._downloader.trouble(u'\nERROR: unable to download video')
3529
3530
3531 class XVideosIE(InfoExtractor):
3532         """Information extractor for xvideos.com"""
3533
3534         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3535         IE_NAME = u'xvideos'
3536
3537         def report_webpage(self, video_id):
3538                 """Report information extraction."""
3539                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3540
3541         def report_extraction(self, video_id):
3542                 """Report information extraction."""
3543                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3544
3545         def _real_extract(self, url):
3546                 mobj = re.match(self._VALID_URL, url)
3547                 if mobj is None:
3548                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3549                         return
3550                 video_id = mobj.group(1).decode('utf-8')
3551
3552                 self.report_webpage(video_id)
3553
3554                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3555                 try:
3556                         webpage = urllib2.urlopen(request).read()
3557                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3558                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3559                         return
3560
3561                 self.report_extraction(video_id)
3562
3563
3564                 # Extract video URL
3565                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3566                 if mobj is None:
3567                         self._downloader.trouble(u'ERROR: unable to extract video url')
3568                         return
3569                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3570
3571
3572                 # Extract title
3573                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3574                 if mobj is None:
3575                         self._downloader.trouble(u'ERROR: unable to extract video title')
3576                         return
3577                 video_title = mobj.group(1).decode('utf-8')
3578
3579
3580                 # Extract video thumbnail
3581                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3582                 if mobj is None:
3583                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3584                         return
3585                 video_thumbnail = mobj.group(1).decode('utf-8')
3586
3587
3588
3589                 self._downloader.increment_downloads()
3590                 info = {
3591                         'id': video_id,
3592                         'url': video_url,
3593                         'uploader': None,
3594                         'upload_date': None,
3595                         'title': video_title,
3596                         'stitle': _simplify_title(video_title),
3597                         'ext': 'flv',
3598                         'format': 'flv',
3599                         'thumbnail': video_thumbnail,
3600                         'description': None,
3601                         'player_url': None,
3602                 }
3603
3604                 try:
3605                         self._downloader.process_info(info)
3606                 except UnavailableVideoError, err:
3607                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3608
3609
3610 class SoundcloudIE(InfoExtractor):
3611         """Information extractor for soundcloud.com
3612            To access the media, the uid of the song and a stream token
3613            must be extracted from the page source and the script must make
3614            a request to media.soundcloud.com/crossdomain.xml. Then
3615            the media can be grabbed by requesting from an url composed
3616            of the stream token and uid
3617          """
3618
3619         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3620         IE_NAME = u'soundcloud'
3621
3622         def __init__(self, downloader=None):
3623                 InfoExtractor.__init__(self, downloader)
3624
3625         def report_webpage(self, video_id):
3626                 """Report information extraction."""
3627                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3628
3629         def report_extraction(self, video_id):
3630                 """Report information extraction."""
3631                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3632
3633         def _real_extract(self, url):
3634                 mobj = re.match(self._VALID_URL, url)
3635                 if mobj is None:
3636                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3637                         return
3638
3639                 # extract uploader (which is in the url)
3640                 uploader = mobj.group(1).decode('utf-8')
3641                 # extract simple title (uploader + slug of song title)
3642                 slug_title =  mobj.group(2).decode('utf-8')
3643                 simple_title = uploader + '-' + slug_title
3644
3645                 self.report_webpage('%s/%s' % (uploader, slug_title))
3646
3647                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3648                 try:
3649                         webpage = urllib2.urlopen(request).read()
3650                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3651                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3652                         return
3653
3654                 self.report_extraction('%s/%s' % (uploader, slug_title))
3655
3656                 # extract uid and stream token that soundcloud hands out for access
3657                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3658                 if mobj:
3659                         video_id = mobj.group(1)
3660                         stream_token = mobj.group(2)
3661
3662                 # extract unsimplified title
3663                 mobj = re.search('"title":"(.*?)",', webpage)
3664                 if mobj:
3665                         title = mobj.group(1)
3666
3667                 # construct media url (with uid/token)
3668                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3669                 mediaURL = mediaURL % (video_id, stream_token)
3670
3671                 # description
3672                 description = u'No description available'
3673                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3674                 if mobj:
3675                         description = mobj.group(1)
3676                 
3677                 # upload date
3678                 upload_date = None
3679                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3680                 if mobj:
3681                         try:
3682                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3683                         except Exception, e:
3684                                 print str(e)
3685
3686                 # for soundcloud, a request to a cross domain is required for cookies
3687                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3688
3689                 try:
3690                         self._downloader.process_info({
3691                                 'id':           video_id.decode('utf-8'),
3692                                 'url':          mediaURL,
3693                                 'uploader':     uploader.decode('utf-8'),
3694                                 'upload_date':  upload_date,
3695                                 'title':        simple_title.decode('utf-8'),
3696                                 'stitle':       simple_title.decode('utf-8'),
3697                                 'ext':          u'mp3',
3698                                 'format':       u'NA',
3699                                 'player_url':   None,
3700                                 'description': description.decode('utf-8')
3701                         })
3702                 except UnavailableVideoError:
3703                         self._downloader.trouble(u'\nERROR: unable to download video')
3704
3705
3706 class InfoQIE(InfoExtractor):
3707         """Information extractor for infoq.com"""
3708
3709         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3710         IE_NAME = u'infoq'
3711
3712         def report_webpage(self, video_id):
3713                 """Report information extraction."""
3714                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3715
3716         def report_extraction(self, video_id):
3717                 """Report information extraction."""
3718                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3719
3720         def _real_extract(self, url):
3721                 mobj = re.match(self._VALID_URL, url)
3722                 if mobj is None:
3723                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3724                         return
3725
3726                 self.report_webpage(url)
3727
3728                 request = urllib2.Request(url)
3729                 try:
3730                         webpage = urllib2.urlopen(request).read()
3731                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3732                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3733                         return
3734
3735                 self.report_extraction(url)
3736
3737
3738                 # Extract video URL
3739                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3740                 if mobj is None:
3741                         self._downloader.trouble(u'ERROR: unable to extract video url')
3742                         return
3743                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3744
3745
3746                 # Extract title
3747                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3748                 if mobj is None:
3749                         self._downloader.trouble(u'ERROR: unable to extract video title')
3750                         return
3751                 video_title = mobj.group(1).decode('utf-8')
3752
3753                 # Extract description
3754                 video_description = u'No description available.'
3755                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3756                 if mobj is not None:
3757                         video_description = mobj.group(1).decode('utf-8')
3758
3759                 video_filename = video_url.split('/')[-1]
3760                 video_id, extension = video_filename.split('.')
3761
3762                 self._downloader.increment_downloads()
3763                 info = {
3764                         'id': video_id,
3765                         'url': video_url,
3766                         'uploader': None,
3767                         'upload_date': None,
3768                         'title': video_title,
3769                         'stitle': _simplify_title(video_title),
3770                         'ext': extension,
3771                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3772                         'thumbnail': None,
3773                         'description': video_description,
3774                         'player_url': None,
3775                 }
3776
3777                 try:
3778                         self._downloader.process_info(info)
3779                 except UnavailableVideoError, err:
3780                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3781
3782 class MixcloudIE(InfoExtractor):
3783         """Information extractor for www.mixcloud.com"""
3784         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3785         IE_NAME = u'mixcloud'
3786
3787         def __init__(self, downloader=None):
3788                 InfoExtractor.__init__(self, downloader)
3789
3790         def report_download_json(self, file_id):
3791                 """Report JSON download."""
3792                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3793
3794         def report_extraction(self, file_id):
3795                 """Report information extraction."""
3796                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3797
3798         def get_urls(self, jsonData, fmt, bitrate='best'):
3799                 """Get urls from 'audio_formats' section in json"""
3800                 file_url = None
3801                 try:
3802                         bitrate_list = jsonData[fmt]
3803                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3804                                 bitrate = max(bitrate_list) # select highest
3805
3806                         url_list = jsonData[fmt][bitrate]
3807                 except TypeError: # we have no bitrate info.
3808                         url_list = jsonData[fmt]
3809                                 
3810                 return url_list
3811
3812         def check_urls(self, url_list):
3813                 """Returns 1st active url from list"""
3814                 for url in url_list:
3815                         try:
3816                                 urllib2.urlopen(url)
3817                                 return url
3818                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3819                                 url = None
3820
3821                 return None
3822
3823         def _print_formats(self, formats):
3824                 print 'Available formats:'
3825                 for fmt in formats.keys():
3826                         for b in formats[fmt]:
3827                                 try:
3828                                         ext = formats[fmt][b][0]
3829                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3830                                 except TypeError: # we have no bitrate info
3831                                         ext = formats[fmt][0]
3832                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3833                                         break
3834
3835         def _real_extract(self, url):
3836                 mobj = re.match(self._VALID_URL, url)
3837                 if mobj is None:
3838                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3839                         return
3840                 # extract uploader & filename from url
3841                 uploader = mobj.group(1).decode('utf-8')
3842                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3843
3844                 # construct API request
3845                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3846                 # retrieve .json file with links to files
3847                 request = urllib2.Request(file_url)
3848                 try:
3849                         self.report_download_json(file_url)
3850                         jsonData = urllib2.urlopen(request).read()
3851                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3852                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3853                         return
3854
3855                 # parse JSON
3856                 json_data = json.loads(jsonData)
3857                 player_url = json_data['player_swf_url']
3858                 formats = dict(json_data['audio_formats'])
3859
3860                 req_format = self._downloader.params.get('format', None)
3861                 bitrate = None
3862
3863                 if self._downloader.params.get('listformats', None):
3864                         self._print_formats(formats)
3865                         return
3866
3867                 if req_format is None or req_format == 'best':
3868                         for format_param in formats.keys():
3869                                 url_list = self.get_urls(formats, format_param)
3870                                 # check urls
3871                                 file_url = self.check_urls(url_list)
3872                                 if file_url is not None:
3873                                         break # got it!
3874                 else:
3875                         if req_format not in formats.keys():
3876                                 self._downloader.trouble(u'ERROR: format is not available')
3877                                 return
3878
3879                         url_list = self.get_urls(formats, req_format)
3880                         file_url = self.check_urls(url_list)
3881                         format_param = req_format
3882
3883                 # We have audio
3884                 self._downloader.increment_downloads()
3885                 try:
3886                         # Process file information
3887                         self._downloader.process_info({
3888                                 'id': file_id.decode('utf-8'),
3889                                 'url': file_url.decode('utf-8'),
3890                                 'uploader':     uploader.decode('utf-8'),
3891                                 'upload_date': u'NA',
3892                                 'title': json_data['name'],
3893                                 'stitle': _simplify_title(json_data['name']),
3894                                 'ext': file_url.split('.')[-1].decode('utf-8'),
3895                                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3896                                 'thumbnail': json_data['thumbnail_url'],
3897                                 'description': json_data['description'],
3898                                 'player_url': player_url.decode('utf-8'),
3899                         })
3900                 except UnavailableVideoError, err:
3901                         self._downloader.trouble(u'ERROR: unable to download file')
3902
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom

	Depending on the matched URL, _real_extract handles three cases:
	  * VideoPage with course= and video= parameters: a single video,
	  * CoursePage with only course=: a playlist of its VideoPages,
	  * HomePage / site root: a playlist of all CoursePages.
	Playlist entries are followed recursively through self.extract().
	"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Per-video metadata (title, media file name) lives in an XML
			# file next to the media files under the course directory.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				# findall()[0] raises IndexError when the tag is absent.
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title; fall back to the id when the page has no <h1>.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = _unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = _unescapeHTML(m.group(1))

			# Deduplicated, order-preserving list of video page links.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(vpage),
				}
					for vpage in links]

			# Recurse into each referenced video page.
			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Deduplicated, order-preserving list of course page links.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(cpage),
				}
					for cpage in links]

			# Recurse into each referenced course page.
			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
4017
4018 class MTVIE(InfoExtractor):
4019         """Information extractor for MTV.com"""
4020
4021         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
4022         IE_NAME = u'mtv'
4023
4024         def report_webpage(self, video_id):
4025                 """Report information extraction."""
4026                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
4027
4028         def report_extraction(self, video_id):
4029                 """Report information extraction."""
4030                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
4031
4032         def _real_extract(self, url):
4033                 mobj = re.match(self._VALID_URL, url)
4034                 if mobj is None:
4035                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4036                         return
4037                 if not mobj.group('proto'):
4038                         url = 'http://' + url
4039                 video_id = mobj.group('videoid')
4040                 self.report_webpage(video_id)
4041
4042                 request = urllib2.Request(url)
4043                 try:
4044                         webpage = urllib2.urlopen(request).read()
4045                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4046                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4047                         return
4048
4049                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4050                 if mobj is None:
4051                         self._downloader.trouble(u'ERROR: unable to extract song name')
4052                         return
4053                 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4054                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4055                 if mobj is None:
4056                         self._downloader.trouble(u'ERROR: unable to extract performer')
4057                         return
4058                 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4059                 video_title = performer + ' - ' + song_name 
4060
4061                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4062                 if mobj is None:
4063                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4064                         return
4065                 mtvn_uri = mobj.group(1)
4066
4067                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4068                 if mobj is None:
4069                         self._downloader.trouble(u'ERROR: unable to extract content id')
4070                         return
4071                 content_id = mobj.group(1)
4072
4073                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4074                 self.report_extraction(video_id)
4075                 request = urllib2.Request(videogen_url)
4076                 try:
4077                         metadataXml = urllib2.urlopen(request).read()
4078                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4079                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4080                         return
4081
4082                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4083                 renditions = mdoc.findall('.//rendition')
4084
4085                 # For now, always pick the highest quality.
4086                 rendition = renditions[-1]
4087
4088                 try:
4089                         _,_,ext = rendition.attrib['type'].partition('/')
4090                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4091                         video_url = rendition.find('./src').text
4092                 except KeyError:
4093                         self._downloader.trouble('Invalid rendition field.')
4094                         return
4095
4096                 self._downloader.increment_downloads()
4097                 info = {
4098                         'id': video_id,
4099                         'url': video_url,
4100                         'uploader': performer,
4101                         'title': video_title,
4102                         'stitle': _simplify_title(video_title),
4103                         'ext': ext,
4104                         'format': format,
4105                 }
4106
4107                 try:
4108                         self._downloader.process_info(info)
4109                 except UnavailableVideoError, err:
4110                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4111
4112
class PostProcessor(object):
	"""Base class for postprocessing steps.

	Instances are registered on a downloader with its add_post_processor()
	method. After every successful download the downloader walks its chain
	of PostProcessors, feeding the value returned by one run() call into
	the next one. A run() returning None stops the chain, as does reaching
	its end.

	Like InfoExtractor, this class follows a "mutual registration" scheme
	with the downloader that owns it.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this postprocessor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		The information argument is an InfoExtractor-style dictionary
		carrying one extra key, 'filepath', which names the downloaded
		file. Return a (possibly modified) dictionary to continue the
		chain, or None to stop it. Implementations may also raise
		PostProcessingError, which the calling downloader handles.
		"""
		return information # the base class is a no-op
4158
class AudioConversionError(Exception):
	"""Raised when ffmpeg/ffprobe fails while extracting or converting audio.

	Derives from Exception rather than BaseException: BaseException is
	reserved for exit-style exceptions (KeyboardInterrupt, SystemExit),
	and deriving from it lets this error escape generic 'except Exception'
	handlers.
	"""

	def __init__(self, message):
		# Call the parent constructor so args/str(e) work as expected.
		Exception.__init__(self, message)
		self.message = message
4162
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that converts a downloaded video into an
	audio-only file by shelling out to the external ffmpeg and ffprobe
	programs.

	preferredcodec may be None/'best' (copy the existing audio stream
	when possible), or one of 'aac', 'mp3', 'vorbis', 'm4a', 'wav'.
	preferredquality is passed straight to ffmpeg as the audio bitrate
	(-ab), e.g. '128K'. When keepvideo is False the original video file
	is deleted after a successful conversion.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path, or None.

		Runs ffprobe and scans its stream listing; returns None when
		ffprobe is unavailable, exits non-zero, or reports no audio
		stream.
		"""
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			# Use open() instead of the Python-2-only file() builtin, and
			# close the devnull handle in all cases instead of leaking it.
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				devnull.close()
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				# codec_name= precedes codec_type= within each stream
				# section, so the remembered name belongs to this
				# audio stream.
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Convert path into out_path with ffmpeg.

		codec of None omits -acodec entirely. Raises
		AudioConversionError when ffmpeg is missing (errno 2) or exits
		non-zero; the message is the last line of ffmpeg's stderr.
		"""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			# sys.exc_info() keeps this compatible with the Python 2.4
			# exception syntax used elsewhere in this file.
			e = sys.exc_info()[1]
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		"""Convert information['filepath'] to the preferred audio format.

		Returns the updated information dict (with 'filepath' pointing
		at the new audio file) on success, or None to stop the
		post-processing chain on failure.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			# Bare except kept deliberately so AudioConversionError is
			# caught whatever its base class is.
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4287
4288
4289 def updateSelf(downloader, filename):
4290         ''' Update the program file with the latest version from the repository '''
4291         # Note: downloader only used for options
4292         if not os.access(filename, os.W_OK):
4293                 sys.exit('ERROR: no write permissions on %s' % filename)
4294
4295         downloader.to_screen(u'Updating to latest version...')
4296
4297         try:
4298                 try:
4299                         urlh = urllib.urlopen(UPDATE_URL)
4300                         newcontent = urlh.read()
4301                         
4302                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
4303                         if vmatch is not None and vmatch.group(1) == __version__:
4304                                 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4305                                 return
4306                 finally:
4307                         urlh.close()
4308         except (IOError, OSError), err:
4309                 sys.exit('ERROR: unable to download latest version')
4310
4311         try:
4312                 outf = open(filename, 'wb')
4313                 try:
4314                         outf.write(newcontent)
4315                 finally:
4316                         outf.close()
4317         except (IOError, OSError), err:
4318                 sys.exit('ERROR: unable to overwrite current version')
4319
4320         downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4321
def parseOpts():
	"""Build the optparse option parser and parse all option sources.

	Options are gathered, in order, from /etc/youtube-dl.conf, from the
	per-user configuration file ($XDG_CONFIG_HOME/youtube-dl.conf or
	~/.config/youtube-dl.conf), and finally from the command line, so
	later sources override earlier ones.

	Returns the tuple (parser, opts, args).
	"""
	def _readOptions(filename_bytes):
		# Read shell-style (shlex) tokens from a config file, one or
		# more per line; '#' comments are honoured. A missing file is
		# treated as empty.
		try:
			optionf = open(filename_bytes)
		except IOError:
			return [] # silently skip if file is not present
		try:
			res = []
			for l in optionf:
				res += shlex.split(l, comments=True)
		finally:
			optionf.close()
		return res

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Best-effort terminal width: honour $COLUMNS first, then ask
		# `stty size`; return None when neither works.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	# conflict_handler='resolve' lets a later option definition take
	# over a switch (e.g. -v below is first --version, then --verbose).
	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
	video_format.add_option('--write-srt',
			action='store_true', dest='writesubtitles',
			help='write video closed captions to a .srt file (currently youtube only)', default=False)
	video_format.add_option('--srt-lang',
			action='store', dest='subtitleslang', metavar='LANG',
			help='language of the closed captions to download (optional) use IETF language tags like \'en\'')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)
	# This redefinition of -v supersedes --version above, courtesy of
	# conflict_handler='resolve'.
	verbosity.add_option('-v', '--verbose',
			action='store_true', dest='verbose', help='print various debugging information', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	# System-wide config first, then per-user, then the command line, so
	# more specific sources win.
	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
4534
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# A few extractors are reused as helpers by the playlist / user /
	# search extractors, so each of those is created exactly once and
	# the same instance is shared.
	yt_ie = YoutubeIE()
	goog_ie = GoogleIE()
	yah_ie = YahooIE()

	extractor_list = [
		YoutubePlaylistIE(yt_ie),
		YoutubeUserIE(yt_ie),
		YoutubeSearchIE(yt_ie),
		yt_ie,
		MetacafeIE(yt_ie),
		DailymotionIE(),
		goog_ie,
		GoogleSearchIE(goog_ie),
		PhotobucketIE(),
		yah_ie,
		YahooSearchIE(yah_ie),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
	]
	# The catch-all extractor must remain last so it only handles URLs
	# nothing else claimed.
	extractor_list.append(GenericIE())
	return extractor_list
4571
def _real_main():
	"""Actual program logic: parse options, validate them, build the
	FileDownloader with its extractors and post-processors, and process
	every URL from the batch file and the command line.

	Always terminates the process via sys.exit() (with the downloader's
	return code on the normal path).
	"""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load when the file already exists and is readable; a
			# fresh cookie file is simply created on save.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				# NOTE(review): batchfd is never closed for regular
				# files; harmless in this short-lived CLI process.
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop blank lines and lines starting with '#', '/' or ';'
			# (comments).
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	if opts.list_extractors:
		# Print each extractor name followed by the given URLs it would
		# claim; each URL is listed under the first matching extractor.
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u'  ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 means "until the end of the playlist".
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		# Any of the --get-* options implies quiet, simulate-like
		# operation.
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# Pick the first applicable template: an explicit -o wins, then
		# templates derived from the title/literal/autonumber flags.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# Writing the video to stdout means screen output must go to
		# stderr instead.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'subtitleslang': opts.subtitleslang,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4745
def main():
	"""Command-line entry point.

	Runs _real_main() and maps the known fatal exceptions onto exit
	codes / messages. The order of the except clauses is kept as-is;
	presumably DownloadError and SameFileError are unrelated classes,
	but reordering would change behavior if one derives from the other
	-- TODO confirm against their definitions earlier in this file.
	"""
	try:
		_real_main()
	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
4755
# Run the command-line interface only when executed as a script, not on
# import.
if __name__ == '__main__':
	main()
4758
4759 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: