2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
# Package metadata and the URL used by the self-update mechanism.
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
57 except ImportError: # Python 2.4
60 import cStringIO as StringIO
64 # parse_qs was moved from the cgi module to the urlparse module recently.
66 from urlparse import parse_qs
68 from cgi import parse_qs
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
# Default HTTP headers sent with every request (the `std_headers = {`
# opening line, original line 75, is elided from this listing).  A fixed
# Firefox User-Agent avoids server-side bot filtering.
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
85 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# --- trivialjson fallback -------------------------------------------------
# Pure-Python JSON decoder used when no `json` module exists (Python < 2.6).
# NOTE(review): this is a sampled listing -- many original lines (85-193)
# are elided, so each closure below appears truncated.
91 def raiseError(msg, i):
# Error helper: report the failing position and the unparsed remainder.
92 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
93 def skipSpace(i, expectMore=True):
# Advance past JSON whitespace; raises when the input ends while more
# tokens are still expected.
94 while i < len(s) and s[i] in ' \t\r\n':
98 raiseError('Premature end', i)
100 def decodeEscape(match):
# Decode one backslash escape inside a JSON string, including \uXXXX
# and UTF-16 surrogate pairs (the 5+6 length check below).
116 return unichr(int(esc[1:5], 16))
117 if len(esc) == 5+6 and esc[5:7] == '\\u':
118 hi = int(esc[1:5], 16)
119 low = int(esc[7:11], 16)
# Combine a UTF-16 surrogate pair into a single code point.
120 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc))
# parseString: find the closing quote (preceded by an even number of
# backslashes) and substitute the escapes via decodeEscape.
128 while s[e-bslashes-1] == '\\':
130 if bslashes % 2 == 1:
134 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri = rexp.sub(decodeEscape, s[i:e])
# parseObj: parse a JSON object into a dict.
141 if s[i] == '}': # Empty dictionary
145 raiseError('Expected a string object key', i)
146 i,key = parseString(i)
148 if i >= len(s) or s[i] != ':':
149 raiseError('Expected a colon', i)
156 raiseError('Expected comma or closing curly brace', i)
# parseArray: parse a JSON array into a list.
161 if s[i] == ']': # Empty array
166 i = skipSpace(i) # Raise exception if premature end
170 raiseError('Expected a comma or closing bracket', i)
172 def parseDiscrete(i):
# Parse the literals true / false / null.
173 for k,v in {'true': True, 'false': False, 'null': None}.items():
174 if s.startswith(k, i):
176 raiseError('Not a boolean (or null)', i)
# parseNumber: match the JSON number grammar; float when a fraction or
# exponent is present, int otherwise.
178 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
180 raiseError('Not a number', i)
182 if '.' in nums or 'e' in nums or 'E' in nums:
183 return (i+len(nums), float(nums))
184 return (i+len(nums), int(nums))
# Dispatch on the first character; anything else is treated as a number.
185 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
188 i,res = CHARMAP.get(s[i], parseNumber)(i)
# Trailing non-whitespace after the top-level value is an error.
189 i = skipSpace(i, False)
193 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
197 class IDParser(HTMLParser.HTMLParser):
198 """Modified HTMLParser that isolates a tag with the specified id"""
199 def __init__(self, id):
# Remember the id to search for; several initialisation lines
# (200-204) are elided from this listing.
205 self.watch_startpos = False
206 HTMLParser.HTMLParser.__init__(self)
208 def loads(self, html):
# Keep the raw document so get_result() can slice it using the
# line/column positions reported by getpos().
213 def handle_starttag(self, tag, attrs):
216 self.find_startpos(None)
# NOTE(review): HTMLParser passes `attrs` as a list of (name, value)
# pairs; the dict-style test below presumably relies on an elided
# `attrs = dict(attrs)` line -- confirm against the full source.
217 if 'id' in attrs and attrs['id'] == self.id:
220 self.watch_startpos = True
# Maintain a per-tag nesting depth so the matching close tag is found.
222 if not tag in self.depth: self.depth[tag] = 0
225 def handle_endtag(self, tag):
227 if tag in self.depth: self.depth[tag] -= 1
228 if self.depth[self.result[0]] == 0:
# Depth back to zero: record the end position of the wanted element.
230 self.result.append(self.getpos())
232 def find_startpos(self, x):
233 """Needed to put the start position of the result (self.result[1])
234 after the opening tag with the requested id"""
235 if self.watch_startpos:
236 self.watch_startpos = False
237 self.result.append(self.getpos())
# Any parser event fired right after the opening tag pins the start
# position, hence all callbacks alias find_startpos.
238 handle_entityref = handle_charref = handle_data = handle_comment = \
239 handle_decl = handle_pi = unknown_decl = find_startpos
241 def get_result(self):
# Slice the stored HTML between the recorded start/end positions
# (getpos() returns 1-based line numbers with 0-based columns).
242 if self.result == None: return None
243 if len(self.result) != 3: return None
244 lines = self.html.split('\n')
245 lines = lines[self.result[1][0]-1:self.result[2][0]]
246 lines[0] = lines[0][self.result[1][1]:]
# NOTE(review): the two assignments below overwrite each other as
# shown; presumably one belongs to an elided single-line special case
# (an `if` on original line 247) -- confirm against the full source.
248 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
249 lines[-1] = lines[-1][:self.result[2][1]]
250 return '\n'.join(lines).strip()
252 def get_element_by_id(id, html):
253 """Return the content of the tag with the specified id in the passed HTML document"""
254 parser = IDParser(id)
# The parser.loads(html) call (elided) drives the parse; malformed HTML
# is tolerated -- parse errors are swallowed and whatever was isolated
# so far is returned.
257 except HTMLParser.HTMLParseError:
259 return parser.get_result()
262 def preferredencoding():
263 """Get preferred encoding.
265 Returns the best encoding scheme for the system, based on
266 locale.getpreferredencoding() and some further tweaks.
# The generator-plus-.next() construction guards against
# locale.getpreferredencoding() raising on a misconfigured locale; the
# fallback encoding is on an elided line -- confirm against full source.
268 def yield_preferredencoding():
270 pref = locale.getpreferredencoding()
276 return yield_preferredencoding().next()
279 def htmlentity_transform(matchobj):
280 """Transforms an HTML entity to a Unicode character.
282 This function receives a match object and is intended to be used with
283 the re.sub() function.
285 entity = matchobj.group(1)
287 # Known non-numeric HTML entity
288 if entity in htmlentitydefs.name2codepoint:
289 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entity: "#160" (decimal) or "#x.." (hex).
# NOTE(review): the pattern only allows \d after the optional 'x', so a
# hex entity containing a-f (e.g. &#xe9;) will not match and falls
# through to the literal-representation return below -- likely a bug.
292 mobj = re.match(ur'(?u)#(x?\d+)', entity)
294 numstr = mobj.group(1)
295 if numstr.startswith(u'x'):
# Rewrite "x.." as "0x.." so long(numstr, base) can parse it; the base
# selection lines are elided from this listing.
297 numstr = u'0%s' % numstr
300 return unichr(long(numstr, base))
302 # Unknown entity in name, return its literal representation
303 return (u'&%s;' % entity)
306 def clean_html(html):
307 """Clean an HTML snippet into a readable string"""
# Newlines are normalised first so <br> tags become the only line breaks.
309 html = html.replace('\n', ' ')
310 html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
# Strip all remaining tags (non-greedy, so text between tags survives).
312 html = re.sub('<.*?>', '', html)
313 # Replace html entities
314 html = _unescapeHTML(html)
# The final strip()/return is on an elided line.
def sanitize_title(utitle):
    """Make a video title usable as part of a filename."""
    unescaped = _unescapeHTML(utitle)
    # os.sep would split the name into directories; map it to '%'.
    return unescaped.replace(unicode(os.sep), u'%')
324 def sanitize_open(filename, open_mode):
325 """Try to open the given filename, and slightly tweak it if this fails.
327 Attempts to open the given filename. If this fails, it tries to change
328 the filename slightly, step by step, until it's either able to open it
329 or it fails and raises a final exception, like the standard open()
332 It returns the tuple (stream, definitive_file_name).
# '-' means stdout; on Windows stdout must be switched to binary mode
# first so video data is not mangled by CRLF translation.
336 if sys.platform == 'win32':
338 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
339 return (sys.stdout, filename)
340 stream = open(_encodeFilename(filename), open_mode)
341 return (stream, filename)
342 except (IOError, OSError), err:
343 # In case of error, try to remove win32 forbidden chars
344 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
346 # An exception here should be caught in the caller
347 stream = open(_encodeFilename(filename), open_mode)
348 return (stream, filename)
351 def timeconvert(timestr):
352 """Convert RFC 2822 defined time string into system timestamp"""
354 timetuple = email.utils.parsedate_tz(timestr)
355 if timetuple is not None:
356 timestamp = email.utils.mktime_tz(timetuple)
# The `timestamp` default and the final return are on elided lines;
# presumably unparsable strings yield None -- confirm against full source.
359 def _simplify_title(title):
360 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
361 return expr.sub(u'_', title).strip(u'_')
363 def _orderedSet(iterable):
364 """ Remove all duplicates from the input iterable """
# Body (original lines 365-370) is elided from this listing; per the
# name it presumably preserves first-seen order -- confirm against the
# full source.
371 def _unescapeHTML(s):
373 @param s a string (of type unicode)
# Only unicode input is accepted; the assert makes misuse fail fast.
375 assert type(s) == type(u'')
# Each &name;/&#nnn; entity is decoded by htmlentity_transform; the
# final return is on an elided line.
377 result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
380 def _encodeFilename(s):
382 @param s The name of the file (of type unicode)
385 assert type(s) == type(u'')
387 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
388 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
389 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
390 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
# The `return s` for the Windows branch is on an elided line; elsewhere
# the name is encoded with the filesystem encoding, dropping
# unencodable characters.
393 return s.encode(sys.getfilesystemencoding(), 'ignore')
# --- Exception hierarchy ---------------------------------------------------
# Class bodies (pass lines, docstring closers) are elided from this listing.
395 class DownloadError(Exception):
396 """Download Error exception.
398 This exception may be thrown by FileDownloader objects if they are not
399 configured to continue on errors. They will contain the appropriate
405 class SameFileError(Exception):
406 """Same File exception.
408 This exception will be thrown by FileDownloader objects if they detect
409 multiple files would have to be downloaded to the same file on disk.
414 class PostProcessingError(Exception):
415 """Post Processing exception.
417 This exception may be raised by PostProcessor's .run() method to
418 indicate an error in the postprocessing task.
422 class MaxDownloadsReached(Exception):
423 """ --max-downloads limit has been reached. """
427 class UnavailableVideoError(Exception):
428 """Unavailable Format exception.
430 This exception will be thrown when a video is requested
431 in a format that is not available for that video.
436 class ContentTooShortError(Exception):
437 """Content Too Short exception.
439 This exception may be raised by FileDownloader objects when a file they
440 download is too small for what the server announced first, indicating
441 the connection was probably interrupted.
# Record both byte counts so callers can report expected vs. received.
447 def __init__(self, downloaded, expected):
448 self.downloaded = downloaded
449 self.expected = expected
452 class YoutubeDLHandler(urllib2.HTTPHandler):
453 """Handler for HTTP requests and responses.
455 This class, when installed with an OpenerDirector, automatically adds
456 the standard headers to every HTTP request and handles gzipped and
457 deflated responses from web servers. If compression is to be avoided in
458 a particular request, the original request in the program code only has
459 to include the HTTP header "Youtubedl-No-Compression", which will be
460 removed before making the real request.
462 Part of this code was copied from:
464 http://techknack.net/python-urllib2-handlers/
466 Andrew Rowls, the author of that code, agreed to release it to the
# deflate(): some servers send raw-deflate data without the zlib
# header; the two returns below are presumably a try/except pair
# (elided) that falls back from raw to zlib-wrapped decompression.
473 return zlib.decompress(data, -zlib.MAX_WBITS)
475 return zlib.decompress(data)
478 def addinfourl_wrapper(stream, headers, url, code):
# Older Pythons lack addinfourl.getcode(); emulate it when missing.
479 if hasattr(urllib2.addinfourl, 'getcode'):
480 return urllib2.addinfourl(stream, headers, url, code)
481 ret = urllib2.addinfourl(stream, headers, url)
485 def http_request(self, req):
# Add each standard header (the has_header guard is elided), then
# honour and strip the internal no-compression marker header.
486 for h in std_headers:
489 req.add_header(h, std_headers[h])
490 if 'Youtubedl-no-compression' in req.headers:
491 if 'Accept-encoding' in req.headers:
492 del req.headers['Accept-encoding']
493 del req.headers['Youtubedl-no-compression']
496 def http_response(self, req, resp):
# Transparently decompress gzip/deflate bodies, preserving the original
# response's headers, URL, code and msg.
499 if resp.headers.get('Content-encoding', '') == 'gzip':
500 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
501 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
502 resp.msg = old_resp.msg
504 if resp.headers.get('Content-encoding', '') == 'deflate':
505 gz = StringIO.StringIO(self.deflate(resp.read()))
506 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
507 resp.msg = old_resp.msg
511 class FileDownloader(object):
512 """File Downloader class.
514 File downloader objects are the ones responsible of downloading the
515 actual video file and writing it to disk if the user has requested
516 it, among some other tasks. In most cases there should be one per
517 program. As, given a video URL, the downloader doesn't know how to
518 extract all the needed information, task that InfoExtractors do, it
519 has to pass the URL to one of them.
521 For this, file downloader objects have a method that allows
522 InfoExtractors to be registered in a given order. When it is passed
523 a URL, the file downloader handles it to the first InfoExtractor it
524 finds that reports being able to handle it. The InfoExtractor extracts
525 all the information about the video or videos the URL refers to, and
526 asks the FileDownloader to process the video information, possibly
527 downloading the video.
529 File downloaders accept a lot of parameters. In order not to saturate
530 the object constructor with arguments, it receives a dictionary of
531 options instead. These options are available through the params
532 attribute for the InfoExtractors to use. The FileDownloader also
533 registers itself as the downloader in charge for the InfoExtractors
534 that are added to it, so this is a "mutual registration".
538 username: Username for authentication purposes.
539 password: Password for authentication purposes.
540 usenetrc: Use netrc for authentication instead.
541 quiet: Do not print messages to stdout.
542 forceurl: Force printing final URL.
543 forcetitle: Force printing title.
544 forcethumbnail: Force printing thumbnail URL.
545 forcedescription: Force printing description.
546 forcefilename: Force printing final filename.
547 simulate: Do not download the video files.
548 format: Video format code.
549 format_limit: Highest quality format to try.
550 outtmpl: Template for output names.
551 ignoreerrors: Do not stop on download errors.
552 ratelimit: Download speed limit, in bytes/sec.
553 nooverwrites: Prevent overwriting files.
554 retries: Number of times to retry for HTTP error 5xx
555 continuedl: Try to continue downloads if possible.
556 noprogress: Do not print the progress bar.
557 playliststart: Playlist item to start at.
558 playlistend: Playlist item to end at.
559 matchtitle: Download only matching titles.
560 rejecttitle: Reject downloads for matching titles.
561 logtostderr: Log messages to stderr instead of stdout.
562 consoletitle: Display progress in console window's titlebar.
563 nopart: Do not use temporary .part files.
564 updatetime: Use the Last-modified header to set output file timestamps.
565 writedescription: Write the video description to a .description file
566 writeinfojson: Write the video description to a .info.json file
567 writesubtitles: Write the video subtitles to a .srt file
568 subtitleslang: Language of the subtitles to download
# Class-level defaults; both are overwritten per-instance in __init__.
574 _download_retcode = None
575 _num_downloads = None
578 def __init__(self, params):
579 """Create a FileDownloader object with the given options."""
# (Initialisation of the IE/PP lists and self.params is elided.)
582 self._download_retcode = 0
583 self._num_downloads = 0
# Route status output to stderr when 'logtostderr' is set.
584 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# ---- Static helpers for progress formatting and rate control ----
588 def format_bytes(bytes):
# Human-readable byte count (e.g. '1.23M'); the exponent of 1024 picks
# the suffix.  Guard lines for None/zero are elided.
591 if type(bytes) is str:
596 exponent = long(math.log(bytes, 1024.0))
597 suffix = 'bkMGTPEZY'[exponent]
598 converted = float(bytes) / float(1024 ** exponent)
599 return '%.2f%s' % (converted, suffix)
602 def calc_percent(byte_counter, data_len):
# Right-aligned percentage string; the unknown-length guard is elided.
605 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
608 def calc_eta(start, now, total, current):
# Estimate remaining time from the average rate observed so far.
612 if current == 0 or dif < 0.001: # One millisecond
614 rate = float(current) / dif
615 eta = long((float(total) - float(current)) / rate)
616 (eta_mins, eta_secs) = divmod(eta, 60)
619 return '%02d:%02d' % (eta_mins, eta_secs)
622 def calc_speed(start, now, bytes):
624 if bytes == 0 or dif < 0.001: # One millisecond
625 return '%10s' % '---b/s'
626 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
629 def best_block_size(elapsed_time, bytes):
# Adapt the next read size to observed throughput, clamped to at most
# double / at least half the previous block and to a 4 MB ceiling.
630 new_min = max(bytes / 2.0, 1.0)
631 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
632 if elapsed_time < 0.001:
634 rate = bytes / elapsed_time
642 def parse_bytes(bytestr):
643 """Parse a string indicating a byte quantity into a long integer."""
644 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
# An empty suffix indexes to 0 ('b'), i.e. a multiplier of 1.
647 number = float(matchobj.group(1))
648 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
649 return long(round(number * multiplier))
651 def add_info_extractor(self, ie):
652 """Add an InfoExtractor object to the end of the list."""
# The list append is elided; registration is mutual -- the IE learns
# its downloader here.
654 ie.set_downloader(self)
656 def add_post_processor(self, pp):
657 """Add a PostProcessor object to the end of the chain."""
659 pp.set_downloader(self)
661 def to_screen(self, message, skip_eol=False):
662 """Print message to stdout if not in quiet mode."""
663 assert type(message) == type(u'')
664 if not self.params.get('quiet', False):
# skip_eol suppresses the trailing newline (used by progress updates).
665 terminator = [u'\n', u''][skip_eol]
666 output = message + terminator
668 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
669 output = output.encode(preferredencoding(), 'ignore')
670 self._screen_file.write(output)
671 self._screen_file.flush()
673 def to_stderr(self, message):
674 """Print message to stderr."""
# Python 2 print-chevron syntax; the message is encoded with the
# locale's preferred encoding before writing.
675 print >>sys.stderr, message.encode(preferredencoding())
677 def to_cons_title(self, message):
678 """Set console/terminal window title to message."""
679 if not self.params.get('consoletitle', False):
# (early return elided)
681 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
682 # c_wchar_p() might not be necessary if `message` is
683 # already of type unicode()
684 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
685 elif 'TERM' in os.environ:
# xterm title escape sequence: ESC ] 0 ; <title> BEL
686 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
688 def fixed_template(self):
689 """Checks if the output template is fixed."""
690 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
692 def trouble(self, message=None):
693 """Determine action to take when a download problem appears.
695 Depending on if the downloader has been configured to ignore
696 download errors or not, this method may throw an exception or
697 not when errors are found, after printing the message.
699 if message is not None:
700 self.to_stderr(message)
701 if not self.params.get('ignoreerrors', False):
702 raise DownloadError(message)
# With ignoreerrors set, record the failure in the exit code and go on.
703 self._download_retcode = 1
705 def slow_down(self, start_time, byte_counter):
706 """Sleep if the download speed is over the rate limit."""
707 rate_limit = self.params.get('ratelimit', None)
708 if rate_limit is None or byte_counter == 0:
# (early return elided)
711 elapsed = now - start_time
714 speed = float(byte_counter) / elapsed
715 if speed > rate_limit:
# Sleep exactly long enough that the average speed drops to the limit.
716 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
718 def temp_name(self, filename):
719 """Returns a temporary filename for the given filename."""
# No .part file for '-' (stdout), when nopart is set, or when the
# target exists but is not a regular file (e.g. a named pipe).
720 if self.params.get('nopart', False) or filename == u'-' or \
721 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
# (return of the unmodified filename is elided)
723 return filename + u'.part'
725 def undo_temp_name(self, filename):
# Strip a trailing '.part'; the pass-through return for other names is
# on an elided line.
726 if filename.endswith(u'.part'):
727 return filename[:-len(u'.part')]
730 def try_rename(self, old_filename, new_filename):
# Renaming onto itself is a no-op; failures are reported via trouble()
# rather than propagated.
732 if old_filename == new_filename:
734 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
735 except (IOError, OSError), err:
736 self.trouble(u'ERROR: unable to rename file')
738 def try_utime(self, filename, last_modified_hdr):
739 """Try to set the last-modified time of the given file."""
740 if last_modified_hdr is None:
742 if not os.path.isfile(_encodeFilename(filename)):
744 timestr = last_modified_hdr
# Parse the RFC 2822 date; on success stamp the file's mtime while
# keeping atime current.  Error-handling lines are elided.
747 filetime = timeconvert(timestr)
751 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Tell the user the .description file is being written."""
    msg = u'[info] Writing video description to: ' + descfn
    self.to_screen(msg)
def report_writesubtitles(self, srtfn):
    """Tell the user the .srt subtitles file is being written."""
    msg = u'[info] Writing video subtitles to: ' + srtfn
    self.to_screen(msg)
def report_writeinfojson(self, infofn):
    """Tell the user the .info.json metadata file is being written."""
    msg = u'[info] Video description metadata as JSON to: ' + infofn
    self.to_screen(msg)
def report_destination(self, filename):
    """Announce the destination filename."""
    msg = u'[download] Destination: ' + filename
    self.to_screen(msg)
772 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
773 """Report download progress."""
774 if self.params.get('noprogress', False):
# (early return elided)
# '\r' rewinds to the start of the line so the bar updates in place.
776 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
777 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
778 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
779 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Announce an attempt to resume the download at *resume_len*."""
    msg = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(msg)
def report_retry(self, count, retries):
    """Announce a retry after a server-side HTTP 5xx error."""
    msg = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(msg)
789 def report_file_already_downloaded(self, file_name):
790 """Report file has already been fully downloaded."""
# Fall back to a generic message when the filename cannot be encoded
# for the console (the try: line is elided).
792 self.to_screen(u'[download] %s has already been downloaded' % file_name)
793 except (UnicodeEncodeError), err:
794 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Announce that resuming the download was not possible."""
    self.to_screen(u'[download] Unable to resume')
800 def report_finish(self):
801 """Report download finished."""
# With noprogress the bar was never shown, so print a plain completion
# message; the alternate branch (closing the progress line) is elided.
802 if self.params.get('noprogress', False):
803 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Advance the ordinal that numbers each downloaded file."""
    self._num_downloads = self._num_downloads + 1
811 def prepare_filename(self, info_dict):
812 """Generate the output filename."""
814 template_dict = dict(info_dict)
# epoch/autonumber are synthetic template fields added at render time.
815 template_dict['epoch'] = unicode(long(time.time()))
816 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
817 filename = self.params['outtmpl'] % template_dict
819 except (ValueError, KeyError), err:
820 self.trouble(u'ERROR: invalid system charset or erroneous output template')
# (the return of the rendered filename, and of None on failure, is on
# elided lines)
823 def _match_entry(self, info_dict):
824 """ Returns None iff the file should be downloaded """
826 title = info_dict['title']
827 matchtitle = self.params.get('matchtitle', False)
828 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
829 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
830 rejecttitle = self.params.get('rejecttitle', False)
831 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
832 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
# NOTE(review): the matchtitle message carries a '[download] ' prefix
# but the rejecttitle one does not, yet process_info prepends
# '[download] ' to whatever is returned -- the first message ends up
# double-prefixed.  The final `return None` is on an elided line.
835 def process_info(self, info_dict):
836 """Process a single dictionary returned by an InfoExtractor."""
# Title filters first: skip the file entirely when a match/reject
# pattern applies.
838 reason = self._match_entry(info_dict)
839 if reason is not None:
840 self.to_screen(u'[download] ' + reason)
# (return elided)
843 max_downloads = self.params.get('max_downloads')
844 if max_downloads is not None:
845 if self._num_downloads > int(max_downloads):
846 raise MaxDownloadsReached()
848 filename = self.prepare_filename(info_dict)
# Forced printing of individual fields (--get-title, --get-url, ...).
851 if self.params.get('forcetitle', False):
852 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
853 if self.params.get('forceurl', False):
854 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
855 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
856 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
857 if self.params.get('forcedescription', False) and 'description' in info_dict:
858 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
859 if self.params.get('forcefilename', False) and filename is not None:
860 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
861 if self.params.get('forceformat', False):
862 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
864 # Do nothing else if in simulate mode
865 if self.params.get('simulate', False):
# Ensure the target directory exists before anything is written.
872 dn = os.path.dirname(_encodeFilename(filename))
873 if dn != '' and not os.path.exists(dn): # dn is already encoded
875 except (OSError, IOError), err:
876 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
879 if self.params.get('writedescription', False):
881 descfn = filename + u'.description'
882 self.report_writedescription(descfn)
883 descfile = open(_encodeFilename(descfn), 'wb')
885 descfile.write(info_dict['description'].encode('utf-8'))
888 except (OSError, IOError):
889 self.trouble(u'ERROR: Cannot write description file ' + descfn)
892 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
893 # subtitles download errors are already managed as troubles in relevant IE
894 # that way it will silently go on when used with unsupporting IE
896 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
897 self.report_writesubtitles(srtfn)
898 srtfile = open(_encodeFilename(srtfn), 'wb')
900 srtfile.write(info_dict['subtitles'].encode('utf-8'))
903 except (OSError, IOError):
# NOTE(review): this error message reuses `descfn` from the description
# branch instead of `srtfn`, so the wrong (or an unbound) filename is
# reported -- bug.
904 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
907 if self.params.get('writeinfojson', False):
908 infofn = filename + u'.info.json'
909 self.report_writeinfojson(infofn)
912 except (NameError,AttributeError):
913 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
916 infof = open(_encodeFilename(infofn), 'wb')
# 'urlhandle' holds a live connection object and cannot be serialised.
918 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
919 json.dump(json_info_dict, infof)
922 except (OSError, IOError):
923 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
926 if not self.params.get('skip_download', False):
927 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
931 success = self._do_download(filename, info_dict)
932 except (OSError, IOError), err:
933 raise UnavailableVideoError
934 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
935 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
937 except (ContentTooShortError, ), err:
938 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
# Run the postprocessor chain only after a successful download.
943 self.post_process(filename, info_dict)
944 except (PostProcessingError), err:
945 self.trouble(u'ERROR: postprocessing: %s' % str(err))
948 def download(self, url_list):
949 """Download a given list of URLs."""
# A fixed (placeholder-free) template cannot hold more than one file.
950 if len(url_list) > 1 and self.fixed_template():
951 raise SameFileError(self.params['outtmpl'])
# (the loops over url_list and self._ies are partly elided)
954 suitable_found = False
956 # Go to next InfoExtractor if not suitable
957 if not ie.suitable(url):
960 # Suitable InfoExtractor found
961 suitable_found = True
963 # Extract information from URL and process it
966 # Suitable InfoExtractor had been found; go to next URL
969 if not suitable_found:
970 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
972 return self._download_retcode
974 def post_process(self, filename, ie_info):
975 """Run the postprocessing chain on the given file."""
# (copying ie_info and the loop over the registered postprocessors are
# on elided lines)
977 info['filepath'] = filename
983 def _download_with_rtmpdump(self, filename, url, player_url):
# Delegate rtmp:// downloads to the external rtmpdump binary, retrying
# with -e (resume) until it stops making progress.
984 self.report_destination(filename)
985 tmpfilename = self.temp_name(filename)
987 # Check for rtmpdump first
989 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
990 except (OSError, IOError):
991 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
994 # Download using rtmpdump. rtmpdump returns exit code 2 when
995 # the connection was interrumpted and resuming appears to be
996 # possible. This is part of rtmpdump's normal usage, AFAIK.
997 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
998 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
999 if self.params.get('verbose', False):
1002 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
1005 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
1006 retval = subprocess.call(args)
1007 while retval == 2 or retval == 1:
1008 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
1009 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
1010 time.sleep(5.0) # This seems to be needed
1011 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
1012 cursize = os.path.getsize(_encodeFilename(tmpfilename))
# No forward progress between retries: stop (break line elided).
1013 if prevsize == cursize and retval == 1:
1015 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
1016 if prevsize == cursize and retval == 2 and cursize > 1024:
1017 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
1021 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
1022 self.try_rename(tmpfilename, filename)
1025 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
1028 def _do_download(self, filename, info_dict):
# Core HTTP download loop: resume support, retries on 5xx, adaptive
# block size, progress reporting and rate limiting.
1029 url = info_dict['url']
1030 player_url = info_dict.get('player_url', None)
1032 # Check file already present
1033 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
1034 self.report_file_already_downloaded(filename)
1037 # Attempt to download using rtmpdump
1038 if url.startswith('rtmp'):
1039 return self._download_with_rtmpdump(filename, url, player_url)
1041 tmpfilename = self.temp_name(filename)
1044 # Do not include the Accept-Encoding header
1045 headers = {'Youtubedl-no-compression': 'True'}
1046 basic_request = urllib2.Request(url, None, headers)
1047 request = urllib2.Request(url, None, headers)
1049 # Establish possible resume length
1050 if os.path.isfile(_encodeFilename(tmpfilename)):
1051 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
1057 if self.params.get('continuedl', False):
1058 self.report_resuming_byte(resume_len)
1059 request.add_header('Range','bytes=%d-' % resume_len)
1065 retries = self.params.get('retries', 0)
1066 while count <= retries:
1067 # Establish connection
1069 if count == 0 and 'urlhandle' in info_dict:
1070 data = info_dict['urlhandle']
# NOTE(review): an unconditional urlopen follows the urlhandle reuse;
# presumably an elided `else:` separates them -- confirm against the
# full source.
1071 data = urllib2.urlopen(request)
1073 except (urllib2.HTTPError, ), err:
1074 if (err.code < 500 or err.code >= 600) and err.code != 416:
1075 # Unexpected HTTP error
1077 elif err.code == 416:
1078 # Unable to resume (requested range not satisfiable)
1080 # Open the connection again without the range header
1081 data = urllib2.urlopen(basic_request)
1082 content_length = data.info()['Content-Length']
1083 except (urllib2.HTTPError, ), err:
1084 if err.code < 500 or err.code >= 600:
1087 # Examine the reported length
1088 if (content_length is not None and
1089 (resume_len - 100 < long(content_length) < resume_len + 100)):
1090 # The file had already been fully downloaded.
1091 # Explanation to the above condition: in issue #175 it was revealed that
1092 # YouTube sometimes adds or removes a few bytes from the end of the file,
1093 # changing the file size slightly and causing problems for some users. So
1094 # I decided to implement a suggested change and consider the file
1095 # completely downloaded if the file size differs less than 100 bytes from
1096 # the one in the hard drive.
1097 self.report_file_already_downloaded(filename)
1098 self.try_rename(tmpfilename, filename)
1101 # The length does not match, we start the download over
1102 self.report_unable_to_resume()
1107 if count <= retries:
1108 self.report_retry(count, retries)
1111 self.trouble(u'ERROR: giving up after %s retries' % retries)
# Content-Length plus what is already on disk gives the full size.
1114 data_len = data.info().get('Content-length', None)
1115 if data_len is not None:
1116 data_len = long(data_len) + resume_len
1117 data_len_str = self.format_bytes(data_len)
1118 byte_counter = 0 + resume_len
1122 # Download and write
1123 before = time.time()
1124 data_block = data.read(block_size)
1126 if len(data_block) == 0:
1128 byte_counter += len(data_block)
1130 # Open file just in time
1133 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1134 assert stream is not None
# sanitize_open may have altered the name; recompute the final one.
1135 filename = self.undo_temp_name(tmpfilename)
1136 self.report_destination(filename)
1137 except (OSError, IOError), err:
1138 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1141 stream.write(data_block)
1142 except (IOError, OSError), err:
1143 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1145 block_size = self.best_block_size(after - before, len(data_block))
# Progress display and throttling.
1148 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1149 if data_len is None:
1150 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1152 percent_str = self.calc_percent(byte_counter, data_len)
1153 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1154 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1157 self.slow_down(start, byte_counter - resume_len)
1160 self.trouble(u'\nERROR: Did not get any data blocks')
1163 self.report_finish()
1164 if data_len is not None and byte_counter != data_len:
1165 raise ContentTooShortError(byte_counter, long(data_len))
1166 self.try_rename(tmpfilename, filename)
1168 # Update file modification time
1169 if self.params.get('updatetime', True):
# NOTE(review): try_utime's visible body has no return statement, yet
# its result is stored as 'filetime' -- presumably the return is on an
# elided line; confirm against the full source.
1170 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    stitle:         Simplified title.
    ext:            Video filename extension.
    format:         Video format.
    player_url:     SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Subclasses define _VALID_URL; a non-None match means "this IE handles it".
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): upstream guards this call with a once-only flag; that
        # guard line is not visible here -- confirm before assuming idempotence.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Accepts youtu.be short links, /v/, /embed/, /e/ and watch URLs as well
    # as bare ids; group(2) carries the video id.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Forces the site into English so the scraping regexps below match.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'  # machine key for ~/.netrc credential lookup
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; NOTE(review): dict body appears truncated in this view.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display dimensions; NOTE(review): dict body appears truncated in this view.
    _video_dimensions = {
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        # Convert YouTube's timedtext XML into SubRip (.srt) text.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption length when dur attribute is absent
            start = float(start)
            end = start + float(dur)
            # hh:mm:ss,mmm timestamps as required by the SRT format
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = _unescapeHTML(caption)
            caption = _unescapeHTML(caption) # double cycle, intentional: the feed is double-escaped
            srt += str(n) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        # Dump the itag / extension / dimensions table for --list-formats.
        print 'Available formats:'
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (best-effort: failure only warns)
        request = urllib2.Request(self._LANG_URL)
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        if username is None:
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # The login form being present in the response means login failed.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped slashes before using the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' values, first response with a token wins
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = _simplify_title(video_title)

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date layouts
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description: video_description = clean_html(video_description.decode('utf8'))
        else: video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
            # Language preference: explicit option, then English, then first listed.
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                srt_lang = srt_lang_list[0]
            if not srt_lang in srt_lang_list:
                self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
            srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            self._downloader.trouble(u'WARNING: video has no closed captions')

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Process video information
            self._downloader.process_info({
                'id':       video_id.decode('utf-8'),
                'url':      video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':    video_title,
                'stitle':   simple_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        # Keeps a YoutubeIE so YouTube-hosted metacafe ids ('yt-...') can be delegated.
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age by posting the family-filter form
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate to the YouTube extractor for 'yt-<id>' videos.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        # First try the direct mediaURL form of the page...
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            video_url = mediaURL
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # ...otherwise fall back to the flashvars/mediaData JSON form.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'stitle':   simple_title,
            'ext':      video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'stitle':   simple_title,
            'ext':      video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fall back to the flash stream when no direct mp4 download_url exists.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Decode the literal backslash-x escapes the page embeds in the URL.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (requires a second search-page fetch)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        # Process video information
        self._downloader.process_info({
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'stitle':   simple_title,
            'ext':      video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1942 class PhotobucketIE(InfoExtractor):
1943 """Information extractor for photobucket.com."""
1945 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1946 IE_NAME = u'photobucket'
1948 def __init__(self, downloader=None):
1949 InfoExtractor.__init__(self, downloader)
1951 def report_download_webpage(self, video_id):
1952 """Report webpage download."""
1953 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1955 def report_extraction(self, video_id):
1956 """Report information extraction."""
1957 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1959 def _real_extract(self, url):
1960 # Extract id from URL
1961 mobj = re.match(self._VALID_URL, url)
1963 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1966 # At this point we have a new video
1967 self._downloader.increment_downloads()
1968 video_id = mobj.group(1)
1970 video_extension = 'flv'
1972 # Retrieve video webpage to extract further information
1973 request = urllib2.Request(url)
1975 self.report_download_webpage(video_id)
1976 webpage = urllib2.urlopen(request).read()
1977 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1978 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1981 # Extract URL, uploader, and title from webpage
1982 self.report_extraction(video_id)
1983 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1985 self._downloader.trouble(u'ERROR: unable to extract media URL')
1987 mediaURL = urllib.unquote(mobj.group(1))
1989 video_url = mediaURL
1991 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1993 self._downloader.trouble(u'ERROR: unable to extract title')
1995 video_title = mobj.group(1).decode('utf-8')
1996 video_title = sanitize_title(video_title)
1997 simple_title = _simplify_title(vide_title)
1999 video_uploader = mobj.group(2).decode('utf-8')
2002 # Process video information
2003 self._downloader.process_info({
2004 'id': video_id.decode('utf-8'),
2005 'url': video_url.decode('utf-8'),
2006 'uploader': video_uploader,
2007 'upload_date': u'NA',
2008 'title': video_title,
2009 'stitle': simple_title,
2010 'ext': video_extension.decode('utf-8'),
2014 except UnavailableVideoError:
2015 self._downloader.trouble(u'\nERROR: unable to download video')
2018 class YahooIE(InfoExtractor):
2019 """Information extractor for video.yahoo.com."""
2021 # _VALID_URL matches all Yahoo! Video URLs
2022 # _VPAGE_URL matches only the extractable '/watch/' URLs
2023 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
2024 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
2025 IE_NAME = u'video.yahoo'
2027 def __init__(self, downloader=None):
2028 InfoExtractor.__init__(self, downloader)
2030 def report_download_webpage(self, video_id):
2031 """Report webpage download."""
2032 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
2034 def report_extraction(self, video_id):
2035 """Report information extraction."""
2036 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Resolve *url* to a canonical video.yahoo.com /watch/ page, scrape the
        video metadata and playlist, and hand the result to the downloader.

        NOTE(review): several guard/`try:` lines are missing from this excerpt;
        the bare `trouble(...)` calls below were originally the error branches
        of `if mobj is None:` checks — confirm against the full file.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # Error branch: URL did not match _VALID_URL.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        # The second capture group of _VALID_URL is the video id.
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            # The page embeds numeric "id" and "vid" fields that together
            # identify the canonical /watch/<vid>/<id> URL.
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical URL; new_video=False marks the
            # recursive call so the video is not counted twice.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = _simplify_title(video_title)

        # NOTE(review): group(1) here is the (people|profile) alternation, not
        # the uploader name — group(2) holds the anchor text; likely a bug.
        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width; both are required query parameters
        # for the playlist request below.
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        # APP + FULLPATH concatenated form the full stream URL.
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = _unescapeHTML(video_url)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            # NOTE(review): 'thumbnail' appears twice in this literal; the
            # second (un-decoded) entry silently wins. Drop one of them.
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'thumbnail': video_thumbnail,
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        """Initialize the extractor, delegating setup to InfoExtractor."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Scrape a Vimeo page, parse its embedded config JSON, and hand the
        video metadata to the downloader.

        NOTE(review): several `if mobj is None:`/`try:` lines are elided in
        this excerpt; bare `trouble(...)` calls are the original error paths.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON by slicing the page between the
        # ' = {config:' and ',assets:' markers.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        video_title = config["video"]["title"]
        simple_title = _simplify_title(video_title)

        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description (HTML element with id="description").
        video_description = get_element_by_id("description", webpage)
        if video_description: video_description = clean_html(video_description.decode('utf8'))
        else: video_description = ''

        # Extract upload date
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # TODO bind to format param
        # Codecs are tried in preference order; the last match wins since the
        # loop does not break.
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                else: quality = 'sd'
        # Error branch: none of the known codecs were present.
        self._downloader.trouble(u'ERROR: no known codec found')

        # Build the play_redirect URL from the signature/timestamp pair.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, quality, video_codec.upper())

        # Process video information
        self._downloader.process_info({
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Initialize the extractor, delegating setup to InfoExtractor."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Best-effort extraction: look for an embedded media URL in an
        arbitrary page (JW Player flashvars, then file=/source= params).

        NOTE(review): guard lines (`if mobj is None:`, `try:`) are elided in
        this excerpt; bare `trouble(...)` calls are the original error paths.
        """
        # At this point we have a new video
        self._downloader.increment_downloads()

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): failure here concerns the uploader/domain, but the
        # message says "title" — likely a copy-paste slip.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Accepts "ytsearch:<query>", "ytsearchN:<query>" or "ytsearchall:<query>".
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, youtube_ie, downloader=None):
        """Keep a YoutubeIE instance to delegate the per-video extraction to."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the ytsearch prefix to determine how many results to fetch.

        NOTE(review): guard/`try:` lines are elided in this excerpt.
        """
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> cap; numeric -> that many.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Page through the GData API (50 ids per page) until n is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
            data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems caps the effective limit below the requested n.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Accepts "gvsearch:<query>", "gvsearchN:<query>" or "gvsearchall:<query>".
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, google_ie, downloader=None):
        """Keep a GoogleIE instance to delegate the per-video extraction to."""
        InfoExtractor.__init__(self, downloader)
        self._google_ie = google_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        """Parse the gvsearch prefix to determine how many results to fetch.

        NOTE(review): guard/`try:` lines are elided in this excerpt.
        """
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> cap; numeric -> that many.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Scrape result pages until n ids are collected or no next-page link.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        # No next-page marker: flush what we have and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    # Accepts "yvsearch:<query>", "yvsearchN:<query>" or "yvsearchall:<query>".
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, yahoo_ie, downloader=None):
        """Keep a YahooIE instance to delegate the per-video extraction to."""
        InfoExtractor.__init__(self, downloader)
        self._yahoo_ie = yahoo_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        """Parse the yvsearch prefix to determine how many results to fetch.

        NOTE(review): guard/`try:` lines are elided in this excerpt.
        """
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> cap; numeric -> that many.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # already_seen deduplicates ids across result pages.
        already_seen = set()
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        # No next-page marker: flush what we have and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    IE_NAME = u'youtube:playlist'

    def __init__(self, youtube_ie, downloader=None):
        """Keep a YoutubeIE instance to delegate the per-video extraction to."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect all video ids from a playlist and delegate each to YoutubeIE.

        NOTE(review): guard/`try:`/loop-setup lines are elided in this excerpt.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Group 3 present means a single-video link inside a playlist URL:
        # delegate straight to the video extractor.
        if mobj.group(3) is not None:
            self._youtube_ie.extract(mobj.group(3))

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        # Apply the user's --playlist-start/--playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each uploads query at 50 entries, so we page through.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, youtube_ie, downloader=None):
        """Keep a YoutubeIE instance to delegate the per-video extraction to."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect all upload ids for a user via the GData API and delegate
        each to YoutubeIE.

        NOTE(review): guard/`try:`/loop-setup lines are elided in this excerpt.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        # Apply the user's --playlist-start/--playlist-end window.
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        """Initialize the extractor, delegating setup to InfoExtractor."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Simulate pressing 'Free download' on a DepositFiles page and scrape
        the resulting direct file URL.

        NOTE(review): guard/`try:` lines are elided in this excerpt.
        """
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 mimics the button click).
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'stitle': file_title,
            'ext': file_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Formats in descending quality order; first available wins by default.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        """Initialize the extractor, delegating setup to InfoExtractor."""
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes keyed by the video_info field each one fills.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc credentials.

        NOTE(review): `try:`/`return` lines are elided in this excerpt.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # No credentials: skip login entirely (videos may still be public).
        if useremail is None:

        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # The login form being echoed back means authentication failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        """Fetch the video page, parse it with _parse_page, pick the requested
        format(s) and hand each to the downloader.

        NOTE(review): guard/`try:` lines are elided in this excerpt.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = _simplify_title(video_title)

        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        video_thumbnail = video_info['thumbnail']

        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
3091 class BlipTVIE(InfoExtractor):
3092 """Information extractor for blip.tv"""
# Matches any blip.tv URL on any subdomain; group(1) captures the path.
3094 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Captures the trailing lowercase filename extension of a media URL.
3095 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3096 IE_NAME = u'blip.tv'
3098 def report_extraction(self, file_id):
3099 """Report information extraction."""
3100 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3102 def report_direct_download(self, title):
3103 """Report that a direct (non-JSON) media download was detected."""
3104 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3106 def _real_extract(self, url):
3107 mobj = re.match(self._VALID_URL, url)
# URL did not match _VALID_URL; report and (presumably) bail out —
# the early-return line is on source not visible here.
3109 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for a JSON description of the post. 'cchar' is the '?'/'&'
# query separator, chosen on source lines not visible here — TODO confirm.
3116 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3117 request = urllib2.Request(json_url)
3118 self.report_extraction(mobj.group(1))
3121 urlh = urllib2.urlopen(request)
# A video/* Content-Type means the URL points at the media file itself:
# derive title/extension from the URL instead of parsing JSON.
3122 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3123 basename = url.split('/')[-1]
3124 title,ext = os.path.splitext(basename)
3125 title = title.decode('UTF-8')
3126 ext = ext.replace('.', '')
3127 self.report_direct_download(title)
3132 'stitle': _simplify_title(title),
3136 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3137 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# No direct download was detected above; parse the JSON response instead.
3139 if info is None: # Regular URL
3141 json_code = urlh.read()
3142 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3143 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3147 json_data = json.loads(json_code)
# The payload is either wrapped in a 'Post' object or is the post itself.
3148 if 'Post' in json_data:
3149 data = json_data['Post']
# blip.tv datestamp, e.g. '12-31-11 11:59PM', normalized to YYYYMMDD.
3153 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3154 video_url = data['media']['url']
# Determine the container extension from the media URL.
3155 umobj = re.match(self._URL_EXT, video_url)
3157 raise ValueError('Can not determine filename extension')
3158 ext = umobj.group(1)
3161 'id': data['item_id'],
3163 'uploader': data['display_name'],
3164 'upload_date': upload_date,
3165 'title': data['title'],
3166 'stitle': _simplify_title(data['title']),
3168 'format': data['media']['mimeType'],
3169 'thumbnail': data['thumbnailUrl'],
3170 'description': data['description'],
3171 'player_url': data['embedUrl']
# KeyError covers missing JSON fields; ValueError covers bad dates/extensions.
3173 except (ValueError,KeyError), err:
3174 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3177 self._downloader.increment_downloads()
3180 self._downloader.process_info(info)
3181 except UnavailableVideoError, err:
3182 self._downloader.trouble(u'\nERROR: unable to download video')
3185 class MyVideoIE(InfoExtractor):
3186 """Information Extractor for myvideo.de."""
3188 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3189 IE_NAME = u'myvideo'
3191 def __init__(self, downloader=None):
3192 InfoExtractor.__init__(self, downloader)
3194 def report_download_webpage(self, video_id):
3195 """Report webpage download."""
3196 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3198 def report_extraction(self, video_id):
3199 """Report information extraction."""
3200 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3202 def _real_extract(self,url):
3203 mobj = re.match(self._VALID_URL, url)
3205 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3208 video_id = mobj.group(1)
3211 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3213 self.report_download_webpage(video_id)
3214 webpage = urllib2.urlopen(request).read()
3215 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3216 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3219 self.report_extraction(video_id)
3220 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3223 self._downloader.trouble(u'ERROR: unable to extract media URL')
3225 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3227 mobj = re.search('<title>([^<]+)</title>', webpage)
3229 self._downloader.trouble(u'ERROR: unable to extract title')
3232 video_title = mobj.group(1)
3233 video_title = sanitize_title(video_title)
3235 simple_title = _simplify_title(video_title)
3238 self._downloader.process_info({
3242 'upload_date': u'NA',
3243 'title': video_title,
3244 'stitle': simple_title,
3249 except UnavailableVideoError:
3250 self._downloader.trouble(u'\nERROR: Unable to download video')
3252 class ComedyCentralIE(InfoExtractor):
3253 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a ':tds'/':colbert'-style shortname or a full-episodes URL.
# Named groups: 'shortname', 'showname', 'episode'.
3255 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3256 IE_NAME = u'comedycentral'
3258 def report_extraction(self, episode_id):
3259 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3261 def report_config_download(self, episode_id):
3262 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3264 def report_index_download(self, episode_id):
3265 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3267 def report_player_url(self, episode_id):
3268 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3270 def _real_extract(self, url):
3271 mobj = re.match(self._VALID_URL, url)
3273 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# A shortname maps to the show's "full episodes" landing page, which is
# then re-matched so the normal episode flow applies.
3276 if mobj.group('shortname'):
3277 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3278 url = u'http://www.thedailyshow.com/full-episodes/'
3280 url = u'http://www.colbertnation.com/full-episodes/'
3281 mobj = re.match(self._VALID_URL, url)
3282 assert mobj is not None
# An empty episode group means "download the newest episode".
3284 dlNewest = not mobj.group('episode')
3286 epTitle = mobj.group('showname')
3288 epTitle = mobj.group('episode')
3290 req = urllib2.Request(url)
3291 self.report_extraction(epTitle)
3293 htmlHandle = urllib2.urlopen(req)
3294 html = htmlHandle.read()
3295 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3296 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to a concrete episode; re-match the final URL
# to recover the episode slug.
3299 url = htmlHandle.geturl()
3300 mobj = re.match(self._VALID_URL, url)
3302 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3304 if mobj.group('episode') == '':
3305 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3307 epTitle = mobj.group('episode')
# The embedded Flash player URL carries an mtvnservices URI that identifies
# the episode's media (second capture group).
3309 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3310 if len(mMovieParams) == 0:
3311 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the raw player URL through redirects to get the final SWF location.
3314 playerUrl_raw = mMovieParams[0][0]
3315 self.report_player_url(epTitle)
3317 urlHandle = urllib2.urlopen(playerUrl_raw)
3318 playerUrl = urlHandle.geturl()
3319 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3320 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS index listing the episode's segments.
3323 uri = mMovieParams[0][1]
3324 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3325 self.report_index_download(epTitle)
3327 indexXml = urllib2.urlopen(indexUrl).read()
3328 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3329 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3332 idoc = xml.etree.ElementTree.fromstring(indexXml)
# Each <item> is one segment (act) of the episode; download them in order.
3333 itemEls = idoc.findall('.//item')
3334 for itemEl in itemEls:
3335 mediaId = itemEl.findall('./guid')[0].text
3336 shortMediaId = mediaId.split(':')[-1]
3337 showId = mediaId.split(':')[-2].replace('.com', '')
3338 officialTitle = itemEl.findall('./title')[0].text
3339 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment config XML lists the available renditions (bitrate -> src).
3341 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3342 urllib.urlencode({'uri': mediaId}))
3343 configReq = urllib2.Request(configUrl)
3344 self.report_config_download(epTitle)
3346 configXml = urllib2.urlopen(configReq).read()
3347 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3348 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3351 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, src) pairs; 'turls' is accumulated on source lines
# not visible here — presumably turls.append(finfo).
3353 for rendition in cdoc.findall('.//rendition'):
3354 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3358 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3361 # For now, just pick the highest bitrate
3362 format,video_url = turls[-1]
3364 self._downloader.increment_downloads()
3366 effTitle = showId + u'-' + epTitle
3371 'upload_date': officialDate,
3373 'stitle': _simplify_title(effTitle),
3377 'description': officialTitle,
3378 'player_url': playerUrl
3382 self._downloader.process_info(info)
3383 except UnavailableVideoError, err:
3384 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3388 class EscapistIE(InfoExtractor):
3389 """Information extractor for The Escapist """
# Named groups: 'showname' (series slug), 'episode' (episode slug).
3391 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3392 IE_NAME = u'escapist'
3394 def report_extraction(self, showName):
3395 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3397 def report_config_download(self, showName):
3398 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3400 def _real_extract(self, url):
3401 mobj = re.match(self._VALID_URL, url)
3403 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3405 showName = mobj.group('showname')
3406 videoId = mobj.group('episode')
3408 self.report_extraction(showName)
3410 webPage = urllib2.urlopen(url).read()
3411 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3412 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Description, thumbnail and player URL all come from <meta> tags.
3415 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3416 description = _unescapeHTML(descMatch.group(1))
3417 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3418 imgUrl = _unescapeHTML(imgMatch.group(1))
3419 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3420 playerUrl = _unescapeHTML(playerUrlMatch.group(1))
# The player URL carries a percent-encoded 'config=' query parameter
# pointing at the playlist configuration.
3421 configUrlMatch = re.search('config=(.*)$', playerUrl)
3422 configUrl = urllib2.unquote(configUrlMatch.group(1))
3424 self.report_config_download(showName)
3426 configJSON = urllib2.urlopen(configUrl).read()
3427 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3428 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3431 # Technically, it's JavaScript, not JSON
# Crude single- to double-quote conversion so json.loads accepts it;
# breaks if the config ever contains quoted apostrophes.
3432 configJSON = configJSON.replace("'", '"')
3435 config = json.loads(configJSON)
3436 except (ValueError,), err:
3437 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# Index 1 of the playlist holds the actual video entry — presumably index 0
# is an ad or intro; verify against a live config.
3440 playlist = config['playlist']
3441 videoUrl = playlist[1]['url']
3443 self._downloader.increment_downloads()
3447 'uploader': showName,
3448 'upload_date': None,
3450 'stitle': _simplify_title(showName),
3453 'thumbnail': imgUrl,
3454 'description': description,
3455 'player_url': playerUrl,
3459 self._downloader.process_info(info)
3460 except UnavailableVideoError, err:
3461 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3464 class CollegeHumorIE(InfoExtractor):
3465 """Information extractor for collegehumor.com"""
# Named groups: 'videoid' (numeric public id), 'shorttitle' (slug).
3467 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3468 IE_NAME = u'collegehumor'
3470 def report_webpage(self, video_id):
3471 """Report information extraction."""
3472 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3474 def report_extraction(self, video_id):
3475 """Report information extraction."""
3476 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3478 def _real_extract(self, url):
3479 mobj = re.match(self._VALID_URL, url)
3481 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3483 video_id = mobj.group('videoid')
3485 self.report_webpage(video_id)
3486 request = urllib2.Request(url)
3488 webpage = urllib2.urlopen(request).read()
3489 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3490 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an internal id ("video:<n>") distinct from the public id;
# the metadata XML endpoint is keyed on this internal id.
3493 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3495 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3497 internal_video_id = m.group('internalvideoid')
3501 'internal_id': internal_video_id,
3504 self.report_extraction(video_id)
# Fetch the moogaloop player's metadata XML for this internal id.
3505 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3507 metaXml = urllib2.urlopen(xmlUrl).read()
3508 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3509 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3512 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Populate info from the <video> node; IndexError from the [0] lookups is
# presumably caught on source lines not visible here.
3514 videoNode = mdoc.findall('./video')[0]
3515 info['description'] = videoNode.findall('./description')[0].text
3516 info['title'] = videoNode.findall('./caption')[0].text
3517 info['stitle'] = _simplify_title(info['title'])
3518 info['url'] = videoNode.findall('./file')[0].text
3519 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is taken from the media URL's suffix and doubles as 'format'.
3520 info['ext'] = info['url'].rpartition('.')[2]
3521 info['format'] = info['ext']
3523 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3526 self._downloader.increment_downloads()
3529 self._downloader.process_info(info)
3530 except UnavailableVideoError, err:
3531 self._downloader.trouble(u'\nERROR: unable to download video')
3534 class XVideosIE(InfoExtractor):
3535 """Information extractor for xvideos.com"""
# group(1) is the numeric video id following '/video' in the path.
3537 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3538 IE_NAME = u'xvideos'
3540 def report_webpage(self, video_id):
3541 """Report information extraction."""
3542 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3544 def report_extraction(self, video_id):
3545 """Report information extraction."""
3546 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3548 def _real_extract(self, url):
3549 mobj = re.match(self._VALID_URL, url)
3551 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3553 video_id = mobj.group(1).decode('utf-8')
3555 self.report_webpage(video_id)
# Re-fetch via the canonical URL built from the id.
3557 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3559 webpage = urllib2.urlopen(request).read()
3560 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3561 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3564 self.report_extraction(video_id)
# The media URL is a percent-encoded 'flv_url' parameter in the page.
3568 mobj = re.search(r'flv_url=(.+?)&', webpage)
3570 self._downloader.trouble(u'ERROR: unable to extract video url')
3572 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title comes from the <title> tag, minus the trailing " - XVID..." suffix.
3576 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3578 self._downloader.trouble(u'ERROR: unable to extract video title')
3580 video_title = mobj.group(1).decode('utf-8')
3583 # Extract video thumbnail
3584 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3586 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3588 video_thumbnail = mobj.group(1).decode('utf-8')
3592 self._downloader.increment_downloads()
3597 'upload_date': None,
3598 'title': video_title,
3599 'stitle': _simplify_title(video_title),
3602 'thumbnail': video_thumbnail,
3603 'description': None,
3608 self._downloader.process_info(info)
3609 except UnavailableVideoError, err:
3610 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3613 class SoundcloudIE(InfoExtractor):
3614 """Information extractor for soundcloud.com
3615 To access the media, the uid of the song and a stream token
3616 must be extracted from the page source and the script must make
3617 a request to media.soundcloud.com/crossdomain.xml. Then
3618 the media can be grabbed by requesting from an url composed
3619 of the stream token and uid
# group(1) is the uploader slug, group(2) the track slug.
3622 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3623 IE_NAME = u'soundcloud'
3625 def __init__(self, downloader=None):
3626 InfoExtractor.__init__(self, downloader)
3628 def report_webpage(self, video_id):
3629 """Report information extraction."""
3630 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3632 def report_extraction(self, video_id):
3633 """Report information extraction."""
3634 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3636 def _real_extract(self, url):
3637 mobj = re.match(self._VALID_URL, url)
3639 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3642 # extract uploader (which is in the url)
3643 uploader = mobj.group(1).decode('utf-8')
3644 # extract simple title (uploader + slug of song title)
3645 slug_title = mobj.group(2).decode('utf-8')
3646 simple_title = uploader + '-' + slug_title
3648 self.report_webpage('%s/%s' % (uploader, slug_title))
3650 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3652 webpage = urllib2.urlopen(request).read()
3653 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3654 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3657 self.report_extraction('%s/%s' % (uploader, slug_title))
3659 # extract uid and stream token that soundcloud hands out for access
3660 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3662 video_id = mobj.group(1)
3663 stream_token = mobj.group(2)
3665 # extract unsimplified title
3666 mobj = re.search('"title":"(.*?)",', webpage)
3668 title = mobj.group(1)
3670 # construct media url (with uid/token)
3671 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3672 mediaURL = mediaURL % (video_id, stream_token)
# Description is optional; fall back to a placeholder.
3675 description = u'No description available'
3676 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3678 description = mobj.group(1)
# Parse the human-readable date (e.g. 'December 31, 2011 23:59')
# into YYYYMMDD; failures are tolerated on lines not visible here.
3682 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3685 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3686 except Exception, e:
3689 # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): the Request is built here but the urlopen call is on source
# lines not visible in this view — verify against upstream.
3690 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3693 self._downloader.process_info({
3694 'id': video_id.decode('utf-8'),
3696 'uploader': uploader.decode('utf-8'),
3697 'upload_date': upload_date,
3698 'title': simple_title.decode('utf-8'),
3699 'stitle': simple_title.decode('utf-8'),
3703 'description': description.decode('utf-8')
3705 except UnavailableVideoError:
3706 self._downloader.trouble(u'\nERROR: unable to download video')
3709 class InfoQIE(InfoExtractor):
3710 """Information extractor for infoq.com"""
3712 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3715 def report_webpage(self, video_id):
3716 """Report information extraction."""
3717 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3719 def report_extraction(self, video_id):
3720 """Report information extraction."""
3721 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3723 def _real_extract(self, url):
3724 mobj = re.match(self._VALID_URL, url)
3726 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3729 self.report_webpage(url)
3731 request = urllib2.Request(url)
3733 webpage = urllib2.urlopen(request).read()
3734 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3735 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3738 self.report_extraction(url)
# 'jsclassref' holds a base64-encoded RTMP path; decode and append it to
# the fixed rtmpe stream root.
3742 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3744 self._downloader.trouble(u'ERROR: unable to extract video url')
3746 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3750 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3752 self._downloader.trouble(u'ERROR: unable to extract video title')
3754 video_title = mobj.group(1).decode('utf-8')
3756 # Extract description
3757 video_description = u'No description available.'
3758 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3759 if mobj is not None:
3760 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the last path component of the media URL;
# assumes exactly one '.' in the filename (split would fail otherwise).
3762 video_filename = video_url.split('/')[-1]
3763 video_id, extension = video_filename.split('.')
3765 self._downloader.increment_downloads()
3770 'upload_date': None,
3771 'title': video_title,
3772 'stitle': _simplify_title(video_title),
3774 'format': extension, # Extension is always(?) mp4, but seems to be flv
3776 'description': video_description,
3781 self._downloader.process_info(info)
3782 except UnavailableVideoError, err:
3783 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3785 class MixcloudIE(InfoExtractor):
3786 """Information extractor for www.mixcloud.com"""
# group(1) is the uploader slug, group(2) the cloudcast slug.
3787 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3788 IE_NAME = u'mixcloud'
3790 def __init__(self, downloader=None):
3791 InfoExtractor.__init__(self, downloader)
3793 def report_download_json(self, file_id):
3794 """Report JSON download."""
3795 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3797 def report_extraction(self, file_id):
3798 """Report information extraction."""
3799 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3801 def get_urls(self, jsonData, fmt, bitrate='best'):
3802 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a {bitrate: [urls]} mapping or a flat url list;
# the TypeError fallback below handles the flat case.
3805 bitrate_list = jsonData[fmt]
3806 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3807 bitrate = max(bitrate_list) # select highest
3809 url_list = jsonData[fmt][bitrate]
3810 except TypeError: # we have no bitrate info.
3811 url_list = jsonData[fmt]
3815 def check_urls(self, url_list):
3816 """Returns 1st active url from list"""
# Probe each candidate with a GET; network errors mean "try the next one".
3817 for url in url_list:
3819 urllib2.urlopen(url)
3821 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3826 def _print_formats(self, formats):
3827 print 'Available formats:'
3828 for fmt in formats.keys():
3829 for b in formats[fmt]:
3831 ext = formats[fmt][b][0]
3832 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
# Same flat-list fallback as in get_urls: no per-bitrate breakdown.
3833 except TypeError: # we have no bitrate info
3834 ext = formats[fmt][0]
3835 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3838 def _real_extract(self, url):
3839 mobj = re.match(self._VALID_URL, url)
3841 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3843 # extract uploader & filename from url
3844 uploader = mobj.group(1).decode('utf-8')
3845 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3847 # construct API request
# Rebuild '<uploader>/<slug>' from the URL tail for the cloudcast API.
3848 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3849 # retrieve .json file with links to files
3850 request = urllib2.Request(file_url)
3852 self.report_download_json(file_url)
3853 jsonData = urllib2.urlopen(request).read()
3854 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3855 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3859 json_data = json.loads(jsonData)
3860 player_url = json_data['player_swf_url']
3861 formats = dict(json_data['audio_formats'])
3863 req_format = self._downloader.params.get('format', None)
# --list-formats: print the table and stop (return is on lines not visible here).
3866 if self._downloader.params.get('listformats', None):
3867 self._print_formats(formats)
# Default/best: take the first format whose URL list yields a live URL.
3870 if req_format is None or req_format == 'best':
3871 for format_param in formats.keys():
3872 url_list = self.get_urls(formats, format_param)
3874 file_url = self.check_urls(url_list)
3875 if file_url is not None:
3878 if req_format not in formats.keys():
3879 self._downloader.trouble(u'ERROR: format is not available')
3882 url_list = self.get_urls(formats, req_format)
3883 file_url = self.check_urls(url_list)
3884 format_param = req_format
3887 self._downloader.increment_downloads()
3889 # Process file information
3890 self._downloader.process_info({
3891 'id': file_id.decode('utf-8'),
3892 'url': file_url.decode('utf-8'),
3893 'uploader': uploader.decode('utf-8'),
3894 'upload_date': u'NA',
3895 'title': json_data['name'],
3896 'stitle': _simplify_title(json_data['name']),
3897 'ext': file_url.split('.')[-1].decode('utf-8'),
3898 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3899 'thumbnail': json_data['thumbnail_url'],
3900 'description': json_data['description'],
3901 'player_url': player_url.decode('utf-8'),
3903 except UnavailableVideoError, err:
3904 self._downloader.trouble(u'ERROR: unable to download file')
3906 class StanfordOpenClassroomIE(InfoExtractor):
3907 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: site root, a course page ('course' group only), or a
# specific video ('course' + 'video' groups).
3909 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3910 IE_NAME = u'stanfordoc'
3912 def report_download_webpage(self, objid):
3913 """Report information extraction."""
3914 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3916 def report_extraction(self, video_id):
3917 """Report information extraction."""
3918 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3920 def _real_extract(self, url):
3921 mobj = re.match(self._VALID_URL, url)
3923 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: a single video — fetch its per-video XML metadata.
3926 if mobj.group('course') and mobj.group('video'): # A specific video
3927 course = mobj.group('course')
3928 video = mobj.group('video')
3930 'id': _simplify_title(course + '_' + video),
3933 self.report_extraction(info['id'])
3934 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3935 xmlUrl = baseUrl + video + '.xml'
3937 metaXml = urllib2.urlopen(xmlUrl).read()
3938 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3939 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3941 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# <videoFile> is relative to baseUrl; missing elements mean bad metadata.
3943 info['title'] = mdoc.findall('./title')[0].text
3944 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3946 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3948 info['stitle'] = _simplify_title(info['title'])
3949 info['ext'] = info['url'].rpartition('.')[2]
3950 info['format'] = info['ext']
3951 self._downloader.increment_downloads()
3953 self._downloader.process_info(info)
3954 except UnavailableVideoError, err:
3955 self._downloader.trouble(u'\nERROR: unable to download video')
# Case 2: a course page — collect its VideoPage links and recurse via
# self.extract on each reference entry.
3956 elif mobj.group('course'): # A course page
3957 course = mobj.group('course')
3959 'id': _simplify_title(course),
3963 self.report_download_webpage(info['id'])
3965 coursepage = urllib2.urlopen(url).read()
3966 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3967 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3970 m = re.search('<h1>([^<]+)</h1>', coursepage)
3972 info['title'] = _unescapeHTML(m.group(1))
3974 info['title'] = info['id']
3975 info['stitle'] = _simplify_title(info['title'])
3977 m = re.search('<description>([^<]+)</description>', coursepage)
3979 info['description'] = _unescapeHTML(m.group(1))
# _orderedSet keeps first occurrence order while deduplicating links.
3981 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3984 'type': 'reference',
3985 'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(vpage),
3989 for entry in info['list']:
3990 assert entry['type'] == 'reference'
3991 self.extract(entry['url'])
# Case 3: the site root — collect all CoursePage links and recurse.
3994 'id': 'Stanford OpenClassroom',
3998 self.report_download_webpage(info['id'])
3999 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
4001 rootpage = urllib2.urlopen(rootURL).read()
4002 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4003 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
4006 info['title'] = info['id']
4007 info['stitle'] = _simplify_title(info['title'])
4009 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
4012 'type': 'reference',
4013 'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(cpage),
4017 for entry in info['list']:
4018 assert entry['type'] == 'reference'
4019 self.extract(entry['url'])
4021 class MTVIE(InfoExtractor):
4022 """Information extractor for MTV.com"""
4024 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
4027 def report_webpage(self, video_id):
4028 """Report information extraction."""
4029 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
4031 def report_extraction(self, video_id):
4032 """Report information extraction."""
4033 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
4035 def _real_extract(self, url):
# Extract an MTV video: scrape song name and performer from the page's
# <meta> tags, then fetch the mediaGen XML playlist and pick a rendition.
# NOTE(review): this excerpt omits several original lines (e.g. the
# 'if mobj is None:' guards, 'try:' openers and 'return' statements between
# the numbered lines) -- consult the full file before changing control flow.
4036 mobj = re.match(self._VALID_URL, url)
4038 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Prepend a default scheme when the URL was given without one.
4040 if not mobj.group('proto'):
4041 url = 'http://' + url
4042 video_id = mobj.group('videoid')
4043 self.report_webpage(video_id)
4045 request = urllib2.Request(url)
4047 webpage = urllib2.urlopen(request).read()
4048 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4049 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# Song title and performer live in MTV-specific meta tags; the page bytes
# are decoded as ISO-8859-1 below.
4052 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4054 self._downloader.trouble(u'ERROR: unable to extract song name')
4056 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4057 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4059 self._downloader.trouble(u'ERROR: unable to extract performer')
4061 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4062 video_title = performer + ' - ' + song_name
4064 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4066 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4068 mtvn_uri = mobj.group(1)
4070 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4072 self._downloader.trouble(u'ERROR: unable to extract content id')
4074 content_id = mobj.group(1)
# mediaGen is MTV's server-side playlist generator; it returns XML with one
# <rendition> element per available quality level.
4076 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4077 self.report_extraction(video_id)
4078 request = urllib2.Request(videogen_url)
4080 metadataXml = urllib2.urlopen(request).read()
4081 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4082 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4085 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4086 renditions = mdoc.findall('.//rendition')
4088 # For now, always pick the highest quality.
4089 rendition = renditions[-1]
# Build a format label like 'mp4-640x480_800' from the rendition attributes.
4092 _,_,ext = rendition.attrib['type'].partition('/')
4093 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4094 video_url = rendition.find('./src').text
4096 self._downloader.trouble('Invalid rendition field.')
4099 self._downloader.increment_downloads()
# info dict handed to process_info(); the 'id'/'url'/'ext' entries are not
# visible in this excerpt.
4103 'uploader': performer,
4104 'title': video_title,
4105 'stitle': _simplify_title(video_title),
4111 self._downloader.process_info(info)
4112 except UnavailableVideoError, err:
4113 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
	"""Base class for download post-processors.

	Instances are registered on a downloader via its add_post_processor()
	method ("mutual registration", as with InfoExtractor objects).  After a
	successful download the downloader calls run() on each registered
	post-processor in turn, feeding each one the dictionary returned by the
	previous one.  A post-processor that returns None stops the chain.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach *downloader* to this post-processor."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		*information* is an InfoExtractor-style dictionary with an extra
		'filepath' key naming the downloaded file.  The return value is
		passed to the next post-processor in the chain; returning None
		stops the chain.  Implementations may raise PostProcessingError,
		which the downloader takes into account.

		The default implementation is a no-op that forwards *information*.
		"""
		return information
class AudioConversionError(Exception):
	"""Raised when the ffmpeg/ffprobe audio extraction step fails.

	Fixes two defects of the original:
	- it subclassed BaseException, so generic `except Exception` handlers
	  would not catch it (user-defined exceptions should derive from
	  Exception per the Python docs);
	- it never called the base initializer, leaving str(err) empty.
	Callers that catch AudioConversionError and read .message keep working.
	"""
	def __init__(self, message):
		Exception.__init__(self, message)
		# Explicit attribute kept for existing `e.message` consumers
		# (BaseException.message is deprecated/removed in newer Pythons).
		self.message = message
4166 class FFmpegExtractAudioPP(PostProcessor):
# Post-processor that turns a downloaded video into an audio-only file by
# shelling out to ffmpeg/ffprobe.
# NOTE(review): this excerpt is missing several original lines ('try:'
# openers, 'return's, and presumably a @staticmethod decorator before
# get_audio_codec) -- consult the full file before editing.
4168 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
# preferredcodec: 'best', 'aac', 'mp3', 'vorbis', 'm4a' or 'wav' (None => 'best');
# preferredquality: ffmpeg '-ab' bitrate spec (e.g. '128K') or None;
# keepvideo: keep the source video file after audio extraction.
4169 PostProcessor.__init__(self, downloader)
4170 if preferredcodec is None:
4171 preferredcodec = 'best'
4172 self._preferredcodec = preferredcodec
4173 self._preferredquality = preferredquality
4174 self._keepvideo = keepvideo
# Probe the file with ffprobe; yields the audio codec name, or None when
# it cannot be determined.
4177 def get_audio_codec(path):
4179 cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4180 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4181 output = handle.communicate()[0]
4182 if handle.wait() != 0:
4184 except (IOError, OSError):
# Scan ffprobe output: remember the last codec_name seen and use it once
# the surrounding stream turns out to be the audio stream.
4187 for line in output.split('\n'):
4188 if line.startswith('codec_name='):
4189 audio_codec = line.split('=')[1].strip()
4190 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Transcode 'path' into 'out_path' with the given codec/options; raises
# AudioConversionError on failure.
4195 def run_ffmpeg(path, out_path, codec, more_opts):
4199 acodec_opts = ['-acodec', codec]
4200 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4202 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4203 stdout,stderr = p.communicate()
4204 except (IOError, OSError):
4205 e = sys.exc_info()[1]
# errno 2 == ENOENT: the ffmpeg binary itself is missing.
4206 if isinstance(e, OSError) and e.errno == 2:
4207 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
4210 if p.returncode != 0:
# Surface only the last stderr line -- ffmpeg's actual failure reason.
4211 msg = stderr.strip().split('\n')[-1]
4212 raise AudioConversionError(msg)
4214 def run(self, information):
4215 path = information['filepath']
4217 filecodec = self.get_audio_codec(path)
4218 if filecodec is None:
4219 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Choose target codec/extension: copy or remux losslessly when the source
# codec already satisfies the request (or request is 'best'); otherwise
# transcode (lossy) using the table below.
4223 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4224 if self._preferredcodec == 'm4a' and filecodec == 'aac':
4225 # Lossless, but in another container
4227 extension = self._preferredcodec
4228 more_opts = ['-absf', 'aac_adtstoasc']
4229 elif filecodec in ['aac', 'mp3', 'vorbis']:
4230 # Lossless if possible
4232 extension = filecodec
4233 if filecodec == 'aac':
4234 more_opts = ['-f', 'adts']
4235 if filecodec == 'vorbis':
4239 acodec = 'libmp3lame'
4242 if self._preferredquality is not None:
4243 more_opts += ['-ab', self._preferredquality]
4245 # We convert the audio (lossy)
4246 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4247 extension = self._preferredcodec
4249 if self._preferredquality is not None:
4250 more_opts += ['-ab', self._preferredquality]
4251 if self._preferredcodec == 'aac':
4252 more_opts += ['-f', 'adts']
4253 if self._preferredcodec == 'm4a':
4254 more_opts += ['-absf', 'aac_adtstoasc']
4255 if self._preferredcodec == 'vorbis':
4257 if self._preferredcodec == 'wav':
4259 more_opts += ['-f', 'wav']
4261 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4262 new_path = prefix + sep + extension
4263 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4265 self.run_ffmpeg(path, new_path, acodec, more_opts)
4267 etype,e,tb = sys.exc_info()
4268 if isinstance(e, AudioConversionError):
4269 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4271 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4274 # Try to update the date time for extracted audio file.
4275 if information.get('filetime') is not None:
4277 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4279 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4281 if not self._keepvideo:
4283 os.remove(_encodeFilename(path))
4284 except (IOError, OSError):
4285 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point subsequent post-processors at the newly created audio file.
4288 information['filepath'] = new_path
4292 def updateSelf(downloader, filename):
4293 ''' Update the program file with the latest version from the repository '''
4294 # Note: downloader only used for options
# NOTE(review): excerpt is missing the 'try:' openers and 'return'/close
# lines between the numbered statements.
4295 if not os.access(filename, os.W_OK):
4296 sys.exit('ERROR: no write permissions on %s' % filename)
4298 downloader.to_screen(u'Updating to latest version...')
# Download the latest script and compare its embedded __version__ string
# against ours to avoid rewriting the file when already current.
4302 urlh = urllib.urlopen(UPDATE_URL)
4303 newcontent = urlh.read()
4305 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4306 if vmatch is not None and vmatch.group(1) == __version__:
4307 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4311 except (IOError, OSError), err:
4312 sys.exit('ERROR: unable to download latest version')
# Overwrite our own script file in place with the downloaded content.
4315 outf = open(filename, 'wb')
4317 outf.write(newcontent)
4320 except (IOError, OSError), err:
4321 sys.exit('ERROR: unable to overwrite current version')
4323 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4326 def _readOptions(filename_bytes):
# Read extra command-line options from a config file: each line is
# shlex-split (with '#' comments honoured) and the tokens accumulated.
# A missing file yields an empty list.
# NOTE(review): the 'try:'/'except', the accumulator initialisation and the
# final 'return res' are missing from this excerpt.
4328 optionf = open(filename_bytes)
4330 return [] # silently skip if file is not present
4334 res += shlex.split(l, comments=True)
4339 def _format_option_string(option):
4340 ''' ('-o', '--option') -> -o, --format METAVAR'''
# Render an optparse Option as '-s, --long METAVAR' for the help listing.
# NOTE(review): the 'opts = []' initialisation line is missing from this
# excerpt.
4344 if option._short_opts: opts.append(option._short_opts[0])
4345 if option._long_opts: opts.append(option._long_opts[0])
# Insert the ', ' separator only when both a short and long form exist.
4346 if len(opts) > 1: opts.insert(1, ', ')
4348 if option.takes_value(): opts.append(' %s' % option.metavar)
4350 return "".join(opts)
4352 def _find_term_columns():
# Best-effort terminal width: honour $COLUMNS, otherwise ask 'stty size'.
# NOTE(review): the $COLUMNS early-return, the 'try:' and the fallback
# 'return None' lines are missing from this excerpt.
4353 columns = os.environ.get('COLUMNS', None)
4358 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4359 out,err = sp.communicate()
# 'stty size' prints 'rows cols'; the second field is the width.
4360 return int(out.split()[1])
# NOTE(review): interior of parseOpts(); its 'def parseOpts():' header and
# several lines (the max_width default, the kw dict braces, some add_option
# calls and the filesystem option group registration) are missing from this
# excerpt.
4366 max_help_position = 80
4368 # No need to wrap help messages if we're on a wide console
4369 columns = _find_term_columns()
4370 if columns: max_width = columns
4372 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4373 fmt.format_option_strings = _format_option_string
4376 'version' : __version__,
4378 'usage' : '%prog [options] url [url...]',
4379 'conflict_handler' : 'resolve',
4382 parser = optparse.OptionParser(**kw)
# One OptionGroup per help-output section.
4385 general = optparse.OptionGroup(parser, 'General Options')
4386 selection = optparse.OptionGroup(parser, 'Video Selection')
4387 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4388 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4389 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4390 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4391 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4393 general.add_option('-h', '--help',
4394 action='help', help='print this help text and exit')
4395 general.add_option('-v', '--version',
4396 action='version', help='print program version and exit')
4397 general.add_option('-U', '--update',
4398 action='store_true', dest='update_self', help='update this program to latest version')
4399 general.add_option('-i', '--ignore-errors',
4400 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4401 general.add_option('-r', '--rate-limit',
4402 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4403 general.add_option('-R', '--retries',
4404 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4405 general.add_option('--dump-user-agent',
4406 action='store_true', dest='dump_user_agent',
4407 help='display the current browser identification', default=False)
4408 general.add_option('--list-extractors',
4409 action='store_true', dest='list_extractors',
4410 help='List all supported extractors and the URLs they would handle', default=False)
4412 selection.add_option('--playlist-start',
4413 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4414 selection.add_option('--playlist-end',
4415 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4416 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4417 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4418 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4420 authentication.add_option('-u', '--username',
4421 dest='username', metavar='USERNAME', help='account username')
4422 authentication.add_option('-p', '--password',
4423 dest='password', metavar='PASSWORD', help='account password')
4424 authentication.add_option('-n', '--netrc',
4425 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4428 video_format.add_option('-f', '--format',
4429 action='store', dest='format', metavar='FORMAT', help='video format code')
4430 video_format.add_option('--all-formats',
4431 action='store_const', dest='format', help='download all available video formats', const='all')
4432 video_format.add_option('--prefer-free-formats',
4433 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4434 video_format.add_option('--max-quality',
4435 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4436 video_format.add_option('-F', '--list-formats',
4437 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4438 video_format.add_option('--write-srt',
4439 action='store_true', dest='writesubtitles',
4440 help='write video closed captions to a .srt file (currently youtube only)', default=False)
4441 video_format.add_option('--srt-lang',
4442 action='store', dest='subtitleslang', metavar='LANG',
4443 help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4446 verbosity.add_option('-q', '--quiet',
4447 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4448 verbosity.add_option('-s', '--simulate',
4449 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4450 verbosity.add_option('--skip-download',
4451 action='store_true', dest='skip_download', help='do not download the video', default=False)
4452 verbosity.add_option('-g', '--get-url',
4453 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4454 verbosity.add_option('-e', '--get-title',
4455 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4456 verbosity.add_option('--get-thumbnail',
4457 action='store_true', dest='getthumbnail',
4458 help='simulate, quiet but print thumbnail URL', default=False)
4459 verbosity.add_option('--get-description',
4460 action='store_true', dest='getdescription',
4461 help='simulate, quiet but print video description', default=False)
4462 verbosity.add_option('--get-filename',
4463 action='store_true', dest='getfilename',
4464 help='simulate, quiet but print output filename', default=False)
4465 verbosity.add_option('--get-format',
4466 action='store_true', dest='getformat',
4467 help='simulate, quiet but print output format', default=False)
4468 verbosity.add_option('--no-progress',
4469 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4470 verbosity.add_option('--console-title',
4471 action='store_true', dest='consoletitle',
4472 help='display progress in console titlebar', default=False)
# Note: '-v' here resolves against the earlier '-v/--version' via the
# parser's 'resolve' conflict handler.
4473 verbosity.add_option('-v', '--verbose',
4474 action='store_true', dest='verbose', help='print various debugging information', default=False)
4477 filesystem.add_option('-t', '--title',
4478 action='store_true', dest='usetitle', help='use title in file name', default=False)
4479 filesystem.add_option('-l', '--literal',
4480 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4481 filesystem.add_option('-A', '--auto-number',
4482 action='store_true', dest='autonumber',
4483 help='number downloaded files starting from 00000', default=False)
4484 filesystem.add_option('-o', '--output',
4485 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4486 filesystem.add_option('-a', '--batch-file',
4487 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4488 filesystem.add_option('-w', '--no-overwrites',
4489 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4490 filesystem.add_option('-c', '--continue',
4491 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4492 filesystem.add_option('--no-continue',
4493 action='store_false', dest='continue_dl',
4494 help='do not resume partially downloaded files (restart from beginning)')
4495 filesystem.add_option('--cookies',
4496 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4497 filesystem.add_option('--no-part',
4498 action='store_true', dest='nopart', help='do not use .part files', default=False)
4499 filesystem.add_option('--no-mtime',
4500 action='store_false', dest='updatetime',
4501 help='do not use the Last-modified header to set the file modification time', default=True)
4502 filesystem.add_option('--write-description',
4503 action='store_true', dest='writedescription',
4504 help='write video description to a .description file', default=False)
4505 filesystem.add_option('--write-info-json',
4506 action='store_true', dest='writeinfojson',
4507 help='write video metadata to a .info.json file', default=False)
4510 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4511 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4512 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4513 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4514 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4515 help='ffmpeg audio bitrate specification, 128k by default')
4516 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4517 help='keeps the video file on disk after the post-processing; the video is erased by default')
4520 parser.add_option_group(general)
4521 parser.add_option_group(selection)
4522 parser.add_option_group(filesystem)
4523 parser.add_option_group(verbosity)
4524 parser.add_option_group(video_format)
4525 parser.add_option_group(authentication)
4526 parser.add_option_group(postproc)
# Config-file options: /etc/youtube-dl.conf, then the per-user file under
# $XDG_CONFIG_HOME (or ~/.config), then the actual command line (which wins).
4528 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4530 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4532 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4533 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4534 opts, args = parser.parse_args(argv)
4536 return parser, opts, args
4538 def gen_extractors():
4539 """ Return a list of an instance of every supported extractor.
4540 The order does matter; the first extractor matched is the one handling the URL.
# Shared base extractors are created once and handed to the dependent IEs
# (playlist/user/search variants reuse the same core extractor instance).
4542 youtube_ie = YoutubeIE()
4543 google_ie = GoogleIE()
4544 yahoo_ie = YahooIE()
# NOTE(review): the 'return [' line and many of the list entries (and the
# closing bracket) are missing from this excerpt.
4546 YoutubePlaylistIE(youtube_ie),
4547 YoutubeUserIE(youtube_ie),
4548 YoutubeSearchIE(youtube_ie),
4550 MetacafeIE(youtube_ie),
4553 GoogleSearchIE(google_ie),
4556 YahooSearchIE(yahoo_ie),
4569 StanfordOpenClassroomIE(),
# NOTE(review): interior of the main routine; its 'def' line and many
# guards ('try:' openers, 'sys.exit' calls, dict keys of the FileDownloader
# params) are missing from this excerpt -- consult the full file before
# editing control flow here.
4576 parser, opts, args = parseOpts()
4578 # Open appropriate CookieJar
4579 if opts.cookiefile is None:
4580 jar = cookielib.CookieJar()
4583 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4584 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4586 except (IOError, OSError), err:
4587 sys.exit(u'ERROR: unable to open cookie file')
4590 if opts.dump_user_agent:
4591 print std_headers['User-Agent']
4594 # Batch file verification
4596 if opts.batchfile is not None:
4598 if opts.batchfile == '-':
4601 batchfd = open(opts.batchfile, 'r')
4602 batchurls = batchfd.readlines()
4603 batchurls = [x.strip() for x in batchurls]
# Drop empty lines and comment lines starting with '#', '/' or ';'.
4604 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4606 sys.exit(u'ERROR: batch file could not be read')
4607 all_urls = batchurls + args
4609 # General configuration
4610 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4611 proxy_handler = urllib2.ProxyHandler()
4612 opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4613 urllib2.install_opener(opener)
4614 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4617 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4619 extractors = gen_extractors()
4621 if opts.list_extractors:
4622 for ie in extractors:
4624 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4625 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4626 for mu in matchedUrls:
4630 # Conflicting, missing and erroneous options
4631 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4632 parser.error(u'using .netrc conflicts with giving username/password')
4633 if opts.password is not None and opts.username is None:
4634 parser.error(u'account username missing')
4635 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4636 parser.error(u'using output template conflicts with using title, literal title or auto number')
4637 if opts.usetitle and opts.useliteral:
4638 parser.error(u'using title conflicts with using literal title')
4639 if opts.username is not None and opts.password is None:
4640 opts.password = getpass.getpass(u'Type account password and press return:')
4641 if opts.ratelimit is not None:
4642 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4643 if numeric_limit is None:
4644 parser.error(u'invalid rate limit specified')
4645 opts.ratelimit = numeric_limit
4646 if opts.retries is not None:
4648 opts.retries = long(opts.retries)
4649 except (TypeError, ValueError), err:
4650 parser.error(u'invalid retry count specified')
4652 opts.playliststart = int(opts.playliststart)
4653 if opts.playliststart <= 0:
4654 raise ValueError(u'Playlist start must be positive')
4655 except (TypeError, ValueError), err:
4656 parser.error(u'invalid playlist start number specified')
4658 opts.playlistend = int(opts.playlistend)
4659 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4660 raise ValueError(u'Playlist end must be greater than playlist start')
4661 except (TypeError, ValueError), err:
4662 parser.error(u'invalid playlist end number specified')
4663 if opts.extractaudio:
4664 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4665 parser.error(u'invalid audio format specified')
# Build the FileDownloader; any --get-* flag implies quiet + skip_download.
4668 fd = FileDownloader({
4669 'usenetrc': opts.usenetrc,
4670 'username': opts.username,
4671 'password': opts.password,
4672 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4673 'forceurl': opts.geturl,
4674 'forcetitle': opts.gettitle,
4675 'forcethumbnail': opts.getthumbnail,
4676 'forcedescription': opts.getdescription,
4677 'forcefilename': opts.getfilename,
4678 'forceformat': opts.getformat,
4679 'simulate': opts.simulate,
4680 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4681 'format': opts.format,
4682 'format_limit': opts.format_limit,
4683 'listformats': opts.listformats,
# Output template: explicit -o wins, then format/title/autonumber driven
# defaults, and finally plain '%(id)s.%(ext)s'.
4684 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4685 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4686 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4687 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4688 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4689 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4690 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4691 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4692 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4693 or u'%(id)s.%(ext)s'),
4694 'ignoreerrors': opts.ignoreerrors,
4695 'ratelimit': opts.ratelimit,
4696 'nooverwrites': opts.nooverwrites,
4697 'retries': opts.retries,
4698 'continuedl': opts.continue_dl,
4699 'noprogress': opts.noprogress,
4700 'playliststart': opts.playliststart,
4701 'playlistend': opts.playlistend,
4702 'logtostderr': opts.outtmpl == '-',
4703 'consoletitle': opts.consoletitle,
4704 'nopart': opts.nopart,
4705 'updatetime': opts.updatetime,
4706 'writedescription': opts.writedescription,
4707 'writeinfojson': opts.writeinfojson,
4708 'writesubtitles': opts.writesubtitles,
4709 'subtitleslang': opts.subtitleslang,
4710 'matchtitle': opts.matchtitle,
4711 'rejecttitle': opts.rejecttitle,
4712 'max_downloads': opts.max_downloads,
4713 'prefer_free_formats': opts.prefer_free_formats,
4714 'verbose': opts.verbose,
4716 for extractor in extractors:
4717 fd.add_info_extractor(extractor)
4720 if opts.extractaudio:
4721 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4724 if opts.update_self:
4725 updateSelf(fd, sys.argv[0])
4728 if len(all_urls) < 1:
4729 if not opts.update_self:
4730 parser.error(u'you must provide at least one URL')
4735 retcode = fd.download(all_urls)
4736 except MaxDownloadsReached:
4737 fd.to_screen(u'--max-download limit reached, aborting.')
4740 # Dump cookie jar if requested
4741 if opts.cookiefile is not None:
4744 except (IOError, OSError), err:
4745 sys.exit(u'ERROR: unable to save cookie jar')
4752 except DownloadError:
4754 except SameFileError:
4755 sys.exit(u'ERROR: fixed output name but more than one file to download')
4756 except KeyboardInterrupt:
4757 sys.exit(u'\nERROR: Interrupted by user')
4759 if __name__ == '__main__':
# Script entry point; the guarded body (the call into the main routine) is
# not visible in this excerpt.
4762 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: