_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import calendar
   5 import codecs
   6 import contextlib
   7 import ctypes
   8 import datetime
   9 import email.utils
  10 import errno
  11 import getpass
  12 import gzip
  13 import itertools
  14 import io
  15 import json
  16 import locale
  17 import math
  18 import os
  19 import pipes
  20 import platform
  21 import re
  22 import ssl
  23 import socket
  24 import struct
  25 import subprocess
  26 import sys
  27 import tempfile
  28 import traceback
  29 import xml.etree.ElementTree
  30 import zlib
  31
  32 try:
  33     import urllib.request as compat_urllib_request
  34 except ImportError: # Python 2
  35     import urllib2 as compat_urllib_request
  36
  37 try:
  38     import urllib.error as compat_urllib_error
  39 except ImportError: # Python 2
  40     import urllib2 as compat_urllib_error
  41
  42 try:
  43     import urllib.parse as compat_urllib_parse
  44 except ImportError: # Python 2
  45     import urllib as compat_urllib_parse
  46
  47 try:
  48     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  49 except ImportError: # Python 2
  50     from urlparse import urlparse as compat_urllib_parse_urlparse
  51
  52 try:
  53     import urllib.parse as compat_urlparse
  54 except ImportError: # Python 2
  55     import urlparse as compat_urlparse
  56
  57 try:
  58     import http.cookiejar as compat_cookiejar
  59 except ImportError: # Python 2
  60     import cookielib as compat_cookiejar
  61
  62 try:
  63     import html.entities as compat_html_entities
  64 except ImportError: # Python 2
  65     import htmlentitydefs as compat_html_entities
  66
  67 try:
  68     import html.parser as compat_html_parser
  69 except ImportError: # Python 2
  70     import HTMLParser as compat_html_parser
  71
  72 try:
  73     import http.client as compat_http_client
  74 except ImportError: # Python 2
  75     import httplib as compat_http_client
  76
  77 try:
  78     from urllib.error import HTTPError as compat_HTTPError
  79 except ImportError:  # Python 2
  80     from urllib2 import HTTPError as compat_HTTPError
  81
  82 try:
  83     from urllib.request import urlretrieve as compat_urlretrieve
  84 except ImportError:  # Python 2
  85     from urllib import urlretrieve as compat_urlretrieve
  86
  87
  88 try:
  89     from subprocess import DEVNULL
  90     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  91 except ImportError:
  92     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  93
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:
    # urllib.parse is not importable: provide a hand-written replacement.
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string with the characters they encode.

        Runs of consecutive %xx escapes are collected as bytes and decoded
        together with the given encoding and error handler, so that a
        multi-byte character spread over several escapes is decoded as one
        unit.  A '%' that is not followed by two hex digits is kept
        literally.  encoding/errors fall back to 'utf-8'/'replace' when
        passed as None.
        """
        if string == '':
            return string
        res = string.split('%')
        # No '%' in the input at all: nothing to decode.
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        # Everything before the first '%' is copied through unchanged.
        string = res[0]
        for item in res[1:]:
            try:
                # An empty item means two '%' in a row (or a trailing '%').
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: put the '%' back and keep the text as is.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
 131
 132
 133 try:
 134     from urllib.parse import parse_qs as compat_parse_qs
 135 except ImportError: # Python 2
 136     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
 137     # Python 2's version is apparently totally broken
 138
 139     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 140                 encoding='utf-8', errors='replace'):
 141         qs, _coerce_result = qs, unicode
 142         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 143         r = []
 144         for name_value in pairs:
 145             if not name_value and not strict_parsing:
 146                 continue
 147             nv = name_value.split('=', 1)
 148             if len(nv) != 2:
 149                 if strict_parsing:
 150                     raise ValueError("bad query field: %r" % (name_value,))
 151                 # Handle case of a control-name with no equal sign
 152                 if keep_blank_values:
 153                     nv.append('')
 154                 else:
 155                     continue
 156             if len(nv[1]) or keep_blank_values:
 157                 name = nv[0].replace('+', ' ')
 158                 name = compat_urllib_parse_unquote(
 159                     name, encoding=encoding, errors=errors)
 160                 name = _coerce_result(name)
 161                 value = nv[1].replace('+', ' ')
 162                 value = compat_urllib_parse_unquote(
 163                     value, encoding=encoding, errors=errors)
 164                 value = _coerce_result(value)
 165                 r.append((name, value))
 166         return r
 167
 168     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 169                 encoding='utf-8', errors='replace'):
 170         parsed_result = {}
 171         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 172                         encoding=encoding, errors=errors)
 173         for name, value in pairs:
 174             if name in parsed_result:
 175                 parsed_result[name].append(value)
 176             else:
 177                 parsed_result[name] = [value]
 178         return parsed_result
 179
# compat_str: the ``unicode`` builtin when that name exists, otherwise
# ``str`` (the NameError branch).
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: the ``unichr`` builtin when that name exists, otherwise
# ``chr`` (the NameError branch).
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 189
 190 try:
 191     from xml.etree.ElementTree import ParseError as compat_xml_parse_error
 192 except ImportError:  # Python 2.6
 193     from xml.parsers.expat import ExpatError as compat_xml_parse_error
 194
 195 try:
 196     from shlex import quote as shlex_quote
 197 except ImportError:  # Python < 3.3
 198     def shlex_quote(s):
 199         return "'" + s.replace("'", "'\"'\"'") + "'"
 200
 201
 202 def compat_ord(c):
 203     if type(c) is int: return c
 204     else: return ord(c)
 205
# Type of a compiled regular expression object, obtained by compiling an
# empty pattern (this is not clearly defined otherwise).
compiled_regex_type = type(re.compile(''))
 208
# Default HTTP request headers.  YoutubeDLHandler.http_request adds each of
# these to a request that does not already carry a header of that name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 216
 217 def preferredencoding():
 218     """Get preferred encoding.
 219
 220     Returns the best encoding scheme for the system, based on
 221     locale.getpreferredencoding() and some further tweaks.
 222     """
 223     try:
 224         pref = locale.getpreferredencoding()
 225         u'TEST'.encode(pref)
 226     except:
 227         pref = 'UTF-8'
 228
 229     return pref
 230
 231 if sys.version_info < (3,0):
 232     def compat_print(s):
 233         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 234 else:
 235     def compat_print(s):
 236         assert type(s) == type(u'')
 237         print(s)
 238
 239
 240 def write_json_file(obj, fn):
 241     """ Encode obj as JSON and write it to fn, atomically """
 242
 243     args = {
 244         'suffix': '.tmp',
 245         'prefix': os.path.basename(fn) + '.',
 246         'dir': os.path.dirname(fn),
 247         'delete': False,
 248     }
 249
 250     # In Python 2.x, json.dump expects a bytestream.
 251     # In Python 3.x, it writes to a character stream
 252     if sys.version_info < (3, 0):
 253         args['mode'] = 'wb'
 254     else:
 255         args.update({
 256             'mode': 'w',
 257             'encoding': 'utf-8',
 258         })
 259
 260     tf = tempfile.NamedTemporaryFile(**args)
 261
 262     try:
 263         with tf:
 264             json.dump(obj, tf)
 265         os.rename(tf.name, fn)
 266     except:
 267         try:
 268             os.remove(tf.name)
 269         except OSError:
 270             pass
 271         raise
 272
 273
 274 if sys.version_info >= (2, 7):
 275     def find_xpath_attr(node, xpath, key, val):
 276         """ Find the xpath xpath[@key=val] """
 277         assert re.match(r'^[a-zA-Z-]+$', key)
 278         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 279         expr = xpath + u"[@%s='%s']" % (key, val)
 280         return node.find(expr)
 281 else:
 282     def find_xpath_attr(node, xpath, key, val):
 283         for f in node.findall(xpath):
 284             if f.attrib.get(key) == val:
 285                 return f
 286         return None
 287
 288 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 289 # the namespace parameter
 290 def xpath_with_ns(path, ns_map):
 291     components = [c.split(':') for c in path.split('/')]
 292     replaced = []
 293     for c in components:
 294         if len(c) == 1:
 295             replaced.append(c[0])
 296         else:
 297             ns, tag = c
 298             replaced.append('{%s}%s' % (ns_map[ns], tag))
 299     return '/'.join(replaced)
 300
 301
 302 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 303 class BaseHTMLParser(compat_html_parser.HTMLParser):
 304     def __init(self):
 305         compat_html_parser.HTMLParser.__init__(self)
 306         self.html = None
 307
 308     def loads(self, html):
 309         self.html = html
 310         self.feed(html)
 311         self.close()
 312
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # None until a matching tag is seen; then [tag, start_pos, end_pos]
        # filled in step by step (positions come from getpos()).
        self.result = None
        # True while parsing inside the matched element.
        self.started = False
        # Per-tag-name count of tags opened minus tags closed since the match.
        self.depth = {}
        # True right after the match: the next parser event records the
        # start position.
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Try to recover from a parse error by skipping ahead one line.

        Raises HTMLParseError once more than 10 errors have been counted
        or when the error happens inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        """Detect the requested tag and track nesting depth while inside it."""
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is also an event that can mark the start
            # position of the content.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Track nesting depth; record the end position when the matched tag closes."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete once the count for the matched tag's
            # name drops back to zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other kind of parser event may be the first thing after the
    # opening tag, so all of them are routed to find_startpos.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end positions.

        Returns None if no matching tag was found or if the tag, its start
        position and its end position were not all recorded.
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() line numbers are 1-based; keep only the lines spanned by
        # the element.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        # Cut everything before the start column on the first line.
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line, which has already been cut
            # at the start column, so the end column shifts by that amount.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On Python older than 2.7.3, replace parse_endtag so that the literal text
# "</scr'+'ipt>" is skipped over instead of being handed to the stock
# HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 378
 379 def get_element_by_id(id, html):
 380     """Return the content of the tag with the specified ID in the passed HTML document"""
 381     return get_element_by_attribute("id", id, html)
 382
 383 def get_element_by_attribute(attribute, value, html):
 384     """Return the content of the tag with the specified attribute in the passed HTML document"""
 385     parser = AttrParser(attribute, value)
 386     try:
 387         parser.loads(html)
 388     except compat_html_parser.HTMLParseError:
 389         pass
 390     return parser.get_result()
 391
 392 class MetaParser(BaseHTMLParser):
 393     """
 394     Modified HTMLParser that isolates a meta tag with the specified name
 395     attribute.
 396     """
 397     def __init__(self, name):
 398         BaseHTMLParser.__init__(self)
 399         self.name = name
 400         self.content = None
 401         self.result = None
 402
 403     def handle_starttag(self, tag, attrs):
 404         if tag != 'meta':
 405             return
 406         attrs = dict(attrs)
 407         if attrs.get('name') == self.name:
 408             self.result = attrs.get('content')
 409
 410     def get_result(self):
 411         return self.result
 412
 413 def get_meta_content(name, html):
 414     """
 415     Return the content attribute from the meta tag with the given name attribute.
 416     """
 417     parser = MetaParser(name)
 418     try:
 419         parser.loads(html)
 420     except compat_html_parser.HTMLParseError:
 421         pass
 422     return parser.get_result()
 423
 424
 425 def clean_html(html):
 426     """Clean an HTML snippet into a readable string"""
 427     # Newline vs <br />
 428     html = html.replace('\n', ' ')
 429     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 430     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 431     # Strip html tags
 432     html = re.sub('<.*?>', '', html)
 433     # Replace html entities
 434     html = unescapeHTML(html)
 435     return html.strip()
 436
 437
 438 def sanitize_open(filename, open_mode):
 439     """Try to open the given filename, and slightly tweak it if this fails.
 440
 441     Attempts to open the given filename. If this fails, it tries to change
 442     the filename slightly, step by step, until it's either able to open it
 443     or it fails and raises a final exception, like the standard open()
 444     function.
 445
 446     It returns the tuple (stream, definitive_file_name).
 447     """
 448     try:
 449         if filename == u'-':
 450             if sys.platform == 'win32':
 451                 import msvcrt
 452                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 453             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 454         stream = open(encodeFilename(filename), open_mode)
 455         return (stream, filename)
 456     except (IOError, OSError) as err:
 457         if err.errno in (errno.EACCES,):
 458             raise
 459
 460         # In case of error, try to remove win32 forbidden chars
 461         alt_filename = os.path.join(
 462                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 463                         for path_part in os.path.split(filename)
 464                        )
 465         if alt_filename == filename:
 466             raise
 467         else:
 468             # An exception here should be caught in the caller
 469             stream = open(encodeFilename(filename), open_mode)
 470             return (stream, alt_filename)
 471
 472
 473 def timeconvert(timestr):
 474     """Convert RFC 2822 defined time string into system timestamp"""
 475     timestamp = None
 476     timetuple = email.utils.parsedate_tz(timestr)
 477     if timetuple is not None:
 478         timestamp = email.utils.mktime_tz(timetuple)
 479     return timestamp
 480
 481 def sanitize_filename(s, restricted=False, is_id=False):
 482     """Sanitizes a string so it could be used as part of a filename.
 483     If restricted is set, use a stricter subset of allowed characters.
 484     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 485     """
 486     def replace_insane(char):
 487         if char == '?' or ord(char) < 32 or ord(char) == 127:
 488             return ''
 489         elif char == '"':
 490             return '' if restricted else '\''
 491         elif char == ':':
 492             return '_-' if restricted else ' -'
 493         elif char in '\\/|*<>':
 494             return '_'
 495         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 496             return '_'
 497         if restricted and ord(char) > 127:
 498             return '_'
 499         return char
 500
 501     result = u''.join(map(replace_insane, s))
 502     if not is_id:
 503         while '__' in result:
 504             result = result.replace('__', '_')
 505         result = result.strip('_')
 506         # Common case of "Foreign band name - English song title"
 507         if restricted and result.startswith('-_'):
 508             result = result[2:]
 509         if not result:
 510             result = '_'
 511     return result
 512
 513 def orderedSet(iterable):
 514     """ Remove all duplicates from the input iterable """
 515     res = []
 516     for el in iterable:
 517         if el not in res:
 518             res.append(el)
 519     return res
 520
 521
 522 def _htmlentity_transform(entity):
 523     """Transforms an HTML entity to a character."""
 524     # Known non-numeric HTML entity
 525     if entity in compat_html_entities.name2codepoint:
 526         return compat_chr(compat_html_entities.name2codepoint[entity])
 527
 528     mobj = re.match(r'#(x?[0-9]+)', entity)
 529     if mobj is not None:
 530         numstr = mobj.group(1)
 531         if numstr.startswith(u'x'):
 532             base = 16
 533             numstr = u'0%s' % numstr
 534         else:
 535             base = 10
 536         return compat_chr(int(numstr, base))
 537
 538     # Unknown entity in name, return its literal representation
 539     return (u'&%s;' % entity)
 540
 541
 542 def unescapeHTML(s):
 543     if s is None:
 544         return None
 545     assert type(s) == compat_str
 546
 547     return re.sub(
 548         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 549
 550
 551 def encodeFilename(s, for_subprocess=False):
 552     """
 553     @param s The name of the file
 554     """
 555
 556     assert type(s) == compat_str
 557
 558     # Python 3 has a Unicode API
 559     if sys.version_info >= (3, 0):
 560         return s
 561
 562     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 563         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 564         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 565         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 566         if not for_subprocess:
 567             return s
 568         else:
 569             # For subprocess calls, encode with locale encoding
 570             # Refer to http://stackoverflow.com/a/9951851/35070
 571             encoding = preferredencoding()
 572     else:
 573         encoding = sys.getfilesystemencoding()
 574     if encoding is None:
 575         encoding = 'utf-8'
 576     return s.encode(encoding, 'ignore')
 577
 578
 579 def encodeArgument(s):
 580     if not isinstance(s, compat_str):
 581         # Legacy code that uses byte strings
 582         # Uncomment the following line after fixing all post processors
 583         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 584         s = s.decode('ascii')
 585     return encodeFilename(s, True)
 586
 587
 588 def decodeOption(optval):
 589     if optval is None:
 590         return optval
 591     if isinstance(optval, bytes):
 592         optval = optval.decode(preferredencoding())
 593
 594     assert isinstance(optval, compat_str)
 595     return optval
 596
 597 def formatSeconds(secs):
 598     if secs > 3600:
 599         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 600     elif secs > 60:
 601         return '%d:%02d' % (secs // 60, secs % 60)
 602     else:
 603         return '%d' % secs
 604
 605
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the HTTPS handler object used for HTTPS requests.

    On interpreters older than 3.2 a handler is returned whose connections
    wrap the socket with SSLv3 and retry with SSLv23 if that raises an
    SSLError; opts_no_check_certificate is not consulted on this path.
    Otherwise a standard HTTPSHandler is returned with an SSLv3 context
    whose certificate verification is disabled when
    opts_no_check_certificate is true and required when it is false.

    Extra keyword arguments are forwarded to the handler constructor.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            # HTTPS connection that tries SSLv3 first, then SSLv23.
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                # When a tunnel host is configured, set up the tunnel on the
                # plain socket before wrapping it.
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    # SSLv3 failed: retry with PROTOCOL_SSLv23.
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            # Handler that opens HTTPS URLs through HTTPSConnectionV3.
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 639
 640 class ExtractorError(Exception):
 641     """Error during info extraction."""
 642     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 643         """ tb, if given, is the original traceback (so that it can be printed out).
 644         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 645         """
 646
 647         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 648             expected = True
 649         if video_id is not None:
 650             msg = video_id + ': ' + msg
 651         if not expected:
 652             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 653         super(ExtractorError, self).__init__(msg)
 654
 655         self.traceback = tb
 656         self.exc_info = sys.exc_info()  # preserve original exception
 657         self.cause = cause
 658         self.video_id = video_id
 659
 660     def format_traceback(self):
 661         if self.traceback is None:
 662             return None
 663         return u''.join(traceback.format_tb(self.traceback))
 664
 665
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match

    A subclass of ExtractorError that adds no behaviour of its own.
    """
    pass
 669
 670
 671 class DownloadError(Exception):
 672     """Download Error exception.
 673
 674     This exception may be thrown by FileDownloader objects if they are not
 675     configured to continue on errors. They will contain the appropriate
 676     error message.
 677     """
 678     def __init__(self, msg, exc_info=None):
 679         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 680         super(DownloadError, self).__init__(msg)
 681         self.exc_info = exc_info
 682
 683
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    It adds no attributes or behaviour to Exception.
    """
    pass
 691
 692
 693 class PostProcessingError(Exception):
 694     """Post Processing exception.
 695
 696     This exception may be raised by PostProcessor's .run() method to
 697     indicate an error in the postprocessing task.
 698     """
 699     def __init__(self, msg):
 700         self.msg = msg
 701
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached.

    Adds no attributes or behaviour to Exception.
    """
    pass
 705
 706
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    ExtractorError treats an error created while this exception is being
    handled as an expected one.
    """
    pass
 714
 715
 716 class ContentTooShortError(Exception):
 717     """Content Too Short exception.
 718
 719     This exception may be raised by FileDownloader objects when a file they
 720     download is too small for what the server announced first, indicating
 721     the connection was probably interrupted.
 722     """
 723     # Both in bytes
 724     downloaded = None
 725     expected = None
 726
 727     def __init__(self, downloaded, expected):
 728         self.downloaded = downloaded
 729         self.expected = expected
 730
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Likewise, a "Youtubedl-user-agent" header replaces the User-agent
    header of the request and is removed before the request is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first, then a zlib-wrapped one."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response carrying the given status code.

        When addinfourl has no getcode attribute, the object is built
        without the code argument and the code is set as an attribute
        afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and apply the Youtubedl-* control headers."""
        # Only fill in headers the request does not define itself.
        for h, v in std_headers.items():
            if h not in req.headers:
                req.add_header(h, v)
        # Control header: do not advertise compression support.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Control header: override the user agent for this request.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return resp with a gzip- or deflate-encoded body decompressed.

        The replacement response reuses the headers, URL, status code and
        msg of the original response.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts succeeds, report the very first error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment.
    https_request = http_request
    https_response = http_response
 810
 811
 812 def parse_iso8601(date_str, delimiter='T'):
 813     """ Return a UNIX timestamp from the given date """
 814
 815     if date_str is None:
 816         return None
 817
 818     m = re.search(
 819         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
 820         date_str)
 821     if not m:
 822         timezone = datetime.timedelta()
 823     else:
 824         date_str = date_str[:-len(m.group(0))]
 825         if not m.group('sign'):
 826             timezone = datetime.timedelta()
 827         else:
 828             sign = 1 if m.group('sign') == '+' else -1
 829             timezone = datetime.timedelta(
 830                 hours=sign * int(m.group('hours')),
 831                 minutes=sign * int(m.group('minutes')))
 832     date_format =  '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 833     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 834     return calendar.timegm(dt.timetuple())
 835
 836
 837 def unified_strdate(date_str):
 838     """Return a string with the date in the format YYYYMMDD"""
 839
 840     if date_str is None:
 841         return None
 842
 843     upload_date = None
 844     #Replace commas
 845     date_str = date_str.replace(',', ' ')
 846     # %z (UTC offset) is only supported in python>=3.2
 847     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 848     format_expressions = [
 849         '%d %B %Y',
 850         '%d %b %Y',
 851         '%B %d %Y',
 852         '%b %d %Y',
 853         '%b %dst %Y %I:%M%p',
 854         '%b %dnd %Y %I:%M%p',
 855         '%b %dth %Y %I:%M%p',
 856         '%Y-%m-%d',
 857         '%Y/%m/%d',
 858         '%d.%m.%Y',
 859         '%d/%m/%Y',
 860         '%d/%m/%y',
 861         '%Y/%m/%d %H:%M:%S',
 862         '%Y-%m-%d %H:%M:%S',
 863         '%d.%m.%Y %H:%M',
 864         '%d.%m.%Y %H.%M',
 865         '%Y-%m-%dT%H:%M:%SZ',
 866         '%Y-%m-%dT%H:%M:%S.%fZ',
 867         '%Y-%m-%dT%H:%M:%S.%f0Z',
 868         '%Y-%m-%dT%H:%M:%S',
 869         '%Y-%m-%dT%H:%M:%S.%f',
 870         '%Y-%m-%dT%H:%M',
 871     ]
 872     for expression in format_expressions:
 873         try:
 874             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 875         except ValueError:
 876             pass
 877     if upload_date is None:
 878         timetuple = email.utils.parsedate_tz(date_str)
 879         if timetuple:
 880             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 881     return upload_date
 882
 883 def determine_ext(url, default_ext=u'unknown_video'):
 884     if url is None:
 885         return default_ext
 886     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 887     if re.match(r'^[A-Za-z0-9]+$', guess):
 888         return guess
 889     else:
 890         return default_ext
 891
 892 def subtitles_filename(filename, sub_lang, sub_format):
 893     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 894
 895 def date_from_str(date_str):
 896     """
 897     Return a datetime object from a string in the format YYYYMMDD or
 898     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 899     today = datetime.date.today()
 900     if date_str == 'now'or date_str == 'today':
 901         return today
 902     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 903     if match is not None:
 904         sign = match.group('sign')
 905         time = int(match.group('time'))
 906         if sign == '-':
 907             time = -time
 908         unit = match.group('unit')
 909         #A bad aproximation?
 910         if unit == 'month':
 911             unit = 'day'
 912             time *= 30
 913         elif unit == 'year':
 914             unit = 'day'
 915             time *= 365
 916         unit += 's'
 917         delta = datetime.timedelta(**{unit: time})
 918         return today + delta
 919     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 920
 921 def hyphenate_date(date_str):
 922     """
 923     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 924     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 925     if match is not None:
 926         return '-'.join(match.groups())
 927     else:
 928         return date_str
 929
 930 class DateRange(object):
 931     """Represents a time interval between two dates"""
 932     def __init__(self, start=None, end=None):
 933         """start and end must be strings in the format accepted by date"""
 934         if start is not None:
 935             self.start = date_from_str(start)
 936         else:
 937             self.start = datetime.datetime.min.date()
 938         if end is not None:
 939             self.end = date_from_str(end)
 940         else:
 941             self.end = datetime.datetime.max.date()
 942         if self.start > self.end:
 943             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 944     @classmethod
 945     def day(cls, day):
 946         """Returns a range that only contains the given day"""
 947         return cls(day,day)
 948     def __contains__(self, date):
 949         """Check if the date is in the range"""
 950         if not isinstance(date, datetime.date):
 951             date = date_from_str(date)
 952         return self.start <= date <= self.end
 953     def __str__(self):
 954         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 955
 956
 957 def platform_name():
 958     """ Returns the platform name as a compat_str """
 959     res = platform.platform()
 960     if isinstance(res, bytes):
 961         res = res.decode(preferredencoding())
 962
 963     assert isinstance(res, compat_str)
 964     return res
 965
 966
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the stream it is destined for.  The
    special method is the WriteConsoleW function of kernel32, which is only
    used if out is file descriptor 1 or 2 and the matching standard handle
    is an actual console.  Raises OSError if WriteConsoleW reports a
    failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (the Win32 values of STD_OUTPUT_HANDLE and STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        # Neither stdout nor stderr, let the caller write normally
        return False

    # Build callable prototypes for the kernel32 functions needed below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for a missing/invalid handle, for anything that is not a
        # character device (the remote flag is ignored) and for handles
        # GetConsoleMode fails on
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        # Redirected output (file, pipe, ...) is left to the caller
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at once and stop in front of the
        # next non-BMP character; count is 0 if s starts with such a one
        count = min(next_nonbmp_pos(s), 1024)

        # A leading non-BMP character is written on its own with a length
        # of 2 (code units)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after what has actually been written
            s = s[written.value:]
    return True
1037
1038
def write_string(s, out=None, encoding=None):
    """ Write the text s to out (sys.stderr by default) and flush it

    If encoding is not given, the encoding of out or the preferred encoding
    is used where bytes have to be written; characters that cannot be
    encoded are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    if 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3:
        # Binary stream (Python 2 lies about mode of sys.stderr)
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream on top of a binary one: encode ourselves in order to
        # be able to ignore unencodable characters
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1059
1060
def bytes_to_intlist(bs):
    """ Convert a byte string to the list of its byte values """
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 and one-character strings on
    # Python 2
    elements_are_ints = isinstance(bs[0], int)
    return list(bs) if elements_are_ints else [ord(c) for c in bs]
1068
1069
def intlist_to_bytes(xs):
    """ Convert a list of byte values to a byte string """
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already yields byte strings
    return ''.join([chr(x) for x in xs])
1077
1078
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f), which
# lock (shared or exclusive) and unlock the open file f.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # Structure handed to LockFileEx/UnlockFileEx; its Offset fields give the
    # position at which the locked region starts
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high half of the number of bytes to (un)lock, large enough to
    # cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Lock the region starting at offset 0
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object: _unlock_file passes the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 asks for an exclusive lock, 0x0 for a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Only files locked by _lock_file carry the OVERLAPPED pointer
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # flock() based lock, exclusive or shared
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
1142
1143
class locked_file(object):
    """ Context manager around a file that is locked while the with block
    is active

    The file is opened right away by the constructor; entering the context
    takes a shared lock for mode 'r' and an exclusive lock for 'a' and 'w',
    leaving it unlocks and closes the file.  Iteration, read() and write()
    are forwarded to the underlying file object.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file raises OSError, which
            # is not an IOError on Python 2. Do not leak the open file in
            # either case.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1173
1174
def shell_quote(args):
    """ Return the arguments as a single, shell-escaped command line

    args is an iterable of strings; byte strings (e.g. file names encoded
    with 'encodeFilename') are decoded with the file system encoding first.
    """
    # pipes.quote is undocumented and the pipes module is gone in recent
    # Python versions, so prefer the official shlex.quote where it exists
    try:
        from shlex import quote
    except ImportError:  # Python < 3.3
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1186
1187
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but the first element for which pred is
        false is yielded as well before the iteration stops """
    for element in seq:
        yield element
        if pred(element):
            continue
        break
1195
1196
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to the URL as a fragment. """

    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return url + u'#' + compat_urllib_parse.urlencode(payload)
1203
1204
def unsmuggle_url(smug_url, default=None):
    """ Split a URL into the actual URL and the data smuggled in its fragment

    Returns a tuple (url, data); URLs without smuggled data are returned
    unchanged together with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    serialized = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(serialized)
1212
1213
def format_bytes(bytes):
    """ Return a size in bytes as a human-readable string like '1.50KiB'

    bytes may be a number, a str holding a number or None (yields 'N/A').
    Raises ValueError for negative sizes.
    """
    SUFFIXES = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Stay within the known units: tiny fractions would otherwise pick
        # a suffix from the end of the list (negative index) and huge values
        # would run past it
        exponent = max(0, min(exponent, len(SUFFIXES) - 1))
    suffix = SUFFIXES[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1226
1227
def get_term_width():
    """ Return the width of the terminal in columns, None if unknown

    The COLUMNS environment variable takes precedence; otherwise the
    output of `stty size` is consulted.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Not a number, ask the terminal instead

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except (OSError, ValueError, IndexError):
        # stty is missing or could not be run (OSError) or did not print
        # "<rows> <columns>", e.g. because there is no terminal.
        # Deliberately not a bare except: Ctrl+C must not be swallowed.
        pass
    return None
1242
1243
def month_by_name(name):
    """ Return the number (1-12) of a month by its (locale-independent)
    English name, None for unknown names """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, english_name in enumerate(ENGLISH_NAMES, 1):
        if english_name == name:
            return number
    return None
1254
1255
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML"""
    # An ampersand that is not followed by one of the predefined entity
    # names or a numeric character reference
    stray_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(stray_ampersand, u'&amp;', xml_str)
1262
1263
def setproctitle(title):
    """ Set the name of the current process to title (a compat_str)

    This works via prctl() of libc.so.6; where that library or function is
    not available, nothing happens.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes makes it one item larger than
    # the title, so that the string prctl reads is NUL-terminated. A buffer
    # of exactly len(title_bytes) has no room for the terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1277
1278
def remove_start(s, start):
    """ Return s without the prefix start (s itself if it has no such prefix) """
    return s[len(start):] if s.startswith(start) else s
1283
1284
def remove_end(s, end):
    """ Return s without the suffix end (s itself if it has no such suffix) """
    # An empty suffix must be special-cased: s[:-0] is the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1289
1290
def url_basename(url):
    """ Return the last component of the path of a URL """
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed_path.rsplit(u'/', 1)[-1]
1294
1295
class HEADRequest(compat_urllib_request.Request):
    """ A request that uses the HTTP method HEAD """

    def get_method(self):
        """ Report HEAD as the method of this request, unconditionally """
        return "HEAD"
1299
1300
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, multiplied by invscale and floor-divided by scale

    If get_attr is given, the attribute of that name of v is converted
    instead.  default is returned for None and the empty string.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1308
1309
def str_or_none(v, default=None):
    """ Convert v to a compat_str, default is returned for None """
    if v is None:
        return default
    return compat_str(v)
1312
1313
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    in the string are ignored """
    if int_str is None:
        return None
    cleaned = re.sub(r'[,\.\+]', u'', int_str)
    return int(cleaned)
1320
1321
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, multiplied by invscale and divided by scale;
    default is returned for None """
    if v is None:
        return default
    return float(v) * invscale / scale
1324
1325
def parse_duration(s):
    """ Return the number of seconds of a duration like '1:02:03',
    '3h 11m 53s' or '3 hours 11 minutes 53 seconds'

    The result is a float if the seconds have a fractional part and an int
    otherwise; None is returned for None and for unrecognized strings.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    parts = m.groupdict()
    duration = int(parts['secs'])
    if parts['mins']:
        duration += int(parts['mins']) * 60
        # Hours can only be present together with minutes
        if parts['hours']:
            duration += int(parts['hours']) * 60 * 60
    if parts['ms']:
        # Despite its name, the group holds the fraction of a second
        # including the leading dot
        duration += float(parts['ms'])
    return duration
1344
1345
def prepend_extension(filename, ext):
    """ Insert ext in front of the extension of filename
    ('video.mp4', 'temp' -> 'video.temp.mp4') """
    stem, old_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, old_ext)
1349
1350
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)
    False is returned if the binary cannot be run. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1359
1360
class PagedList(object):
    """ List-like view of results that are fetched page by page

    pagefunc is called with a zero-based page number and returns an
    iterable with the entries of that page; pagesize is the number of
    entries on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """ Return the entries start..end (end excluded, None meaning up to
        the last one) as a list, fetching only the pages needed """
        entries = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize
            else:
                startv = 0

            # Offset past the last wanted entry, if the slice ends here
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            entries.extend(page_results)

            # A page that is not "full" is taken to be the last one, so
            # further pages need not be queried; the same holds if the
            # slice ends exactly with this page.
            page_incomplete = len(page_results) + startv < self._pagesize
            if page_incomplete or end == nextfirstid:
                break
        return entries
1406
1407
def uppercase_escape(s):
    """ Replace the \\UXXXXXXXX escape sequences in s by the characters they
    stand for """
    decode = codecs.getdecoder('unicode_escape')

    def unescape(match):
        # The decoder returns a tuple (text, consumed length)
        text, _ = decode(match.group(0))
        return text

    return re.sub(r'\\U[0-9a-fA-F]{8}', unescape, s)
1414
# struct_pack and struct_unpack work like struct.pack and struct.unpack, but
# accept text format strings on every Python version
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1431
1432
def read_batch_urls(batch_fd):
    """ Return the list of URLs in the batch file batch_fd

    batch_fd is an open file(-like) object yielding text or UTF-8 encoded
    lines; it is closed afterwards.  Surrounding whitespace is stripped;
    empty lines and lines starting with '#', ';' or ']' are skipped.
    """
    # A UTF-8 byte order mark shows up as u'\ufeff' if the file has been
    # decoded as UTF-8, and as the three characters below if its bytes have
    # been mapped to characters one by one
    BOMS = (u'\xef\xbb\xbf', u'\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1447
1448
def urlencode_postdata(*args, **kargs):
    """ Like urlencode, but return ASCII-encoded bytes (as needed for the
    body of a POST request) """
    query = compat_urllib_parse.urlencode(*args, **kargs)
    return query.encode('ascii')
1451
1452
# etree_iter(n) iterates over the element tree below n
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1457
1458
def parse_xml(s):
    """ Parse the XML document in the text s and return its root element

    Document type declarations are ignored.
    """
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    # Python 2.6 does not know the parser argument
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = etree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
1474
1475
# compat_getpass works like getpass.getpass, but also accepts a text prompt
# on Python 2 on Windows
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        # Text prompts are encoded with the preferred encoding first
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1483
1484
# Number assigned to each US rating label
# NOTE(review): presumably the minimum viewer age in years (i.e. an age
# limit) - confirm against the code using this table
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1492
1493
def strip_jsonp(code):
    """ Return the argument of a JSONP call like 'callback({...});'

    Code that does not look like such a call is returned unchanged.
    """
    jsonp_call = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return jsonp_call.sub(r'\1', code)
1496
1497
def js_to_json(code):
    """ Convert a JavaScript object literal to JSON (as far as possible)

    Keys are put into double quotes, single-quoted keys and values are
    converted to double-quoted ones and the trailing comma in front of a
    closing bracket is removed.
    """
    def requote(token):
        # 'token' -> "token"; embedded double quotes are not supported
        assert token.endswith("'")
        assert '"' not in token
        return '"%s"' % token[1:-1]

    def fix_kv(m):
        prefix, key, separator, value = m.groups()
        if key.startswith("'"):
            key = requote(key)
        elif not key.startswith('"'):
            # Bare identifier
            key = '"%s"' % key

        if value.startswith("'"):
            value = requote(value)

        return prefix + key + separator + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    # Drop the comma in front of a closing bracket
    return re.sub(r',(\s*\])', r'\1', res)
1524
1525
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The function returned maps a quality id to its position in quality_ids
    and ids that are not in the list to -1.
    """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
1534
1535
# Default template (presumably for output file names - confirm against the
# code using it): a %-style format string with the named fields title, id
# and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1537
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """ Replacement for subprocess.check_output where it is missing

        Runs the command with the given Popen arguments and returns what it
        wrote to its standard output; raises subprocess.CalledProcessError
        (with the output in its output attribute) if the command exits with
        a non-zero status.
        """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Interpreters without check_output have neither Popen.args nor
            # the output parameter of CalledProcessError, so take the
            # command from our own arguments and attach the output by hand
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output