2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 if type(c) is int: return c
199 # This is not clearly defined otherwise
200 compiled_regex_type = type(re.compile(''))
203 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
204 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
205 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
206 'Accept-Encoding': 'gzip, deflate',
207 'Accept-Language': 'en-us,en;q=0.5',
210 def preferredencoding():
211 """Get preferred encoding.
213 Returns the best encoding scheme for the system, based on
214 locale.getpreferredencoding() and some further tweaks.
217 pref = locale.getpreferredencoding()
224 if sys.version_info < (3,0):
226 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
229 assert type(s) == type(u'')
233 def write_json_file(obj, fn):
234 """ Encode obj as JSON and write it to fn, atomically """
238 'prefix': os.path.basename(fn) + '.',
239 'dir': os.path.dirname(fn),
243 # In Python 2.x, json.dump expects a bytestream.
244 # In Python 3.x, it writes to a character stream
245 if sys.version_info < (3, 0):
253 tf = tempfile.NamedTemporaryFile(**args)
258 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated straight into the XPath expression,
        # so restrict them to characters that cannot change its meaning.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
275 def find_xpath_attr(node, xpath, key, val):
276 for f in node.findall(xpath):
277 if f.attrib.get(key) == val:
281 # On python2.6 the xml.etree.ElementTree.Element methods don't support
282 # the namespace parameter
283 def xpath_with_ns(path, ns_map):
284 components = [c.split(':') for c in path.split('/')]
288 replaced.append(c[0])
291 replaced.append('{%s}%s' % (ns_map[ns], tag))
292 return '/'.join(replaced)
294 def htmlentity_transform(matchobj):
295 """Transforms an HTML entity to a character.
297 This function receives a match object and is intended to be used with
298 the re.sub() function.
300 entity = matchobj.group(1)
302 # Known non-numeric HTML entity
303 if entity in compat_html_entities.name2codepoint:
304 return compat_chr(compat_html_entities.name2codepoint[entity])
306 mobj = re.match(u'(?u)#(x?\\d+)', entity)
308 numstr = mobj.group(1)
309 if numstr.startswith(u'x'):
311 numstr = u'0%s' % numstr
314 return compat_chr(int(numstr, base))
316 # Unknown entity in name, return its literal representation
317 return (u'&%s;' % entity)
319 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
320 class BaseHTMLParser(compat_html_parser.HTMLParser):
322 compat_html_parser.HTMLParser.__init__(self)
325 def loads(self, html):
330 class AttrParser(BaseHTMLParser):
331 """Modified HTMLParser that isolates a tag with the specified attribute"""
332 def __init__(self, attribute, value):
333 self.attribute = attribute
338 self.watch_startpos = False
340 BaseHTMLParser.__init__(self)
342 def error(self, message):
343 if self.error_count > 10 or self.started:
344 raise compat_html_parser.HTMLParseError(message, self.getpos())
345 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
346 self.error_count += 1
349 def handle_starttag(self, tag, attrs):
352 self.find_startpos(None)
353 if self.attribute in attrs and attrs[self.attribute] == self.value:
356 self.watch_startpos = True
358 if not tag in self.depth: self.depth[tag] = 0
361 def handle_endtag(self, tag):
363 if tag in self.depth: self.depth[tag] -= 1
364 if self.depth[self.result[0]] == 0:
366 self.result.append(self.getpos())
368 def find_startpos(self, x):
369 """Needed to put the start position of the result (self.result[1])
370 after the opening tag with the requested id"""
371 if self.watch_startpos:
372 self.watch_startpos = False
373 self.result.append(self.getpos())
374 handle_entityref = handle_charref = handle_data = handle_comment = \
375 handle_decl = handle_pi = unknown_decl = find_startpos
377 def get_result(self):
378 if self.result is None:
380 if len(self.result) != 3:
382 lines = self.html.split('\n')
383 lines = lines[self.result[1][0]-1:self.result[2][0]]
384 lines[0] = lines[0][self.result[1][1]:]
386 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
387 lines[-1] = lines[-1][:self.result[2][1]]
388 return '\n'.join(lines).strip()
389 # Hack for https://github.com/rg3/youtube-dl/issues/662
390 if sys.version_info < (2, 7, 3):
391 AttrParser.parse_endtag = (lambda self, i:
392 i + len("</scr'+'ipt>")
393 if self.rawdata[i:].startswith("</scr'+'ipt>")
394 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is just the generic attribute lookup with attribute "id".
    return get_element_by_attribute("id", id, html)
400 def get_element_by_attribute(attribute, value, html):
401 """Return the content of the tag with the specified attribute in the passed HTML document"""
402 parser = AttrParser(attribute, value)
405 except compat_html_parser.HTMLParseError:
407 return parser.get_result()
409 class MetaParser(BaseHTMLParser):
411 Modified HTMLParser that isolates a meta tag with the specified name
414 def __init__(self, name):
415 BaseHTMLParser.__init__(self)
420 def handle_starttag(self, tag, attrs):
424 if attrs.get('name') == self.name:
425 self.result = attrs.get('content')
427 def get_result(self):
430 def get_meta_content(name, html):
432 Return the content attribute from the meta tag with the given name attribute.
434 parser = MetaParser(name)
437 except compat_html_parser.HTMLParseError:
439 return parser.get_result()
442 def clean_html(html):
443 """Clean an HTML snippet into a readable string"""
445 html = html.replace('\n', ' ')
446 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
447 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
449 html = re.sub('<.*?>', '', html)
450 # Replace html entities
451 html = unescapeHTML(html)
455 def sanitize_open(filename, open_mode):
456 """Try to open the given filename, and slightly tweak it if this fails.
458 Attempts to open the given filename. If this fails, it tries to change
459 the filename slightly, step by step, until it's either able to open it
460 or it fails and raises a final exception, like the standard open()
463 It returns the tuple (stream, definitive_file_name).
467 if sys.platform == 'win32':
469 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
470 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
471 stream = open(encodeFilename(filename), open_mode)
472 return (stream, filename)
473 except (IOError, OSError) as err:
474 if err.errno in (errno.EACCES,):
477 # In case of error, try to remove win32 forbidden chars
478 alt_filename = os.path.join(
479 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
480 for path_part in os.path.split(filename)
482 if alt_filename == filename:
485 # An exception here should be caught in the caller
486 stream = open(encodeFilename(filename), open_mode)
487 return (stream, alt_filename)
490 def timeconvert(timestr):
491 """Convert RFC 2822 defined time string into system timestamp"""
493 timetuple = email.utils.parsedate_tz(timestr)
494 if timetuple is not None:
495 timestamp = email.utils.mktime_tz(timetuple)
498 def sanitize_filename(s, restricted=False, is_id=False):
499 """Sanitizes a string so it could be used as part of a filename.
500 If restricted is set, use a stricter subset of allowed characters.
501 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
503 def replace_insane(char):
504 if char == '?' or ord(char) < 32 or ord(char) == 127:
507 return '' if restricted else '\''
509 return '_-' if restricted else ' -'
510 elif char in '\\/|*<>':
512 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
514 if restricted and ord(char) > 127:
518 result = u''.join(map(replace_insane, s))
520 while '__' in result:
521 result = result.replace('__', '_')
522 result = result.strip('_')
523 # Common case of "Foreign band name - English song title"
524 if restricted and result.startswith('-_'):
530 def orderedSet(iterable):
531 """ Remove all duplicates from the input iterable """
542 assert type(s) == compat_str
544 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
548 def encodeFilename(s, for_subprocess=False):
550 @param s The name of the file
553 assert type(s) == compat_str
555 # Python 3 has a Unicode API
556 if sys.version_info >= (3, 0):
559 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
560 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
561 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
562 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
563 if not for_subprocess:
566 # For subprocess calls, encode with locale encoding
567 # Refer to http://stackoverflow.com/a/9951851/35070
568 encoding = preferredencoding()
570 encoding = sys.getfilesystemencoding()
573 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument the same way file names are encoded."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
585 def decodeOption(optval):
588 if isinstance(optval, bytes):
589 optval = optval.decode(preferredencoding())
591 assert isinstance(optval, compat_str)
594 def formatSeconds(secs):
596 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
598 return '%d:%02d' % (secs // 60, secs % 60)
603 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
604 if sys.version_info < (3, 2):
607 class HTTPSConnectionV3(httplib.HTTPSConnection):
608 def __init__(self, *args, **kwargs):
609 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
612 sock = socket.create_connection((self.host, self.port), self.timeout)
613 if getattr(self, '_tunnel_host', False):
617 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
619 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
621 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
622 def https_open(self, req):
623 return self.do_open(HTTPSConnectionV3, req)
624 return HTTPSHandlerV3(**kwargs)
626 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
627 context.verify_mode = (ssl.CERT_NONE
628 if opts_no_check_certificate
629 else ssl.CERT_REQUIRED)
630 context.set_default_verify_paths()
632 context.load_default_certs()
633 except AttributeError:
635 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
637 class ExtractorError(Exception):
638 """Error during info extraction."""
639 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
640 """ tb, if given, is the original traceback (so that it can be printed out).
641 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
644 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
646 if video_id is not None:
647 msg = video_id + ': ' + msg
649 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
650 super(ExtractorError, self).__init__(msg)
653 self.exc_info = sys.exc_info() # preserve original exception
655 self.video_id = video_id
657 def format_traceback(self):
658 if self.traceback is None:
660 return u''.join(traceback.format_tb(self.traceback))
663 class RegexNotFoundError(ExtractorError):
664 """Error when a regex didn't match"""
668 class DownloadError(Exception):
669 """Download Error exception.
671 This exception may be thrown by FileDownloader objects if they are not
672 configured to continue on errors. They will contain the appropriate
675 def __init__(self, msg, exc_info=None):
676 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
677 super(DownloadError, self).__init__(msg)
678 self.exc_info = exc_info
681 class SameFileError(Exception):
682 """Same File exception.
684 This exception will be thrown by FileDownloader objects if they detect
685 multiple files would have to be downloaded to the same file on disk.
690 class PostProcessingError(Exception):
691 """Post Processing exception.
693 This exception may be raised by PostProcessor's .run() method to
694 indicate an error in the postprocessing task.
696 def __init__(self, msg):
699 class MaxDownloadsReached(Exception):
700 """ --max-downloads limit has been reached. """
704 class UnavailableVideoError(Exception):
705 """Unavailable Format exception.
707 This exception will be thrown when a video is requested
708 in a format that is not available for that video.
713 class ContentTooShortError(Exception):
714 """Content Too Short exception.
716 This exception may be raised by FileDownloader objects when a file they
717 download is too small for what the server announced first, indicating
718 the connection was probably interrupted.
724 def __init__(self, downloaded, expected):
725 self.downloaded = downloaded
726 self.expected = expected
728 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
729 """Handler for HTTP requests and responses.
731 This class, when installed with an OpenerDirector, automatically adds
732 the standard headers to every HTTP request and handles gzipped and
733 deflated responses from web servers. If compression is to be avoided in
734 a particular request, the original request in the program code only has
735 to include the HTTP header "Youtubedl-No-Compression", which will be
736 removed before making the real request.
738 Part of this code was copied from:
740 http://techknack.net/python-urllib2-handlers/
742 Andrew Rowls, the author of that code, agreed to release it to the
749 return zlib.decompress(data, -zlib.MAX_WBITS)
751 return zlib.decompress(data)
754 def addinfourl_wrapper(stream, headers, url, code):
755 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
756 return compat_urllib_request.addinfourl(stream, headers, url, code)
757 ret = compat_urllib_request.addinfourl(stream, headers, url)
761 def http_request(self, req):
762 for h,v in std_headers.items():
766 if 'Youtubedl-no-compression' in req.headers:
767 if 'Accept-encoding' in req.headers:
768 del req.headers['Accept-encoding']
769 del req.headers['Youtubedl-no-compression']
770 if 'Youtubedl-user-agent' in req.headers:
771 if 'User-agent' in req.headers:
772 del req.headers['User-agent']
773 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
774 del req.headers['Youtubedl-user-agent']
777 def http_response(self, req, resp):
780 if resp.headers.get('Content-encoding', '') == 'gzip':
781 content = resp.read()
782 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
784 uncompressed = io.BytesIO(gz.read())
785 except IOError as original_ioerror:
786 # There may be junk add the end of the file
787 # See http://stackoverflow.com/q/4928560/35070 for details
788 for i in range(1, 1024):
790 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
791 uncompressed = io.BytesIO(gz.read())
796 raise original_ioerror
797 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
798 resp.msg = old_resp.msg
800 if resp.headers.get('Content-encoding', '') == 'deflate':
801 gz = io.BytesIO(self.deflate(resp.read()))
802 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
803 resp.msg = old_resp.msg
806 https_request = http_request
807 https_response = http_response
810 def parse_iso8601(date_str, delimiter='T'):
811 """ Return a UNIX timestamp from the given date """
817 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
820 timezone = datetime.timedelta()
822 date_str = date_str[:-len(m.group(0))]
823 if not m.group('sign'):
824 timezone = datetime.timedelta()
826 sign = 1 if m.group('sign') == '+' else -1
827 timezone = datetime.timedelta(
828 hours=sign * int(m.group('hours')),
829 minutes=sign * int(m.group('minutes')))
830 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
831 dt = datetime.datetime.strptime(date_str, date_format) - timezone
832 return calendar.timegm(dt.timetuple())
835 def unified_strdate(date_str):
836 """Return a string with the date in the format YYYYMMDD"""
843 date_str = date_str.replace(',', ' ')
844 # %z (UTC offset) is only supported in python>=3.2
845 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
846 format_expressions = [
851 '%b %dst %Y %I:%M%p',
852 '%b %dnd %Y %I:%M%p',
853 '%b %dth %Y %I:%M%p',
863 '%Y-%m-%dT%H:%M:%SZ',
864 '%Y-%m-%dT%H:%M:%S.%fZ',
865 '%Y-%m-%dT%H:%M:%S.%f0Z',
867 '%Y-%m-%dT%H:%M:%S.%f',
870 for expression in format_expressions:
872 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
875 if upload_date is None:
876 timetuple = email.utils.parsedate_tz(date_str)
878 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
881 def determine_ext(url, default_ext=u'unknown_video'):
884 guess = url.partition(u'?')[0].rpartition(u'.')[2]
885 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: strip the media extension from
    *filename* and append ".<language>.<format>"."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
893 def date_from_str(date_str):
895 Return a datetime object from a string in the format YYYYMMDD or
896 (now|today)[+-][0-9](day|week|month|year)(s)?"""
897 today = datetime.date.today()
898 if date_str == 'now'or date_str == 'today':
900 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
901 if match is not None:
902 sign = match.group('sign')
903 time = int(match.group('time'))
906 unit = match.group('unit')
915 delta = datetime.timedelta(**{unit: time})
917 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
919 def hyphenate_date(date_str):
921 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
922 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
923 if match is not None:
924 return '-'.join(match.groups())
928 class DateRange(object):
929 """Represents a time interval between two dates"""
930 def __init__(self, start=None, end=None):
931 """start and end must be strings in the format accepted by date"""
932 if start is not None:
933 self.start = date_from_str(start)
935 self.start = datetime.datetime.min.date()
937 self.end = date_from_str(end)
939 self.end = datetime.datetime.max.date()
940 if self.start > self.end:
941 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
944 """Returns a range that only contains the given day"""
946 def __contains__(self, date):
947 """Check if the date is in the range"""
948 if not isinstance(date, datetime.date):
949 date = date_from_str(date)
950 return self.start <= date <= self.end
952 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
956 """ Returns the platform name as a compat_str """
957 res = platform.platform()
958 if isinstance(res, bytes):
959 res = res.decode(preferredencoding())
961 assert isinstance(res, compat_str)
965 def _windows_write_string(s, out):
966 """ Returns True if the string was written using special methods,
967 False if it has yet to be written out."""
968 # Adapted from http://stackoverflow.com/a/3259271/35070
971 import ctypes.wintypes
979 fileno = out.fileno()
980 except AttributeError:
981 # If the output stream doesn't have a fileno, it's virtual
983 if fileno not in WIN_OUTPUT_IDS:
986 GetStdHandle = ctypes.WINFUNCTYPE(
987 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
988 ("GetStdHandle", ctypes.windll.kernel32))
989 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
991 WriteConsoleW = ctypes.WINFUNCTYPE(
992 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
993 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
994 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
995 written = ctypes.wintypes.DWORD(0)
997 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
998 FILE_TYPE_CHAR = 0x0002
999 FILE_TYPE_REMOTE = 0x8000
1000 GetConsoleMode = ctypes.WINFUNCTYPE(
1001 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1002 ctypes.POINTER(ctypes.wintypes.DWORD))(
1003 ("GetConsoleMode", ctypes.windll.kernel32))
1004 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1006 def not_a_console(handle):
1007 if handle == INVALID_HANDLE_VALUE or handle is None:
1009 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1010 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1012 if not_a_console(h):
1015 def next_nonbmp_pos(s):
1017 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1018 except StopIteration:
1022 count = min(next_nonbmp_pos(s), 1024)
1024 ret = WriteConsoleW(
1025 h, s, count if count else 2, ctypes.byref(written), None)
1027 raise OSError('Failed to write string')
1028 if not count: # We just wrote a non-BMP character
1029 assert written.value == 2
1032 assert written.value > 0
1033 s = s[written.value:]
1037 def write_string(s, out=None, encoding=None):
1040 assert type(s) == compat_str
1042 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1043 if _windows_write_string(s, out):
1046 if ('b' in getattr(out, 'mode', '') or
1047 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1048 byt = s.encode(encoding or preferredencoding(), 'ignore')
1050 elif hasattr(out, 'buffer'):
1051 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1052 byt = s.encode(enc, 'ignore')
1053 out.buffer.write(byt)
1059 def bytes_to_intlist(bs):
1062 if isinstance(bs[0], int): # Python 3
1065 return [ord(c) for c in bs]
1068 def intlist_to_bytes(xs):
1071 if isinstance(chr(0), bytes): # Python 2
1072 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the youtube-dl cache directory.

    The 'cachedir' entry of *params* (an options dict) takes precedence;
    otherwise fall back to $XDG_CACHE_HOME/youtube-dl, defaulting
    XDG_CACHE_HOME to ~/.cache per the XDG base directory specification.
    """
    # Use a None sentinel instead of a mutable default argument; this also
    # makes an explicit params=None behave like "no options" instead of
    # raising AttributeError.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1083 # Cross-platform file locking
1084 if sys.platform == 'win32':
1085 import ctypes.wintypes
1088 class OVERLAPPED(ctypes.Structure):
1090 ('Internal', ctypes.wintypes.LPVOID),
1091 ('InternalHigh', ctypes.wintypes.LPVOID),
1092 ('Offset', ctypes.wintypes.DWORD),
1093 ('OffsetHigh', ctypes.wintypes.DWORD),
1094 ('hEvent', ctypes.wintypes.HANDLE),
1097 kernel32 = ctypes.windll.kernel32
1098 LockFileEx = kernel32.LockFileEx
1099 LockFileEx.argtypes = [
1100 ctypes.wintypes.HANDLE, # hFile
1101 ctypes.wintypes.DWORD, # dwFlags
1102 ctypes.wintypes.DWORD, # dwReserved
1103 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1104 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1105 ctypes.POINTER(OVERLAPPED) # Overlapped
1107 LockFileEx.restype = ctypes.wintypes.BOOL
1108 UnlockFileEx = kernel32.UnlockFileEx
1109 UnlockFileEx.argtypes = [
1110 ctypes.wintypes.HANDLE, # hFile
1111 ctypes.wintypes.DWORD, # dwReserved
1112 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1113 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1114 ctypes.POINTER(OVERLAPPED) # Overlapped
1116 UnlockFileEx.restype = ctypes.wintypes.BOOL
1117 whole_low = 0xffffffff
1118 whole_high = 0x7fffffff
1120 def _lock_file(f, exclusive):
1121 overlapped = OVERLAPPED()
1122 overlapped.Offset = 0
1123 overlapped.OffsetHigh = 0
1124 overlapped.hEvent = 0
1125 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1126 handle = msvcrt.get_osfhandle(f.fileno())
1127 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1128 whole_low, whole_high, f._lock_file_overlapped_p):
1129 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1131 def _unlock_file(f):
1132 assert f._lock_file_overlapped_p
1133 handle = msvcrt.get_osfhandle(f.fileno())
1134 if not UnlockFileEx(handle, 0,
1135 whole_low, whole_high, f._lock_file_overlapped_p):
1136 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1141 def _lock_file(f, exclusive):
1142 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1144 def _unlock_file(f):
1145 fcntl.lockf(f, fcntl.LOCK_UN)
1148 class locked_file(object):
1149 def __init__(self, filename, mode, encoding=None):
1150 assert mode in ['r', 'a', 'w']
1151 self.f = io.open(filename, mode, encoding=encoding)
1154 def __enter__(self):
1155 exclusive = self.mode != 'r'
1157 _lock_file(self.f, exclusive)
1163 def __exit__(self, etype, value, traceback):
1165 _unlock_file(self.f)
1172 def write(self, *args):
1173 return self.f.write(*args)
1175 def read(self, *args):
1176 return self.f.read(*args)
1179 def shell_quote(args):
1181 encoding = sys.getfilesystemencoding()
1182 if encoding is None:
1185 if isinstance(a, bytes):
1186 # We may get a filename encoded with 'encodeFilename'
1187 a = a.decode(encoding)
1188 quoted_args.append(pipes.quote(a))
1189 return u' '.join(quoted_args)
1192 def takewhile_inclusive(pred, seq):
1193 """ Like itertools.takewhile, but include the latest evaluated element
1194 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The data is JSON-encoded and stored in the URL fragment, where it
    # never reaches the network but can be recovered by unsmuggle_url().
    smuggled = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'#'.join([url, smuggled])
1209 def unsmuggle_url(smug_url, default=None):
1210 if not '#__youtubedl_smuggle' in smug_url:
1211 return smug_url, default
1212 url, _, sdata = smug_url.rpartition(u'#')
1213 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1214 data = json.loads(jsond)
1218 def format_bytes(bytes):
1221 if type(bytes) is str:
1222 bytes = float(bytes)
1226 exponent = int(math.log(bytes, 1024.0))
1227 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1228 converted = float(bytes) / float(1024 ** exponent)
1229 return u'%.2f%s' % (converted, suffix)
1232 def get_term_width():
1233 columns = os.environ.get('COLUMNS', None)
1238 sp = subprocess.Popen(
1240 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1241 out, err = sp.communicate()
1242 return int(out.split()[1])
1248 def month_by_name(name):
1249 """ Return the number of a month by (locale-independently) English name """
1252 u'January', u'February', u'March', u'April', u'May', u'June',
1253 u'July', u'August', u'September', u'October', u'November', u'December']
1255 return ENGLISH_NAMES.index(name) + 1
1260 def fix_xml_ampersands(xml_str):
1261 """Replace all the '&' by '&' in XML"""
1263 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1268 def setproctitle(title):
1269 assert isinstance(title, compat_str)
1271 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1274 title_bytes = title.encode('utf-8')
1275 buf = ctypes.create_string_buffer(len(title_bytes))
1276 buf.value = title_bytes
1278 libc.prctl(15, buf, 0, 0, 0)
1279 except AttributeError:
1280 return # Strange libc, just skip this
1283 def remove_start(s, start):
1284 if s.startswith(start):
1285 return s[len(start):]
1289 def remove_end(s, end):
1291 return s[:-len(end)]
def url_basename(url):
    """Return the last path component of *url* (the empty string when
    the path is empty or just '/')."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip(u'/').split(u'/')
    return segments[-1]
1300 class HEADRequest(compat_urllib_request.Request):
1301 def get_method(self):
1305 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1308 v = getattr(v, get_attr, None)
1311 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert *v* to compat_str, passing None through as *default*."""
    if v is None:
        return default
    return compat_str(v)
1318 def str_to_int(int_str):
1321 int_str = re.sub(r'[,\.]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale; None maps to *default*."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration like '123', '45s', '9:12' or '1:02:03' into seconds.

    Returns None for None input or unrecognised formats.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        # Hours can only be present when minutes are
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    return res
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: ('a.mp4', 'part') -> 'a.part.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(base, ext, real_ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # args defaults to None (not []) to avoid a shared mutable default.
    try:
        subprocess.Popen(
            [exe] + list(args or []),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable
        return False
    return exe
class PagedList(object):
    """A lazily-paged sequence of entries.

    pagefunc(pagenum) must return the list of entries for that 0-based page;
    pagesize is the maximum number of entries per page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end), fetching only the
        pages that intersect that range."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Skip pages entirely before the requested range
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # One-past-the-last wanted entry within this page, or None
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
def uppercase_escape(s):
    r"""Decode literal \UXXXXXXXX escape sequences embedded in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        # getdecoder returns (decoded, length); we want the decoded text
        lambda m: unicode_escape(m.group(0))[0],
        s)
# Probe whether struct accepts a text format string; define wrappers if not.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Text format strings work natively; use struct directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file, one per line.

    Strips a UTF-8 BOM and surrounding whitespace, skips blank lines and
    comment lines (starting with '#', ';' or ']'), and closes *batch_fd*.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM wrongly decoded as latin-1 appears as these three chars
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def parse_xml(s):
    """Parse an XML document from text *s*, ignoring any DOCTYPE declaration."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Python 2.6's ElementTree.XML does not accept a parser argument
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
# On Python 2 / Windows, getpass chokes on unicode prompts; encode them first.
if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper, turning `callback({...});` into `{...}`."""
    pattern = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(pattern, r'\1', code)
def js_to_json(code):
    """Convert simple JavaScript object literals to valid JSON.

    Rewrites single-quoted or bare keys/values to double-quoted ones and
    removes trailing commas before ']'.
    """
    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            # Bare identifier key -> quote it
            key = '"%s"' % key

        value = m.group(4)
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return m.group(1) + key + m.group(3) + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    # Drop trailing commas in arrays
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            # Position in the list is the quality rank (higher = better)
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality id ranks below everything
            return -1
    return q
# Output filename template used when the user does not supply one:
# "<title>-<video id>.<extension>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1525 subprocess_check_output = subprocess.check_output
1526 except AttributeError:
1527 def subprocess_check_output(*args, **kwargs):
1528 assert 'input' not in kwargs
1529 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1530 output, _ = p.communicate()
1533 raise subprocess.CalledProcessError(ret, p.args, output=output)