_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import calendar
   5 import codecs
   6 import contextlib
   7 import ctypes
   8 import datetime
   9 import email.utils
  10 import errno
  11 import getpass
  12 import gzip
  13 import itertools
  14 import io
  15 import json
  16 import locale
  17 import math
  18 import os
  19 import pipes
  20 import platform
  21 import re
  22 import ssl
  23 import socket
  24 import struct
  25 import subprocess
  26 import sys
  27 import tempfile
  28 import traceback
  29 import xml.etree.ElementTree
  30 import zlib
  31
  32 try:
  33     import urllib.request as compat_urllib_request
  34 except ImportError: # Python 2
  35     import urllib2 as compat_urllib_request
  36
  37 try:
  38     import urllib.error as compat_urllib_error
  39 except ImportError: # Python 2
  40     import urllib2 as compat_urllib_error
  41
  42 try:
  43     import urllib.parse as compat_urllib_parse
  44 except ImportError: # Python 2
  45     import urllib as compat_urllib_parse
  46
  47 try:
  48     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  49 except ImportError: # Python 2
  50     from urlparse import urlparse as compat_urllib_parse_urlparse
  51
  52 try:
  53     import urllib.parse as compat_urlparse
  54 except ImportError: # Python 2
  55     import urlparse as compat_urlparse
  56
  57 try:
  58     import http.cookiejar as compat_cookiejar
  59 except ImportError: # Python 2
  60     import cookielib as compat_cookiejar
  61
  62 try:
  63     import html.entities as compat_html_entities
  64 except ImportError: # Python 2
  65     import htmlentitydefs as compat_html_entities
  66
  67 try:
  68     import html.parser as compat_html_parser
  69 except ImportError: # Python 2
  70     import HTMLParser as compat_html_parser
  71
  72 try:
  73     import http.client as compat_http_client
  74 except ImportError: # Python 2
  75     import httplib as compat_http_client
  76
  77 try:
  78     from urllib.error import HTTPError as compat_HTTPError
  79 except ImportError:  # Python 2
  80     from urllib2 import HTTPError as compat_HTTPError
  81
  82 try:
  83     from urllib.request import urlretrieve as compat_urlretrieve
  84 except ImportError:  # Python 2
  85     from urllib import urlretrieve as compat_urlretrieve
  86
  87
  88 try:
  89     from subprocess import DEVNULL
  90     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  91 except ImportError:
  92     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  93
  94 try:
  95     from urllib.parse import unquote as compat_urllib_parse_unquote
  96 except ImportError:
  97     def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
  98         if string == '':
  99             return string
 100         res = string.split('%')
 101         if len(res) == 1:
 102             return string
 103         if encoding is None:
 104             encoding = 'utf-8'
 105         if errors is None:
 106             errors = 'replace'
 107         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
 108         pct_sequence = b''
 109         string = res[0]
 110         for item in res[1:]:
 111             try:
 112                 if not item:
 113                     raise ValueError
 114                 pct_sequence += item[:2].decode('hex')
 115                 rest = item[2:]
 116                 if not rest:
 117                     # This segment was just a single percent-encoded character.
 118                     # May be part of a sequence of code units, so delay decoding.
 119                     # (Stored in pct_sequence).
 120                     continue
 121             except ValueError:
 122                 rest = '%' + item
 123             # Encountered non-percent-encoded characters. Flush the current
 124             # pct_sequence.
 125             string += pct_sequence.decode(encoding, errors) + rest
 126             pct_sequence = b''
 127         if pct_sequence:
 128             # Flush the final pct_sequence
 129             string += pct_sequence.decode(encoding, errors)
 130         return string
 131
 132
 133 try:
 134     from urllib.parse import parse_qs as compat_parse_qs
 135 except ImportError: # Python 2
 136     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
 137     # Python 2's version is apparently totally broken
 138
 139     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 140                 encoding='utf-8', errors='replace'):
 141         qs, _coerce_result = qs, unicode
 142         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 143         r = []
 144         for name_value in pairs:
 145             if not name_value and not strict_parsing:
 146                 continue
 147             nv = name_value.split('=', 1)
 148             if len(nv) != 2:
 149                 if strict_parsing:
 150                     raise ValueError("bad query field: %r" % (name_value,))
 151                 # Handle case of a control-name with no equal sign
 152                 if keep_blank_values:
 153                     nv.append('')
 154                 else:
 155                     continue
 156             if len(nv[1]) or keep_blank_values:
 157                 name = nv[0].replace('+', ' ')
 158                 name = compat_urllib_parse_unquote(
 159                     name, encoding=encoding, errors=errors)
 160                 name = _coerce_result(name)
 161                 value = nv[1].replace('+', ' ')
 162                 value = compat_urllib_parse_unquote(
 163                     value, encoding=encoding, errors=errors)
 164                 value = _coerce_result(value)
 165                 r.append((name, value))
 166         return r
 167
 168     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 169                 encoding='utf-8', errors='replace'):
 170         parsed_result = {}
 171         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 172                         encoding=encoding, errors=errors)
 173         for name, value in pairs:
 174             if name in parsed_result:
 175                 parsed_result[name].append(value)
 176             else:
 177                 parsed_result[name] = [value]
 178         return parsed_result
 179
# Text string type: ``unicode`` where that builtin exists, ``str`` otherwise.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: ``unichr`` where that builtin
# exists, ``chr`` otherwise.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# Exception class to catch for XML parse failures; falls back to expat's
# ExpatError when ElementTree does not export ParseError.
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
 194
 195 def compat_ord(c):
 196     if type(c) is int: return c
 197     else: return ord(c)
 198
# This is not clearly defined otherwise
# (type of a compiled regular expression, obtained from a sample pattern)
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds every entry to
# each outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 209
 210 def preferredencoding():
 211     """Get preferred encoding.
 212
 213     Returns the best encoding scheme for the system, based on
 214     locale.getpreferredencoding() and some further tweaks.
 215     """
 216     try:
 217         pref = locale.getpreferredencoding()
 218         u'TEST'.encode(pref)
 219     except:
 220         pref = 'UTF-8'
 221
 222     return pref
 223
 224 if sys.version_info < (3,0):
 225     def compat_print(s):
 226         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 227 else:
 228     def compat_print(s):
 229         assert type(s) == type(u'')
 230         print(s)
 231
 232
 233 def write_json_file(obj, fn):
 234     """ Encode obj as JSON and write it to fn, atomically """
 235
 236     args = {
 237         'suffix': '.tmp',
 238         'prefix': os.path.basename(fn) + '.',
 239         'dir': os.path.dirname(fn),
 240         'delete': False,
 241     }
 242
 243     # In Python 2.x, json.dump expects a bytestream.
 244     # In Python 3.x, it writes to a character stream
 245     if sys.version_info < (3, 0):
 246         args['mode'] = 'wb'
 247     else:
 248         args.update({
 249             'mode': 'w',
 250             'encoding': 'utf-8',
 251         })
 252
 253     tf = tempfile.NamedTemporaryFile(**args)
 254
 255     try:
 256         with tf:
 257             json.dump(obj, tf)
 258         os.rename(tf.name, fn)
 259     except:
 260         try:
 261             os.remove(tf.name)
 262         except OSError:
 263             pass
 264         raise
 265
 266
 267 if sys.version_info >= (2, 7):
 268     def find_xpath_attr(node, xpath, key, val):
 269         """ Find the xpath xpath[@key=val] """
 270         assert re.match(r'^[a-zA-Z-]+$', key)
 271         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 272         expr = xpath + u"[@%s='%s']" % (key, val)
 273         return node.find(expr)
 274 else:
 275     def find_xpath_attr(node, xpath, key, val):
 276         for f in node.findall(xpath):
 277             if f.attrib.get(key) == val:
 278                 return f
 279         return None
 280
 281 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 282 # the namespace parameter
 283 def xpath_with_ns(path, ns_map):
 284     components = [c.split(':') for c in path.split('/')]
 285     replaced = []
 286     for c in components:
 287         if len(c) == 1:
 288             replaced.append(c[0])
 289         else:
 290             ns, tag = c
 291             replaced.append('{%s}%s' % (ns_map[ns], tag))
 292     return '/'.join(replaced)
 293
 294 def htmlentity_transform(matchobj):
 295     """Transforms an HTML entity to a character.
 296
 297     This function receives a match object and is intended to be used with
 298     the re.sub() function.
 299     """
 300     entity = matchobj.group(1)
 301
 302     # Known non-numeric HTML entity
 303     if entity in compat_html_entities.name2codepoint:
 304         return compat_chr(compat_html_entities.name2codepoint[entity])
 305
 306     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 307     if mobj is not None:
 308         numstr = mobj.group(1)
 309         if numstr.startswith(u'x'):
 310             base = 16
 311             numstr = u'0%s' % numstr
 312         else:
 313             base = 10
 314         return compat_chr(int(numstr, base))
 315
 316     # Unknown entity in name, return its literal representation
 317     return (u'&%s;' % entity)
 318
 319 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 320 class BaseHTMLParser(compat_html_parser.HTMLParser):
 321     def __init(self):
 322         compat_html_parser.HTMLParser.__init__(self)
 323         self.html = None
 324
 325     def loads(self, html):
 326         self.html = html
 327         self.feed(html)
 328         self.close()
 329
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        """Prepare a search for the first tag whose `attribute` equals `value`."""
        self.attribute = attribute
        self.value = value
        # None until the tag is seen; then [tag, start position, end position]
        self.result = None
        # True while parsing between the matching start tag and its end tag
        self.started = False
        # Per tag name: start tags seen minus end tags seen since the match
        self.depth = {}
        # True right after the match, until the next parser event records
        # the start position of the content
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Handle a parse error.

        Raises HTMLParseError once more than 10 errors were seen or when the
        error occurs inside the wanted element; otherwise drops the input up
        to and including the current line and resumes parsing.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        """Detect the wanted tag and track nesting depth while inside it."""
        attrs = dict(attrs)
        if self.started:
            # A child tag is also an event that may mark the content start
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Record the end position once the wanted tag's depth drops to zero."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other kind of parser event may be the first one after the
    # opening tag, so all of them record the start position as well.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions.

        Returns None when the tag was not found or when start and end
        positions were not both recorded.
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are (line number, column) pairs with 1-based line numbers
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line: the end column has to be shifted
            # by the prefix that was just cut off
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, step over the literal "</scr'+'ipt>"
# instead of handing it to HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 395
 396 def get_element_by_id(id, html):
 397     """Return the content of the tag with the specified ID in the passed HTML document"""
 398     return get_element_by_attribute("id", id, html)
 399
 400 def get_element_by_attribute(attribute, value, html):
 401     """Return the content of the tag with the specified attribute in the passed HTML document"""
 402     parser = AttrParser(attribute, value)
 403     try:
 404         parser.loads(html)
 405     except compat_html_parser.HTMLParseError:
 406         pass
 407     return parser.get_result()
 408
 409 class MetaParser(BaseHTMLParser):
 410     """
 411     Modified HTMLParser that isolates a meta tag with the specified name
 412     attribute.
 413     """
 414     def __init__(self, name):
 415         BaseHTMLParser.__init__(self)
 416         self.name = name
 417         self.content = None
 418         self.result = None
 419
 420     def handle_starttag(self, tag, attrs):
 421         if tag != 'meta':
 422             return
 423         attrs = dict(attrs)
 424         if attrs.get('name') == self.name:
 425             self.result = attrs.get('content')
 426
 427     def get_result(self):
 428         return self.result
 429
 430 def get_meta_content(name, html):
 431     """
 432     Return the content attribute from the meta tag with the given name attribute.
 433     """
 434     parser = MetaParser(name)
 435     try:
 436         parser.loads(html)
 437     except compat_html_parser.HTMLParseError:
 438         pass
 439     return parser.get_result()
 440
 441
 442 def clean_html(html):
 443     """Clean an HTML snippet into a readable string"""
 444     # Newline vs <br />
 445     html = html.replace('\n', ' ')
 446     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 447     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 448     # Strip html tags
 449     html = re.sub('<.*?>', '', html)
 450     # Replace html entities
 451     html = unescapeHTML(html)
 452     return html.strip()
 453
 454
 455 def sanitize_open(filename, open_mode):
 456     """Try to open the given filename, and slightly tweak it if this fails.
 457
 458     Attempts to open the given filename. If this fails, it tries to change
 459     the filename slightly, step by step, until it's either able to open it
 460     or it fails and raises a final exception, like the standard open()
 461     function.
 462
 463     It returns the tuple (stream, definitive_file_name).
 464     """
 465     try:
 466         if filename == u'-':
 467             if sys.platform == 'win32':
 468                 import msvcrt
 469                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 470             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 471         stream = open(encodeFilename(filename), open_mode)
 472         return (stream, filename)
 473     except (IOError, OSError) as err:
 474         if err.errno in (errno.EACCES,):
 475             raise
 476
 477         # In case of error, try to remove win32 forbidden chars
 478         alt_filename = os.path.join(
 479                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 480                         for path_part in os.path.split(filename)
 481                        )
 482         if alt_filename == filename:
 483             raise
 484         else:
 485             # An exception here should be caught in the caller
 486             stream = open(encodeFilename(filename), open_mode)
 487             return (stream, alt_filename)
 488
 489
 490 def timeconvert(timestr):
 491     """Convert RFC 2822 defined time string into system timestamp"""
 492     timestamp = None
 493     timetuple = email.utils.parsedate_tz(timestr)
 494     if timetuple is not None:
 495         timestamp = email.utils.mktime_tz(timetuple)
 496     return timestamp
 497
 498 def sanitize_filename(s, restricted=False, is_id=False):
 499     """Sanitizes a string so it could be used as part of a filename.
 500     If restricted is set, use a stricter subset of allowed characters.
 501     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 502     """
 503     def replace_insane(char):
 504         if char == '?' or ord(char) < 32 or ord(char) == 127:
 505             return ''
 506         elif char == '"':
 507             return '' if restricted else '\''
 508         elif char == ':':
 509             return '_-' if restricted else ' -'
 510         elif char in '\\/|*<>':
 511             return '_'
 512         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 513             return '_'
 514         if restricted and ord(char) > 127:
 515             return '_'
 516         return char
 517
 518     result = u''.join(map(replace_insane, s))
 519     if not is_id:
 520         while '__' in result:
 521             result = result.replace('__', '_')
 522         result = result.strip('_')
 523         # Common case of "Foreign band name - English song title"
 524         if restricted and result.startswith('-_'):
 525             result = result[2:]
 526         if not result:
 527             result = '_'
 528     return result
 529
 530 def orderedSet(iterable):
 531     """ Remove all duplicates from the input iterable """
 532     res = []
 533     for el in iterable:
 534         if el not in res:
 535             res.append(el)
 536     return res
 537
 538
 539 def unescapeHTML(s):
 540     if s is None:
 541         return None
 542     assert type(s) == compat_str
 543
 544     result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
 545     return result
 546
 547
 548 def encodeFilename(s, for_subprocess=False):
 549     """
 550     @param s The name of the file
 551     """
 552
 553     assert type(s) == compat_str
 554
 555     # Python 3 has a Unicode API
 556     if sys.version_info >= (3, 0):
 557         return s
 558
 559     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 560         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 561         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 562         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 563         if not for_subprocess:
 564             return s
 565         else:
 566             # For subprocess calls, encode with locale encoding
 567             # Refer to http://stackoverflow.com/a/9951851/35070
 568             encoding = preferredencoding()
 569     else:
 570         encoding = sys.getfilesystemencoding()
 571     if encoding is None:
 572         encoding = 'utf-8'
 573     return s.encode(encoding, 'ignore')
 574
 575
 576 def encodeArgument(s):
 577     if not isinstance(s, compat_str):
 578         # Legacy code that uses byte strings
 579         # Uncomment the following line after fixing all post processors
 580         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 581         s = s.decode('ascii')
 582     return encodeFilename(s, True)
 583
 584
 585 def decodeOption(optval):
 586     if optval is None:
 587         return optval
 588     if isinstance(optval, bytes):
 589         optval = optval.decode(preferredencoding())
 590
 591     assert isinstance(optval, compat_str)
 592     return optval
 593
 594 def formatSeconds(secs):
 595     if secs > 3600:
 596         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 597     elif secs > 60:
 598         return '%d:%02d' % (secs // 60, secs % 60)
 599     else:
 600         return '%d' % secs
 601
 602
 603 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 604     if sys.version_info < (3, 2):
 605         import httplib
 606
 607         class HTTPSConnectionV3(httplib.HTTPSConnection):
 608             def __init__(self, *args, **kwargs):
 609                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 610
 611             def connect(self):
 612                 sock = socket.create_connection((self.host, self.port), self.timeout)
 613                 if getattr(self, '_tunnel_host', False):
 614                     self.sock = sock
 615                     self._tunnel()
 616                 try:
 617                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 618                 except ssl.SSLError:
 619                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 620
 621         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 622             def https_open(self, req):
 623                 return self.do_open(HTTPSConnectionV3, req)
 624         return HTTPSHandlerV3(**kwargs)
 625     else:
 626         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 627         context.verify_mode = (ssl.CERT_NONE
 628                                if opts_no_check_certificate
 629                                else ssl.CERT_REQUIRED)
 630         context.set_default_verify_paths()
 631         try:
 632             context.load_default_certs()
 633         except AttributeError:
 634             pass  # Python < 3.4
 635         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 636
 637 class ExtractorError(Exception):
 638     """Error during info extraction."""
 639     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 640         """ tb, if given, is the original traceback (so that it can be printed out).
 641         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 642         """
 643
 644         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 645             expected = True
 646         if video_id is not None:
 647             msg = video_id + ': ' + msg
 648         if not expected:
 649             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 650         super(ExtractorError, self).__init__(msg)
 651
 652         self.traceback = tb
 653         self.exc_info = sys.exc_info()  # preserve original exception
 654         self.cause = cause
 655         self.video_id = video_id
 656
 657     def format_traceback(self):
 658         if self.traceback is None:
 659             return None
 660         return u''.join(traceback.format_tb(self.traceback))
 661
 662
 663 class RegexNotFoundError(ExtractorError):
 664     """Error when a regex didn't match"""
 665     pass
 666
 667
 668 class DownloadError(Exception):
 669     """Download Error exception.
 670
 671     This exception may be thrown by FileDownloader objects if they are not
 672     configured to continue on errors. They will contain the appropriate
 673     error message.
 674     """
 675     def __init__(self, msg, exc_info=None):
 676         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 677         super(DownloadError, self).__init__(msg)
 678         self.exc_info = exc_info
 679
 680
 681 class SameFileError(Exception):
 682     """Same File exception.
 683
 684     This exception will be thrown by FileDownloader objects if they detect
 685     multiple files would have to be downloaded to the same file on disk.
 686     """
 687     pass
 688
 689
 690 class PostProcessingError(Exception):
 691     """Post Processing exception.
 692
 693     This exception may be raised by PostProcessor's .run() method to
 694     indicate an error in the postprocessing task.
 695     """
 696     def __init__(self, msg):
 697         self.msg = msg
 698
 699 class MaxDownloadsReached(Exception):
 700     """ --max-downloads limit has been reached. """
 701     pass
 702
 703
 704 class UnavailableVideoError(Exception):
 705     """Unavailable Format exception.
 706
 707     This exception will be thrown when a video is requested
 708     in a format that is not available for that video.
 709     """
 710     pass
 711
 712
 713 class ContentTooShortError(Exception):
 714     """Content Too Short exception.
 715
 716     This exception may be raised by FileDownloader objects when a file they
 717     download is too small for what the server announced first, indicating
 718     the connection was probably interrupted.
 719     """
 720     # Both in bytes
 721     downloaded = None
 722     expected = None
 723
 724     def __init__(self, downloaded, expected):
 725         self.downloaded = downloaded
 726         self.expected = expected
 727
 728 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 729     """Handler for HTTP requests and responses.
 730
 731     This class, when installed with an OpenerDirector, automatically adds
 732     the standard headers to every HTTP request and handles gzipped and
 733     deflated responses from web servers. If compression is to be avoided in
 734     a particular request, the original request in the program code only has
 735     to include the HTTP header "Youtubedl-No-Compression", which will be
 736     removed before making the real request.
 737
 738     Part of this code was copied from:
 739
 740     http://techknack.net/python-urllib2-handlers/
 741
 742     Andrew Rowls, the author of that code, agreed to release it to the
 743     public domain.
 744     """
 745
 746     @staticmethod
 747     def deflate(data):
 748         try:
 749             return zlib.decompress(data, -zlib.MAX_WBITS)
 750         except zlib.error:
 751             return zlib.decompress(data)
 752
 753     @staticmethod
 754     def addinfourl_wrapper(stream, headers, url, code):
 755         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 756             return compat_urllib_request.addinfourl(stream, headers, url, code)
 757         ret = compat_urllib_request.addinfourl(stream, headers, url)
 758         ret.code = code
 759         return ret
 760
 761     def http_request(self, req):
 762         for h,v in std_headers.items():
 763             if h in req.headers:
 764                 del req.headers[h]
 765             req.add_header(h, v)
 766         if 'Youtubedl-no-compression' in req.headers:
 767             if 'Accept-encoding' in req.headers:
 768                 del req.headers['Accept-encoding']
 769             del req.headers['Youtubedl-no-compression']
 770         if 'Youtubedl-user-agent' in req.headers:
 771             if 'User-agent' in req.headers:
 772                 del req.headers['User-agent']
 773             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 774             del req.headers['Youtubedl-user-agent']
 775         return req
 776
 777     def http_response(self, req, resp):
 778         old_resp = resp
 779         # gzip
 780         if resp.headers.get('Content-encoding', '') == 'gzip':
 781             content = resp.read()
 782             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 783             try:
 784                 uncompressed = io.BytesIO(gz.read())
 785             except IOError as original_ioerror:
 786                 # There may be junk add the end of the file
 787                 # See http://stackoverflow.com/q/4928560/35070 for details
 788                 for i in range(1, 1024):
 789                     try:
 790                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 791                         uncompressed = io.BytesIO(gz.read())
 792                     except IOError:
 793                         continue
 794                     break
 795                 else:
 796                     raise original_ioerror
 797             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 798             resp.msg = old_resp.msg
 799         # deflate
 800         if resp.headers.get('Content-encoding', '') == 'deflate':
 801             gz = io.BytesIO(self.deflate(resp.read()))
 802             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 803             resp.msg = old_resp.msg
 804         return resp
 805
 806     https_request = http_request
 807     https_response = http_response
 808
 809
 810 def parse_iso8601(date_str, delimiter='T'):
 811     """ Return a UNIX timestamp from the given date """
 812
 813     if date_str is None:
 814         return None
 815
 816     m = re.search(
 817         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
 818         date_str)
 819     if not m:
 820         timezone = datetime.timedelta()
 821     else:
 822         date_str = date_str[:-len(m.group(0))]
 823         if not m.group('sign'):
 824             timezone = datetime.timedelta()
 825         else:
 826             sign = 1 if m.group('sign') == '+' else -1
 827             timezone = datetime.timedelta(
 828                 hours=sign * int(m.group('hours')),
 829                 minutes=sign * int(m.group('minutes')))
 830     date_format =  '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 831     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 832     return calendar.timegm(dt.timetuple())
 833
 834
 835 def unified_strdate(date_str):
 836     """Return a string with the date in the format YYYYMMDD"""
 837
 838     if date_str is None:
 839         return None
 840
 841     upload_date = None
 842     #Replace commas
 843     date_str = date_str.replace(',', ' ')
 844     # %z (UTC offset) is only supported in python>=3.2
 845     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 846     format_expressions = [
 847         '%d %B %Y',
 848         '%d %b %Y',
 849         '%B %d %Y',
 850         '%b %d %Y',
 851         '%b %dst %Y %I:%M%p',
 852         '%b %dnd %Y %I:%M%p',
 853         '%b %dth %Y %I:%M%p',
 854         '%Y-%m-%d',
 855         '%Y/%m/%d',
 856         '%d.%m.%Y',
 857         '%d/%m/%Y',
 858         '%Y/%m/%d %H:%M:%S',
 859         '%Y-%m-%d %H:%M:%S',
 860         '%d.%m.%Y %H:%M',
 861         '%d.%m.%Y %H.%M',
 862         '%Y-%m-%dT%H:%M:%SZ',
 863         '%Y-%m-%dT%H:%M:%S.%fZ',
 864         '%Y-%m-%dT%H:%M:%S.%f0Z',
 865         '%Y-%m-%dT%H:%M:%S',
 866         '%Y-%m-%dT%H:%M:%S.%f',
 867         '%Y-%m-%dT%H:%M',
 868     ]
 869     for expression in format_expressions:
 870         try:
 871             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 872         except ValueError:
 873             pass
 874     if upload_date is None:
 875         timetuple = email.utils.parsedate_tz(date_str)
 876         if timetuple:
 877             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 878     return upload_date
 879
 880 def determine_ext(url, default_ext=u'unknown_video'):
 881     if url is None:
 882         return default_ext
 883     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 884     if re.match(r'^[A-Za-z0-9]+$', guess):
 885         return guess
 886     else:
 887         return default_ext
 888
 889 def subtitles_filename(filename, sub_lang, sub_format):
 890     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 891
 892 def date_from_str(date_str):
 893     """
 894     Return a datetime object from a string in the format YYYYMMDD or
 895     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 896     today = datetime.date.today()
 897     if date_str == 'now'or date_str == 'today':
 898         return today
 899     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 900     if match is not None:
 901         sign = match.group('sign')
 902         time = int(match.group('time'))
 903         if sign == '-':
 904             time = -time
 905         unit = match.group('unit')
 906         #A bad aproximation?
 907         if unit == 'month':
 908             unit = 'day'
 909             time *= 30
 910         elif unit == 'year':
 911             unit = 'day'
 912             time *= 365
 913         unit += 's'
 914         delta = datetime.timedelta(**{unit: time})
 915         return today + delta
 916     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 917
 918 def hyphenate_date(date_str):
 919     """
 920     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 921     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 922     if match is not None:
 923         return '-'.join(match.groups())
 924     else:
 925         return date_str
 926
 927 class DateRange(object):
 928     """Represents a time interval between two dates"""
 929     def __init__(self, start=None, end=None):
 930         """start and end must be strings in the format accepted by date"""
 931         if start is not None:
 932             self.start = date_from_str(start)
 933         else:
 934             self.start = datetime.datetime.min.date()
 935         if end is not None:
 936             self.end = date_from_str(end)
 937         else:
 938             self.end = datetime.datetime.max.date()
 939         if self.start > self.end:
 940             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 941     @classmethod
 942     def day(cls, day):
 943         """Returns a range that only contains the given day"""
 944         return cls(day,day)
 945     def __contains__(self, date):
 946         """Check if the date is in the range"""
 947         if not isinstance(date, datetime.date):
 948             date = date_from_str(date)
 949         return self.start <= date <= self.end
 950     def __str__(self):
 951         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 952
 953
 954 def platform_name():
 955     """ Returns the platform name as a compat_str """
 956     res = platform.platform()
 957     if isinstance(res, bytes):
 958         res = res.decode(preferredencoding())
 959
 960     assert isinstance(res, compat_str)
 961     return res
 962
 963
def _windows_write_string(s, out):
    """ Write s to the Windows console through the WriteConsoleW API.

    Returns True if the string was written using special methods,
    False if it has yet to be written out (out has no usable fileno, is not
    stdout/stderr, or its handle is not a console).
    Raises OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> id passed to GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE and -12 is STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters WriteConsoleW actually wrote
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        """Return True unless handle is a character device for which
        GetConsoleMode succeeds."""
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        """Index of the first character above U+FFFF in s, or len(s)."""
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters, stopping before a non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; it is written
        # on its own with a length of 2
        # NOTE(review): presumably its two UTF-16 code units - confirm
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part the console accepted
            s = s[written.value:]
    return True
1034
1035
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no encoding is given, the console API is tried first.
    Otherwise s is encoded (dropping unencodable characters) for binary
    streams and for streams exposing a .buffer, and written as text to
    anything else.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console_api = (
        sys.platform == 'win32' and encoding is None
        and hasattr(out, 'fileno'))
    if try_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    expects_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if expects_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Bypass the text layer so the chosen encoding is really used
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1056
1057
def bytes_to_intlist(bs):
    """Return the items of the byte string bs as a list of integers."""
    if not bs:
        return []
    # Indexing bytes gives ints on Python 3 but 1-char strings on Python 2
    first_is_int = isinstance(bs[0], int)
    return list(bs) if first_is_int else [ord(ch) for ch in bs]
1065
1066
def intlist_to_bytes(xs):
    """Return a byte string built from the list of integers xs."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        # Python 3: bytes() accepts an iterable of ints directly
        return bytes(xs)
    # Python 2: chr() already yields one-byte strings
    return ''.join([chr(x) for x in xs])
1074
1075
def get_cachedir(params={}):
    """Return the cache directory path.

    params['cachedir'] wins if present; otherwise the result is
    <XDG_CACHE_HOME or ~/.cache>/youtube-dl.
    """
    home_cache = os.path.expanduser('~/.cache')
    cache_root = os.environ.get('XDG_CACHE_HOME', home_cache)
    fallback = os.path.join(cache_root, 'youtube-dl')
    return params.get('cachedir', fallback)
1080
1081
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx via ctypes on Windows, fcntl.lockf
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes declaration of the OVERLAPPED structure passed to
    # LockFileEx/UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count handed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f starting at offset 0; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file(f); raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) lock on f."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
1145
1146
class locked_file(object):
    """Context manager around a file that stays locked while it is open.

    The file is opened in __init__ and locked in __enter__: a shared lock
    for mode 'r', an exclusive one for 'a' and 'w'. __exit__ unlocks and
    closes it. Iteration, read() and write() are forwarded to the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file raises OSError, which is not an IOError
            # on Python 2; close the file in both cases so it does not leak
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1176
1177
def shell_quote(args):
    """Return args joined into a single shell-escaped command line string.

    Byte string arguments (e.g. filenames from encodeFilename) are decoded
    with the filesystem encoding, falling back to UTF-8 if it is unknown.
    """
    # The pipes module is deprecated and gone in Python 3.13; shlex.quote is
    # the same function on Python 3.3+
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1189
1190
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ends the
        run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1198
1199
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-serialized, URL-encoded under the key __youtubedl_smuggle
    and appended to url after a '#'. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return u'#'.join((url, compat_urllib_parse.urlencode(payload)))
1206
1207
def unsmuggle_url(smug_url, default=None):
    """Split a URL made by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The smuggled part is everything after the last '#'
    url, _, sdata = smug_url.rpartition(u'#')
    payload = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1215
1216
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. 1536 -> '1.50KiB'.

    None gives 'N/A'; a str argument is converted with float() first.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log() is undefined for 0, which simply uses the first unit
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    units = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    suffix = units[exponent]
    scaled = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (scaled, suffix)
1229
1230
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable is used when set and non-empty;
    otherwise the output of `stty size` is parsed. Any failure to run or
    parse stty is ignored and yields None.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        # `stty size` prints "<rows> <columns>"
        return int(out.split()[1])
    except Exception:
        # Best effort only, but unlike a bare except this lets
        # KeyboardInterrupt and SystemExit through
        pass
    return None
1245
1246
def month_by_name(name):
    """ Return the number (1-12) of a month given its full English name,
    independently of the locale; None if name is not a month name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1257
1258
def fix_xml_ampersands(xml_str):
    """Replace every '&' in xml_str that does not start one of the
    predefined XML entities or a numeric character reference by '&amp;'"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1265
1266
def setproctitle(title):
    """Set the process name to title via prctl from libc.so.6.

    Does nothing if libc.so.6 cannot be loaded or has no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initializing the buffer from the bytes makes it one byte larger than
    # the title, so it ends in a NUL. A buffer of exactly len(title) bytes
    # would be unterminated and prctl could read past it.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # 15 is PR_SET_NAME in the Linux prctl(2) interface
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1279         return  # Strange libc, just skip this
1280
1281
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it has no such prefix."""
    return s[len(start):] if s.startswith(start) else s
1286
1287
def url_basename(url):
    """Return the last segment of the path of url ('' if the path is empty)."""
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
1291
1292
class HEADRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is always HEAD."""

    def get_method(self):
        """Return the HTTP method name used for this request."""
        return "HEAD"
1296
1297
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default if v is None or ''.

    If get_attr is given and v is not None, the attribute of that name is
    read from v first (a missing attribute counts as None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1305
1306
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1309
1310
def str_to_int(int_str):
    """Parse an integer written with ',' or '.' as digit group separators.

    Returns None if int_str is None.
    """
    if int_str is None:
        return None
    digits_only = re.sub(r'[,\.]', u'', int_str)
    return int(digits_only)
1316
1317
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1320
1321
def parse_duration(s):
    """Parse a duration such as '9:12:43', '3h11m53s' or '49s' into seconds.

    Returns None if s is None or does not match the expected layout. A
    trailing ':<digits>' group after the seconds is accepted and ignored.
    """
    if s is None:
        return None

    parts = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if parts is None:
        return None

    hours, mins, secs = parts.group('hours', 'mins', 'secs')
    total = int(secs)
    if mins:
        total += 60 * int(mins)
        # Hours can only be present together with minutes
        if hours:
            total += 60 * 60 * int(hours)
    return total
1336
1337
def prepend_extension(filename, ext):
    """Insert ext in front of the extension of filename: 'a.mp4', 'tmp' -> 'a.tmp.mp4'."""
    stem, original_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, original_ext)
1341
1342
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary could not be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Drain both pipes and wait for the process to finish
        process.communicate()
    except OSError:
        return False
    return exe
1351
1352
class PagedList(object):
    """List-like view over results that are fetched one page at a time.

    pagefunc(pagenum) returns an iterable with the entries of the given
    zero-based page; pagesize is the number of entries on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list,
        fetching only the pages that are needed."""
        collected = []
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_limit = page_first + size
            if start >= page_limit:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of start inside this page (0 on all later pages)
            if page_first <= start < page_limit:
                lo = start % size
            else:
                lo = 0

            # Offset of end inside this page, if the slice ends here
            if end is not None and page_first <= end <= page_limit:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_limit:
                break
        return collected
1398
1399
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape (8 hex digits) in s by the
    character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, consumed length)
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1406
# Define struct_pack/struct_unpack: probe whether struct.pack accepts a text
# format spec and wrap struct.pack/struct.unpack only when it does not.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that also accepts a text (compat_str) format spec."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that also accepts a text (compat_str) format spec."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Text specs work natively, use the struct functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1423
1424
def read_batch_urls(batch_fd):
    """Read URLs from the file-like object batch_fd, one per line.

    Byte lines are decoded as UTF-8, a leading byte order mark and
    surrounding whitespace are removed, and empty lines as well as lines
    starting with '#', ';' or ']' are skipped. batch_fd is closed afterwards.
    Returns the list of remaining URLs.
    """
    # A correctly decoded BOM is U+FEFF. The second form is what the UTF-8
    # BOM bytes look like when they were decoded as Latin-1 instead.
    BOMS = (u'\ufeff', u'\xef\xbb\xbf')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1439
1440
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1443
1444
def parse_xml(s):
    """Parse the XML text s and return its root element.

    A DOCTYPE declaration in the document is ignored.
    """
    etree = xml.etree.ElementTree

    class DoctypeIgnoringBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=DoctypeIgnoringBuilder())
    # The custom parser is only handed over on Python 2.7 and newer
    if sys.version_info >= (2, 7):
        kwargs = {'parser': parser}
    else:
        kwargs = {}
    return etree.XML(s.encode('utf-8'), **kwargs)
1453
1454
# compat_getpass: getpass.getpass, except on Python 2 under Windows where a
# text prompt is encoded with the preferred encoding first
if sys.platform != 'win32' or sys.version_info >= (3, 0):
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1462
1463
# Maps US content rating labels to a number.
# NOTE(review): presumably the minimum viewer age in years - confirm
# against the code that uses it.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1471
1472
def strip_jsonp(code):
    """Strip a JSONP wrapper: 'callback(<payload>);' becomes '<payload>'.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_call, r'\1', code)
1475
1476
def js_to_json(code):
    """Turn a JavaScript object literal into JSON text (best effort).

    Keys that are unquoted or single-quoted become double-quoted,
    single-quoted string values become double-quoted, and a trailing comma
    before ']' is removed. Single-quoted keys or values that contain a
    double quote trip an assertion.
    """
    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            key = '"%s"' % key

        # Empty when the value is a nested object or array (see below)
        value = m.group(4)
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return m.group(1) + key + m.group(3) + value

    # An opening '{' or '[' as value is only looked at, not consumed.
    # Otherwise the next match could not start at that brace and the first
    # key of a nested object would stay unquoted.
    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|
                (?=\[|\{)
            )
        ''', fix_kv, code)
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
1503
1504
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 if the id is not in the list. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1513
1514
# Default output filename template: a %-format string with the named
# fields title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1516
# subprocess.check_output is missing on Python 2.6; provide a replacement
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return its standard output.

        Takes the same arguments as subprocess.Popen. Raises
        subprocess.CalledProcessError, with the captured output in its
        .output attribute, if the command exits with a non-zero status.
        """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Popen.args and the output= keyword of CalledProcessError do not
            # exist on the old Pythons that reach this fallback, so take the
            # command from our own arguments and attach the output by hand
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output