_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import calendar
   5 import codecs
   6 import contextlib
   7 import ctypes
   8 import datetime
   9 import time
  10 import email.utils
  11 import errno
  12 import getpass
  13 import gzip
  14 import itertools
  15 import io
  16 import json
  17 import locale
  18 import math
  19 import os
  20 import pipes
  21 import platform
  22 import re
  23 import ssl
  24 import socket
  25 import struct
  26 import subprocess
  27 import sys
  28 import traceback
  29 import xml.etree.ElementTree
  30 import zlib
  31
  32 try:
  33     import urllib.request as compat_urllib_request
  34 except ImportError: # Python 2
  35     import urllib2 as compat_urllib_request
  36
  37 try:
  38     import urllib.error as compat_urllib_error
  39 except ImportError: # Python 2
  40     import urllib2 as compat_urllib_error
  41
  42 try:
  43     import urllib.parse as compat_urllib_parse
  44 except ImportError: # Python 2
  45     import urllib as compat_urllib_parse
  46
  47 try:
  48     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  49 except ImportError: # Python 2
  50     from urlparse import urlparse as compat_urllib_parse_urlparse
  51
  52 try:
  53     import urllib.parse as compat_urlparse
  54 except ImportError: # Python 2
  55     import urlparse as compat_urlparse
  56
  57 try:
  58     import http.cookiejar as compat_cookiejar
  59 except ImportError: # Python 2
  60     import cookielib as compat_cookiejar
  61
  62 try:
  63     import html.entities as compat_html_entities
  64 except ImportError: # Python 2
  65     import htmlentitydefs as compat_html_entities
  66
  67 try:
  68     import html.parser as compat_html_parser
  69 except ImportError: # Python 2
  70     import HTMLParser as compat_html_parser
  71
  72 try:
  73     import http.client as compat_http_client
  74 except ImportError: # Python 2
  75     import httplib as compat_http_client
  76
  77 try:
  78     from urllib.error import HTTPError as compat_HTTPError
  79 except ImportError:  # Python 2
  80     from urllib2 import HTTPError as compat_HTTPError
  81
  82 try:
  83     from urllib.request import urlretrieve as compat_urlretrieve
  84 except ImportError:  # Python 2
  85     from urllib import urlretrieve as compat_urlretrieve
  86
  87
# compat_subprocess_get_DEVNULL() returns an object suitable for discarding
# subprocess output: subprocess.DEVNULL when it can be imported, otherwise a
# file object opened for writing on os.devnull.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    # NOTE: each call opens a new file object; nothing in this shim closes it.
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  93
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:
    # Fallback used when urllib.parse cannot be imported.
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in ``string`` with the characters they encode.

        Consecutive escapes are accumulated as bytes and decoded together
        using ``encoding`` and ``errors`` (None selects 'utf-8' and
        'replace' respectively). A '%' that is followed by nothing is kept
        as a literal '%'.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' in the input: nothing to decode.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        # Everything before the first '%' is copied through unchanged.
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    # Empty segment: the '%' had nothing after it.
                    raise ValueError
                # The first two characters of the segment are the hex digits.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Keep the segment literally, restoring the '%' removed by split().
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
 131
 132
 133 try:
 134     from urllib.parse import parse_qs as compat_parse_qs
 135 except ImportError: # Python 2
 136     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
 137     # Python 2's version is apparently totally broken
 138
 139     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 140                 encoding='utf-8', errors='replace'):
 141         qs, _coerce_result = qs, unicode
 142         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 143         r = []
 144         for name_value in pairs:
 145             if not name_value and not strict_parsing:
 146                 continue
 147             nv = name_value.split('=', 1)
 148             if len(nv) != 2:
 149                 if strict_parsing:
 150                     raise ValueError("bad query field: %r" % (name_value,))
 151                 # Handle case of a control-name with no equal sign
 152                 if keep_blank_values:
 153                     nv.append('')
 154                 else:
 155                     continue
 156             if len(nv[1]) or keep_blank_values:
 157                 name = nv[0].replace('+', ' ')
 158                 name = compat_urllib_parse_unquote(
 159                     name, encoding=encoding, errors=errors)
 160                 name = _coerce_result(name)
 161                 value = nv[1].replace('+', ' ')
 162                 value = compat_urllib_parse_unquote(
 163                     value, encoding=encoding, errors=errors)
 164                 value = _coerce_result(value)
 165                 r.append((name, value))
 166         return r
 167
 168     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 169                 encoding='utf-8', errors='replace'):
 170         parsed_result = {}
 171         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 172                         encoding=encoding, errors=errors)
 173         for name, value in pairs:
 174             if name in parsed_result:
 175                 parsed_result[name].append(value)
 176             else:
 177                 parsed_result[name] = [value]
 178         return parsed_result
 179
# Text type: ``unicode`` where that builtin exists, otherwise ``str``.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: ``unichr`` where it exists,
# otherwise ``chr``.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# Exception class to catch for XML parse failures; falls back to expat's
# ExpatError when ElementTree does not provide ParseError.
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
 194
 195 def compat_ord(c):
 196     if type(c) is int: return c
 197     else: return ord(c)
 198
# This is not clearly defined otherwise: obtain the type of compiled regular
# expressions from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request() applies every
# entry to each outgoing request, replacing any header of the same name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 209
 210 def preferredencoding():
 211     """Get preferred encoding.
 212
 213     Returns the best encoding scheme for the system, based on
 214     locale.getpreferredencoding() and some further tweaks.
 215     """
 216     try:
 217         pref = locale.getpreferredencoding()
 218         u'TEST'.encode(pref)
 219     except:
 220         pref = 'UTF-8'
 221
 222     return pref
 223
 224 if sys.version_info < (3,0):
 225     def compat_print(s):
 226         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 227 else:
 228     def compat_print(s):
 229         assert type(s) == type(u'')
 230         print(s)
 231
 232 # In Python 2.x, json.dump expects a bytestream.
 233 # In Python 3.x, it writes to a character stream
 234 if sys.version_info < (3,0):
 235     def write_json_file(obj, fn):
 236         with open(fn, 'wb') as f:
 237             json.dump(obj, f)
 238 else:
 239     def write_json_file(obj, fn):
 240         with open(fn, 'w', encoding='utf-8') as f:
 241             json.dump(obj, f)
 242
 243 if sys.version_info >= (2,7):
 244     def find_xpath_attr(node, xpath, key, val):
 245         """ Find the xpath xpath[@key=val] """
 246         assert re.match(r'^[a-zA-Z-]+$', key)
 247         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 248         expr = xpath + u"[@%s='%s']" % (key, val)
 249         return node.find(expr)
 250 else:
 251     def find_xpath_attr(node, xpath, key, val):
 252         for f in node.findall(xpath):
 253             if f.attrib.get(key) == val:
 254                 return f
 255         return None
 256
 257 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 258 # the namespace parameter
 259 def xpath_with_ns(path, ns_map):
 260     components = [c.split(':') for c in path.split('/')]
 261     replaced = []
 262     for c in components:
 263         if len(c) == 1:
 264             replaced.append(c[0])
 265         else:
 266             ns, tag = c
 267             replaced.append('{%s}%s' % (ns_map[ns], tag))
 268     return '/'.join(replaced)
 269
 270 def htmlentity_transform(matchobj):
 271     """Transforms an HTML entity to a character.
 272
 273     This function receives a match object and is intended to be used with
 274     the re.sub() function.
 275     """
 276     entity = matchobj.group(1)
 277
 278     # Known non-numeric HTML entity
 279     if entity in compat_html_entities.name2codepoint:
 280         return compat_chr(compat_html_entities.name2codepoint[entity])
 281
 282     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 283     if mobj is not None:
 284         numstr = mobj.group(1)
 285         if numstr.startswith(u'x'):
 286             base = 16
 287             numstr = u'0%s' % numstr
 288         else:
 289             base = 10
 290         return compat_chr(int(numstr, base))
 291
 292     # Unknown entity in name, return its literal representation
 293     return (u'&%s;' % entity)
 294
# Override the parser module's ``locatestarttagend`` pattern (the regex that
# locates the end of a start tag) with the version below.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 296 class BaseHTMLParser(compat_html_parser.HTMLParser):
 297     def __init(self):
 298         compat_html_parser.HTMLParser.__init__(self)
 299         self.html = None
 300
 301     def loads(self, html):
 302         self.html = html
 303         self.feed(html)
 304         self.close()
 305
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Once complete: [tag name, start position, end position]
        self.result = None
        # True while parsing inside the matched element
        self.started = False
        # Count of currently open tags per tag name, tracked from the match on
        self.depth = {}
        # True until the position right after the matched opening tag is recorded
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by restarting on a later line.

        Gives up (raises HTMLParseError) after more than 10 errors or once
        the wanted element has been entered.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested tag is the first thing after the matched opening tag
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matched element is closed once its own tag count is back to 0
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other kind of content also marks where the element body begins
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None when no complete element was found."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are (line, column) pairs; the line number is used 1-based here
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line: the end column must be shifted by
            # the characters already cut from the front
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, skip over the literal "</scr'+'ipt>"
# instead of handing it to HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 371
 372 def get_element_by_id(id, html):
 373     """Return the content of the tag with the specified ID in the passed HTML document"""
 374     return get_element_by_attribute("id", id, html)
 375
 376 def get_element_by_attribute(attribute, value, html):
 377     """Return the content of the tag with the specified attribute in the passed HTML document"""
 378     parser = AttrParser(attribute, value)
 379     try:
 380         parser.loads(html)
 381     except compat_html_parser.HTMLParseError:
 382         pass
 383     return parser.get_result()
 384
 385 class MetaParser(BaseHTMLParser):
 386     """
 387     Modified HTMLParser that isolates a meta tag with the specified name
 388     attribute.
 389     """
 390     def __init__(self, name):
 391         BaseHTMLParser.__init__(self)
 392         self.name = name
 393         self.content = None
 394         self.result = None
 395
 396     def handle_starttag(self, tag, attrs):
 397         if tag != 'meta':
 398             return
 399         attrs = dict(attrs)
 400         if attrs.get('name') == self.name:
 401             self.result = attrs.get('content')
 402
 403     def get_result(self):
 404         return self.result
 405
 406 def get_meta_content(name, html):
 407     """
 408     Return the content attribute from the meta tag with the given name attribute.
 409     """
 410     parser = MetaParser(name)
 411     try:
 412         parser.loads(html)
 413     except compat_html_parser.HTMLParseError:
 414         pass
 415     return parser.get_result()
 416
 417
 418 def clean_html(html):
 419     """Clean an HTML snippet into a readable string"""
 420     # Newline vs <br />
 421     html = html.replace('\n', ' ')
 422     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 423     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 424     # Strip html tags
 425     html = re.sub('<.*?>', '', html)
 426     # Replace html entities
 427     html = unescapeHTML(html)
 428     return html.strip()
 429
 430
 431 def sanitize_open(filename, open_mode):
 432     """Try to open the given filename, and slightly tweak it if this fails.
 433
 434     Attempts to open the given filename. If this fails, it tries to change
 435     the filename slightly, step by step, until it's either able to open it
 436     or it fails and raises a final exception, like the standard open()
 437     function.
 438
 439     It returns the tuple (stream, definitive_file_name).
 440     """
 441     try:
 442         if filename == u'-':
 443             if sys.platform == 'win32':
 444                 import msvcrt
 445                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 446             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 447         stream = open(encodeFilename(filename), open_mode)
 448         return (stream, filename)
 449     except (IOError, OSError) as err:
 450         if err.errno in (errno.EACCES,):
 451             raise
 452
 453         # In case of error, try to remove win32 forbidden chars
 454         alt_filename = os.path.join(
 455                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 456                         for path_part in os.path.split(filename)
 457                        )
 458         if alt_filename == filename:
 459             raise
 460         else:
 461             # An exception here should be caught in the caller
 462             stream = open(encodeFilename(filename), open_mode)
 463             return (stream, alt_filename)
 464
 465
 466 def timeconvert(timestr):
 467     """Convert RFC 2822 defined time string into system timestamp"""
 468     timestamp = None
 469     timetuple = email.utils.parsedate_tz(timestr)
 470     if timetuple is not None:
 471         timestamp = email.utils.mktime_tz(timetuple)
 472     return timestamp
 473
 474 def sanitize_filename(s, restricted=False, is_id=False):
 475     """Sanitizes a string so it could be used as part of a filename.
 476     If restricted is set, use a stricter subset of allowed characters.
 477     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 478     """
 479     def replace_insane(char):
 480         if char == '?' or ord(char) < 32 or ord(char) == 127:
 481             return ''
 482         elif char == '"':
 483             return '' if restricted else '\''
 484         elif char == ':':
 485             return '_-' if restricted else ' -'
 486         elif char in '\\/|*<>':
 487             return '_'
 488         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 489             return '_'
 490         if restricted and ord(char) > 127:
 491             return '_'
 492         return char
 493
 494     result = u''.join(map(replace_insane, s))
 495     if not is_id:
 496         while '__' in result:
 497             result = result.replace('__', '_')
 498         result = result.strip('_')
 499         # Common case of "Foreign band name - English song title"
 500         if restricted and result.startswith('-_'):
 501             result = result[2:]
 502         if not result:
 503             result = '_'
 504     return result
 505
 506 def orderedSet(iterable):
 507     """ Remove all duplicates from the input iterable """
 508     res = []
 509     for el in iterable:
 510         if el not in res:
 511             res.append(el)
 512     return res
 513
 514
 515 def unescapeHTML(s):
 516     if s is None:
 517         return None
 518     assert type(s) == compat_str
 519
 520     result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
 521     return result
 522
 523
 524 def encodeFilename(s, for_subprocess=False):
 525     """
 526     @param s The name of the file
 527     """
 528
 529     assert type(s) == compat_str
 530
 531     # Python 3 has a Unicode API
 532     if sys.version_info >= (3, 0):
 533         return s
 534
 535     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 536         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 537         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 538         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 539         if not for_subprocess:
 540             return s
 541         else:
 542             # For subprocess calls, encode with locale encoding
 543             # Refer to http://stackoverflow.com/a/9951851/35070
 544             encoding = preferredencoding()
 545     else:
 546         encoding = sys.getfilesystemencoding()
 547     if encoding is None:
 548         encoding = 'utf-8'
 549     return s.encode(encoding, 'ignore')
 550
 551
 552 def encodeArgument(s):
 553     if not isinstance(s, compat_str):
 554         # Legacy code that uses byte strings
 555         # Uncomment the following line after fixing all post processors
 556         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 557         s = s.decode('ascii')
 558     return encodeFilename(s, True)
 559
 560
 561 def decodeOption(optval):
 562     if optval is None:
 563         return optval
 564     if isinstance(optval, bytes):
 565         optval = optval.decode(preferredencoding())
 566
 567     assert isinstance(optval, compat_str)
 568     return optval
 569
 570 def formatSeconds(secs):
 571     if secs > 3600:
 572         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 573     elif secs > 60:
 574         return '%d:%02d' % (secs // 60, secs % 60)
 575     else:
 576         return '%d' % secs
 577
 578
 579 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 580     if sys.version_info < (3, 2):
 581         import httplib
 582
 583         class HTTPSConnectionV3(httplib.HTTPSConnection):
 584             def __init__(self, *args, **kwargs):
 585                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 586
 587             def connect(self):
 588                 sock = socket.create_connection((self.host, self.port), self.timeout)
 589                 if getattr(self, '_tunnel_host', False):
 590                     self.sock = sock
 591                     self._tunnel()
 592                 try:
 593                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 594                 except ssl.SSLError:
 595                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 596
 597         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 598             def https_open(self, req):
 599                 return self.do_open(HTTPSConnectionV3, req)
 600         return HTTPSHandlerV3(**kwargs)
 601     else:
 602         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 603         context.verify_mode = (ssl.CERT_NONE
 604                                if opts_no_check_certificate
 605                                else ssl.CERT_REQUIRED)
 606         context.set_default_verify_paths()
 607         try:
 608             context.load_default_certs()
 609         except AttributeError:
 610             pass  # Python < 3.4
 611         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 612
 613 class ExtractorError(Exception):
 614     """Error during info extraction."""
 615     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 616         """ tb, if given, is the original traceback (so that it can be printed out).
 617         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 618         """
 619
 620         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 621             expected = True
 622         if video_id is not None:
 623             msg = video_id + ': ' + msg
 624         if not expected:
 625             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 626         super(ExtractorError, self).__init__(msg)
 627
 628         self.traceback = tb
 629         self.exc_info = sys.exc_info()  # preserve original exception
 630         self.cause = cause
 631         self.video_id = video_id
 632
 633     def format_traceback(self):
 634         if self.traceback is None:
 635             return None
 636         return u''.join(traceback.format_tb(self.traceback))
 637
 638
 639 class RegexNotFoundError(ExtractorError):
 640     """Error when a regex didn't match"""
 641     pass
 642
 643
 644 class DownloadError(Exception):
 645     """Download Error exception.
 646
 647     This exception may be thrown by FileDownloader objects if they are not
 648     configured to continue on errors. They will contain the appropriate
 649     error message.
 650     """
 651     def __init__(self, msg, exc_info=None):
 652         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 653         super(DownloadError, self).__init__(msg)
 654         self.exc_info = exc_info
 655
 656
 657 class SameFileError(Exception):
 658     """Same File exception.
 659
 660     This exception will be thrown by FileDownloader objects if they detect
 661     multiple files would have to be downloaded to the same file on disk.
 662     """
 663     pass
 664
 665
 666 class PostProcessingError(Exception):
 667     """Post Processing exception.
 668
 669     This exception may be raised by PostProcessor's .run() method to
 670     indicate an error in the postprocessing task.
 671     """
 672     def __init__(self, msg):
 673         self.msg = msg
 674
 675 class MaxDownloadsReached(Exception):
 676     """ --max-downloads limit has been reached. """
 677     pass
 678
 679
 680 class UnavailableVideoError(Exception):
 681     """Unavailable Format exception.
 682
 683     This exception will be thrown when a video is requested
 684     in a format that is not available for that video.
 685     """
 686     pass
 687
 688
 689 class ContentTooShortError(Exception):
 690     """Content Too Short exception.
 691
 692     This exception may be raised by FileDownloader objects when a file they
 693     download is too small for what the server announced first, indicating
 694     the connection was probably interrupted.
 695     """
 696     # Both in bytes
 697     downloaded = None
 698     expected = None
 699
 700     def __init__(self, downloaded, expected):
 701         self.downloaded = downloaded
 702         self.expected = expected
 703
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and then a
        zlib-wrapped one."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, whether or
        not the addinfourl constructor accepts it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Apply std_headers to the request and process the internal
        Youtubedl-* control headers. Returns the modified request."""
        # The standard headers replace any header of the same name
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Control header: do not ask the server for a compressed response
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Control header: use the given User-Agent for this request
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        #print("sleeping\n")
        #time.sleep(1)
        return req

    def http_response(self, req, resp):
        """Return a response whose body has been decompressed according to
        its Content-encoding header (gzip or deflate)."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the first failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same treatment
    https_request = http_request
    https_response = http_response
 786
 787
 788 def parse_iso8601(date_str, delimiter='T'):
 789     """ Return a UNIX timestamp from the given date """
 790
 791     if date_str is None:
 792         return None
 793
 794     m = re.search(
 795         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
 796         date_str)
 797     if not m:
 798         timezone = datetime.timedelta()
 799     else:
 800         date_str = date_str[:-len(m.group(0))]
 801         if not m.group('sign'):
 802             timezone = datetime.timedelta()
 803         else:
 804             sign = 1 if m.group('sign') == '+' else -1
 805             timezone = datetime.timedelta(
 806                 hours=sign * int(m.group('hours')),
 807                 minutes=sign * int(m.group('minutes')))
 808     date_format =  '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 809     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 810     return calendar.timegm(dt.timetuple())
 811
 812
 813 def unified_strdate(date_str):
 814     """Return a string with the date in the format YYYYMMDD"""
 815
 816     if date_str is None:
 817         return None
 818
 819     upload_date = None
 820     #Replace commas
 821     date_str = date_str.replace(',', ' ')
 822     # %z (UTC offset) is only supported in python>=3.2
 823     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 824     format_expressions = [
 825         '%d %B %Y',
 826         '%d %b %Y',
 827         '%B %d %Y',
 828         '%b %d %Y',
 829         '%b %dst %Y %I:%M%p',
 830         '%b %dnd %Y %I:%M%p',
 831         '%b %dth %Y %I:%M%p',
 832         '%Y-%m-%d',
 833         '%d.%m.%Y',
 834         '%d/%m/%Y',
 835         '%Y/%m/%d %H:%M:%S',
 836         '%Y-%m-%d %H:%M:%S',
 837         '%d.%m.%Y %H:%M',
 838         '%d.%m.%Y %H.%M',
 839         '%Y-%m-%dT%H:%M:%SZ',
 840         '%Y-%m-%dT%H:%M:%S.%fZ',
 841         '%Y-%m-%dT%H:%M:%S.%f0Z',
 842         '%Y-%m-%dT%H:%M:%S',
 843         '%Y-%m-%dT%H:%M:%S.%f',
 844         '%Y-%m-%dT%H:%M',
 845     ]
 846     for expression in format_expressions:
 847         try:
 848             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 849         except ValueError:
 850             pass
 851     if upload_date is None:
 852         timetuple = email.utils.parsedate_tz(date_str)
 853         if timetuple:
 854             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 855     return upload_date
 856
 857 def determine_ext(url, default_ext=u'unknown_video'):
 858     if url is None:
 859         return default_ext
 860     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 861     if re.match(r'^[A-Za-z0-9]+$', guess):
 862         return guess
 863     else:
 864         return default_ext
 865
 866 def subtitles_filename(filename, sub_lang, sub_format):
 867     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 868
 869 def date_from_str(date_str):
 870     """
 871     Return a datetime object from a string in the format YYYYMMDD or
 872     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 873     today = datetime.date.today()
 874     if date_str == 'now'or date_str == 'today':
 875         return today
 876     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 877     if match is not None:
 878         sign = match.group('sign')
 879         time = int(match.group('time'))
 880         if sign == '-':
 881             time = -time
 882         unit = match.group('unit')
 883         #A bad aproximation?
 884         if unit == 'month':
 885             unit = 'day'
 886             time *= 30
 887         elif unit == 'year':
 888             unit = 'day'
 889             time *= 365
 890         unit += 's'
 891         delta = datetime.timedelta(**{unit: time})
 892         return today + delta
 893     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 894
 895 def hyphenate_date(date_str):
 896     """
 897     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 898     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 899     if match is not None:
 900         return '-'.join(match.groups())
 901     else:
 902         return date_str
 903
 904 class DateRange(object):
 905     """Represents a time interval between two dates"""
 906     def __init__(self, start=None, end=None):
 907         """start and end must be strings in the format accepted by date"""
 908         if start is not None:
 909             self.start = date_from_str(start)
 910         else:
 911             self.start = datetime.datetime.min.date()
 912         if end is not None:
 913             self.end = date_from_str(end)
 914         else:
 915             self.end = datetime.datetime.max.date()
 916         if self.start > self.end:
 917             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 918     @classmethod
 919     def day(cls, day):
 920         """Returns a range that only contains the given day"""
 921         return cls(day,day)
 922     def __contains__(self, date):
 923         """Check if the date is in the range"""
 924         if not isinstance(date, datetime.date):
 925             date = date_from_str(date)
 926         return self.start <= date <= self.end
 927     def __str__(self):
 928         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 929
 930
 931 def platform_name():
 932     """ Returns the platform name as a compat_str """
 933     res = platform.platform()
 934     if isinstance(res, bytes):
 935         res = res.decode(preferredencoding())
 936
 937     assert isinstance(res, compat_str)
 938     return res
 939
 940
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write, out the stream it is destined for. The string
    is only written here (via WriteConsoleW) if out has file descriptor 1
    or 2 and the matching standard handle turns out to be a console.
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (presumably STD_OUTPUT_HANDLE / STD_ERROR_HANDLE - confirm against
    # the Win32 documentation)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, for handles whose file type (ignoring
        # the REMOTE bit) is not CHAR and for handles on which
        # GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that never contain a
    # non-BMP character; such a character is written on its own, with a
    # count of 2 passed to WriteConsoleW
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
1011
1012
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it

    On Windows, when no encoding is forced, _windows_write_string gets the
    first try. Otherwise s is encoded (ignoring unencodable characters)
    for byte streams and written as is to plain text streams.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None
        and hasattr(out, 'fileno'))
    if try_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1033
1034
def bytes_to_intlist(bs):
    """Convert a byte string to a list of integers; empty input gives []"""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing yields one-character strings here (Python 2 str)
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
1042
1043
def intlist_to_bytes(xs):
    """Convert a list of integers to a byte string; empty input gives b''"""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already produces one-byte strings
    return ''.join(map(chr, xs))
1051
1052
def get_cachedir(params=None):
    """Return the directory to use for the cache

    params is an optional dict; its 'cachedir' entry wins if present.
    Otherwise the result is "youtube-dl" below $XDG_CACHE_HOME, or below
    ~/.cache when that variable is not set.
    """
    # None instead of a shared mutable {} default
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1057
1058
# Cross-platform file locking
# Both branches define the same pair of helpers: _lock_file(f, exclusive)
# and _unlock_file(f), where f is an open file object.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ctypes structure handed to LockFileEx / UnlockFileEx"""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high part of the number of bytes to (un)lock; together with
    # the zero offset set in _lock_file they are meant to cover the whole
    # file (going by the names)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f via LockFileEx; raises OSError if the call fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object, _unlock_file passes it to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flags: 0x2 for an exclusive lock, 0x0 otherwise
        # (presumably LOCKFILE_EXCLUSIVE_LOCK - confirm against the
        # Win32 documentation)
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release a lock taken by _lock_file; raises OSError on failure"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f via fcntl.lockf, exclusively (LOCK_EX) or shared (LOCK_SH)"""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock on f (fcntl.LOCK_UN)"""
        fcntl.lockf(f, fcntl.LOCK_UN)
1122
1123
class locked_file(object):
    """File wrapper that holds a lock on the file inside a with block

    The file is opened in __init__ with io.open. Entering the context
    locks it (exclusively unless the mode is 'r'); leaving the context
    unlocks and closes it. Iteration, read and write are forwarded to the
    underlying file object.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file raises OSError, which
            # is not an IOError on Python 2; close the file in both cases
            # so that the handle does not leak
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1153
1154
def shell_quote(args):
    """Return the arguments as one string, each of them quoted for a shell

    args is an iterable of text or bytes; bytes are decoded with the
    filesystem encoding (utf-8 if that is unknown) before quoting.
    """
    # shlex.quote is the documented function; pipes.quote is only an
    # undocumented alias and the whole pipes module is deprecated
    try:
        from shlex import quote as quote_arg
    except ImportError:  # Python 2
        from pipes import quote as quote_arg

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote_arg(a))
    return u' '.join(quoted_args)
1166
1167
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but the first element for which pred is
        false is still yielded before the generator stops """
    for element in seq:
        yield element
        if pred(element):
            continue
        return
1175
1176
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized to JSON, URL-encoded under the key
    __youtubedl_smuggle and appended to url as its fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1183
1184
def unsmuggle_url(smug_url, default=None):
    """Split a URL made by smuggle_url into (url, data)

    A URL without smuggled data is returned unchanged, together with
    default."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    smuggled_json = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(smuggled_json)
1192
1193
def format_bytes(bytes):
    """Format a number of bytes with a binary suffix, e.g. u'1.50KiB'

    bytes may be a number, a str holding a number, or None (which gives
    u'N/A'). The result always has two decimals.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: tiny fractions used to
        # wrap around to u'YiB' through a negative index and values beyond
        # the table could raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1206
1207
def get_term_width():
    """Return the width of the terminal in columns, or None if unknown

    A non-empty COLUMNS environment variable takes precedence; otherwise
    the second field of the output of "stty size" is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (no stty, no terminal, unexpected output), but
        # do not swallow KeyboardInterrupt / SystemExit like a bare except
        return None
1222
1223
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None if name is not one of the twelve full English names. """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, english_name in enumerate(ENGLISH_NAMES, 1):
        if english_name == name:
            return number
    return None
1234
1235
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference are left alone."""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub(u'&amp;', xml_str)
1242
1243
def setproctitle(title):
    """Set the name of the process to title (a compat_str) through prctl

    Does nothing if libc.so.6 cannot be loaded or has no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes allocates one extra byte for
    # the terminating NUL. A buffer of exactly len(title_bytes) bytes has
    # no terminator, so prctl could read past its end.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # 15 is presumably PR_SET_NAME - confirm against <linux/prctl.h>
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1257
1258
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it
    does not begin with start"""
    if not s.startswith(start):
        return s
    return s[len(start):]
1263
1264
def url_basename(url):
    """Return the last segment of the path of url, ignoring leading and
    trailing slashes (u'' if the path is empty)"""
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed_path.rpartition(u'/')[2]
1268
1269
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method always reports HEAD"""

    def get_method(self):
        return "HEAD"
1273
1274
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default if v is None

    If get_attr is given, that attribute of v is converted instead of v
    itself (a missing attribute counts as None)."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) * invscale // scale
1280
1281
def str_to_int(int_str):
    """Convert a string such as u'123,456' or u'1.000' to an int by
    dropping all commas and dots first; None is passed through"""
    if int_str is None:
        return None
    return int(re.sub(r'[,\.]', u'', int_str))
1287
1288
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None"""
    if v is None:
        return default
    return float(v) * invscale / scale
1291
1292
def parse_duration(s):
    """Parse a duration such as u'1:02:03', u'2m3s' or u'45' and return
    the number of seconds; None if s is None or is not understood

    A trailing u':NN' part after the seconds is accepted but ignored."""
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if not m:
        return None

    # The pattern only allows hours together with minutes, and seconds are
    # mandatory, so summing up whatever matched is sufficient
    total = 0
    for group_name, factor in (('hours', 60 * 60), ('mins', 60), ('secs', 1)):
        value = m.group(group_name)
        if value:
            total += int(value) * factor
    return total
1307
1308
def prepend_extension(filename, ext):
    """Insert ext in front of the extension of filename, e.g.
    (u'abc.ext', u'temp') gives u'abc.temp.ext'"""
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1312
1313
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False if starting the binary raises OSError. """
    # None instead of a shared mutable [] default
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1322
1323
class PagedList(object):
    """List-like view on results that are fetched page by page

    pagefunc(pagenum) must return an iterable with the entries of the
    given (zero-based) page, pagesize is the number of entries on a full
    page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries [start:end] as a list, fetching only the
        pages that are needed"""
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            next_page_first = page_first + size
            if start >= next_page_first:
                continue

            page_items = list(self._pagefunc(pagenum))

            # Entries to skip at the front, only on the page holding start
            skip = 0
            if page_first <= start < next_page_first:
                skip = start % size

            # Cut-off within the page, only on the page holding end
            stop = None
            if end is not None and page_first <= end <= next_page_first:
                stop = ((end - 1) % size) + 1

            if skip != 0 or stop is not None:
                page_items = page_items[skip:stop]
            collected.extend(page_items)

            # A little optimization - a page that is not "full", i.e. has
            # fewer than pagesize entries, is assumed to be the last one,
            # so there is no need to query any further pages.
            if len(page_items) + skip < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_page_first:
                break
        return collected
1369
1370
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape (eight hex digits) in s by
    the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1377
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument,
    # so text format strings are encoded before they are handed over
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # struct copes with text format strings, use it directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1394
1395
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object and close it

    Returns the stripped, non-empty lines of batch_fd as a list; lines
    starting with '#', ';' or ']' are treated as comments and skipped.
    Byte lines are decoded as UTF-8.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 byte order mark shows up as u'\xef\xbb\xbf' if the file
        # was decoded bytewise, but as u'\ufeff' after a proper UTF-8
        # decode; strip() does not remove the latter, so handle both
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1410
1411
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode does and return the result
    as ASCII bytes"""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1414
1415
def parse_xml(s):
    """Parse the text s as XML and return the root element

    The parser is given a tree builder whose doctype() hook does nothing."""
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    data = s.encode('utf-8')
    # The custom parser is only passed on Python 2.7 and newer
    if sys.version_info >= (2, 7):
        return xml.etree.ElementTree.XML(data, parser=parser)
    return xml.etree.ElementTree.XML(data)
1424
1425
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    # Python 2 on Windows: a text prompt is encoded with the preferred
    # encoding before it is handed to getpass
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1433
1434
# Maps US content rating labels to an integer.
# NOTE(review): the values look like minimum viewer ages in years - confirm
# against the code that consumes this table.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1442
1443
def strip_jsonp(code):
    """Strip a JSONP wrapper: callback( ... ); becomes just the part
    between the parentheses. Input that does not look like a JSONP call is
    returned unchanged."""
    jsonp_call = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return jsonp_call.sub(r'\1', code)
1446
1447
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function that maps a quality id to its position in
    quality_ids, or to -1 if the id is not in there. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1456
1457
# Template with the %-style named fields title, id and ext; judging by the
# name, the default for output file names.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1459
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    # subprocess.check_output is missing (Python 2.6), emulate it
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return its output

        Raises subprocess.CalledProcessError (with the output attached as
        .output) if the command exits with a non-zero status."""
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Popen objects have no .args attribute on the old interpreters
            # this fallback exists for, and CalledProcessError takes no
            # output argument there either; use only what is available
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output