git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import calendar
   5 import codecs
   6 import contextlib
   7 import ctypes
   8 import datetime
   9 import email.utils
  10 import errno
  11 import getpass
  12 import gzip
  13 import itertools
  14 import io
  15 import json
  16 import locale
  17 import math
  18 import os
  19 import pipes
  20 import platform
  21 import re
  22 import ssl
  23 import socket
  24 import struct
  25 import subprocess
  26 import sys
  27 import traceback
  28 import xml.etree.ElementTree
  29 import zlib
  30
  31 try:
  32     import urllib.request as compat_urllib_request
  33 except ImportError: # Python 2
  34     import urllib2 as compat_urllib_request
  35
  36 try:
  37     import urllib.error as compat_urllib_error
  38 except ImportError: # Python 2
  39     import urllib2 as compat_urllib_error
  40
  41 try:
  42     import urllib.parse as compat_urllib_parse
  43 except ImportError: # Python 2
  44     import urllib as compat_urllib_parse
  45
  46 try:
  47     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  48 except ImportError: # Python 2
  49     from urlparse import urlparse as compat_urllib_parse_urlparse
  50
  51 try:
  52     import urllib.parse as compat_urlparse
  53 except ImportError: # Python 2
  54     import urlparse as compat_urlparse
  55
  56 try:
  57     import http.cookiejar as compat_cookiejar
  58 except ImportError: # Python 2
  59     import cookielib as compat_cookiejar
  60
  61 try:
  62     import html.entities as compat_html_entities
  63 except ImportError: # Python 2
  64     import htmlentitydefs as compat_html_entities
  65
  66 try:
  67     import html.parser as compat_html_parser
  68 except ImportError: # Python 2
  69     import HTMLParser as compat_html_parser
  70
  71 try:
  72     import http.client as compat_http_client
  73 except ImportError: # Python 2
  74     import httplib as compat_http_client
  75
  76 try:
  77     from urllib.error import HTTPError as compat_HTTPError
  78 except ImportError:  # Python 2
  79     from urllib2 import HTTPError as compat_HTTPError
  80
  81 try:
  82     from urllib.request import urlretrieve as compat_urlretrieve
  83 except ImportError:  # Python 2
  84     from urllib import urlretrieve as compat_urlretrieve
  85
  86
  87 try:
  88     from subprocess import DEVNULL
  89     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  90 except ImportError:
  91     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  92
  93 try:
  94     from urllib.parse import unquote as compat_urllib_parse_unquote
  95 except ImportError:
  96     def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
  97         if string == '':
  98             return string
  99         res = string.split('%')
 100         if len(res) == 1:
 101             return string
 102         if encoding is None:
 103             encoding = 'utf-8'
 104         if errors is None:
 105             errors = 'replace'
 106         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
 107         pct_sequence = b''
 108         string = res[0]
 109         for item in res[1:]:
 110             try:
 111                 if not item:
 112                     raise ValueError
 113                 pct_sequence += item[:2].decode('hex')
 114                 rest = item[2:]
 115                 if not rest:
 116                     # This segment was just a single percent-encoded character.
 117                     # May be part of a sequence of code units, so delay decoding.
 118                     # (Stored in pct_sequence).
 119                     continue
 120             except ValueError:
 121                 rest = '%' + item
 122             # Encountered non-percent-encoded characters. Flush the current
 123             # pct_sequence.
 124             string += pct_sequence.decode(encoding, errors) + rest
 125             pct_sequence = b''
 126         if pct_sequence:
 127             # Flush the final pct_sequence
 128             string += pct_sequence.decode(encoding, errors)
 129         return string
 130
 131
 132 try:
 133     from urllib.parse import parse_qs as compat_parse_qs
 134 except ImportError: # Python 2
 135     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
 136     # Python 2's version is apparently totally broken
 137
 138     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 139                 encoding='utf-8', errors='replace'):
 140         qs, _coerce_result = qs, unicode
 141         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 142         r = []
 143         for name_value in pairs:
 144             if not name_value and not strict_parsing:
 145                 continue
 146             nv = name_value.split('=', 1)
 147             if len(nv) != 2:
 148                 if strict_parsing:
 149                     raise ValueError("bad query field: %r" % (name_value,))
 150                 # Handle case of a control-name with no equal sign
 151                 if keep_blank_values:
 152                     nv.append('')
 153                 else:
 154                     continue
 155             if len(nv[1]) or keep_blank_values:
 156                 name = nv[0].replace('+', ' ')
 157                 name = compat_urllib_parse_unquote(
 158                     name, encoding=encoding, errors=errors)
 159                 name = _coerce_result(name)
 160                 value = nv[1].replace('+', ' ')
 161                 value = compat_urllib_parse_unquote(
 162                     value, encoding=encoding, errors=errors)
 163                 value = _coerce_result(value)
 164                 r.append((name, value))
 165         return r
 166
 167     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 168                 encoding='utf-8', errors='replace'):
 169         parsed_result = {}
 170         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 171                         encoding=encoding, errors=errors)
 172         for name, value in pairs:
 173             if name in parsed_result:
 174                 parsed_result[name].append(value)
 175             else:
 176                 parsed_result[name] = [value]
 177         return parsed_result
 178
 179 try:
 180     compat_str = unicode # Python 2
 181 except NameError:
 182     compat_str = str
 183
 184 try:
 185     compat_chr = unichr # Python 2
 186 except NameError:
 187     compat_chr = chr
 188
 189 try:
 190     from xml.etree.ElementTree import ParseError as compat_xml_parse_error
 191 except ImportError:  # Python 2.6
 192     from xml.parsers.expat import ExpatError as compat_xml_parse_error
 193
 194 def compat_ord(c):
 195     if type(c) is int: return c
 196     else: return ord(c)
 197
 198 # This is not clearly defined otherwise
 199 compiled_regex_type = type(re.compile(''))
 200
 201 std_headers = {
 202     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
 203     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
 204     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 205     'Accept-Encoding': 'gzip, deflate',
 206     'Accept-Language': 'en-us,en;q=0.5',
 207 }
 208
 209 def preferredencoding():
 210     """Get preferred encoding.
 211
 212     Returns the best encoding scheme for the system, based on
 213     locale.getpreferredencoding() and some further tweaks.
 214     """
 215     try:
 216         pref = locale.getpreferredencoding()
 217         u'TEST'.encode(pref)
 218     except:
 219         pref = 'UTF-8'
 220
 221     return pref
 222
 223 if sys.version_info < (3,0):
 224     def compat_print(s):
 225         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 226 else:
 227     def compat_print(s):
 228         assert type(s) == type(u'')
 229         print(s)
 230
 231 # In Python 2.x, json.dump expects a bytestream.
 232 # In Python 3.x, it writes to a character stream
 233 if sys.version_info < (3,0):
 234     def write_json_file(obj, fn):
 235         with open(fn, 'wb') as f:
 236             json.dump(obj, f)
 237 else:
 238     def write_json_file(obj, fn):
 239         with open(fn, 'w', encoding='utf-8') as f:
 240             json.dump(obj, f)
 241
 242 if sys.version_info >= (2,7):
 243     def find_xpath_attr(node, xpath, key, val):
 244         """ Find the xpath xpath[@key=val] """
 245         assert re.match(r'^[a-zA-Z-]+$', key)
 246         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 247         expr = xpath + u"[@%s='%s']" % (key, val)
 248         return node.find(expr)
 249 else:
 250     def find_xpath_attr(node, xpath, key, val):
 251         for f in node.findall(xpath):
 252             if f.attrib.get(key) == val:
 253                 return f
 254         return None
 255
 256 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 257 # the namespace parameter
 258 def xpath_with_ns(path, ns_map):
 259     components = [c.split(':') for c in path.split('/')]
 260     replaced = []
 261     for c in components:
 262         if len(c) == 1:
 263             replaced.append(c[0])
 264         else:
 265             ns, tag = c
 266             replaced.append('{%s}%s' % (ns_map[ns], tag))
 267     return '/'.join(replaced)
 268
 269 def htmlentity_transform(matchobj):
 270     """Transforms an HTML entity to a character.
 271
 272     This function receives a match object and is intended to be used with
 273     the re.sub() function.
 274     """
 275     entity = matchobj.group(1)
 276
 277     # Known non-numeric HTML entity
 278     if entity in compat_html_entities.name2codepoint:
 279         return compat_chr(compat_html_entities.name2codepoint[entity])
 280
 281     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 282     if mobj is not None:
 283         numstr = mobj.group(1)
 284         if numstr.startswith(u'x'):
 285             base = 16
 286             numstr = u'0%s' % numstr
 287         else:
 288             base = 10
 289         return compat_chr(int(numstr, base))
 290
 291     # Unknown entity in name, return its literal representation
 292     return (u'&%s;' % entity)
 293
 294 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 295 class BaseHTMLParser(compat_html_parser.HTMLParser):
 296     def __init(self):
 297         compat_html_parser.HTMLParser.__init__(self)
 298         self.html = None
 299
 300     def loads(self, html):
 301         self.html = html
 302         self.feed(html)
 303         self.close()
 304
 305 class AttrParser(BaseHTMLParser):
 306     """Modified HTMLParser that isolates a tag with the specified attribute"""
 307     def __init__(self, attribute, value):
 308         self.attribute = attribute
 309         self.value = value
 310         self.result = None
 311         self.started = False
 312         self.depth = {}
 313         self.watch_startpos = False
 314         self.error_count = 0
 315         BaseHTMLParser.__init__(self)
 316
 317     def error(self, message):
 318         if self.error_count > 10 or self.started:
 319             raise compat_html_parser.HTMLParseError(message, self.getpos())
 320         self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 321         self.error_count += 1
 322         self.goahead(1)
 323
 324     def handle_starttag(self, tag, attrs):
 325         attrs = dict(attrs)
 326         if self.started:
 327             self.find_startpos(None)
 328         if self.attribute in attrs and attrs[self.attribute] == self.value:
 329             self.result = [tag]
 330             self.started = True
 331             self.watch_startpos = True
 332         if self.started:
 333             if not tag in self.depth: self.depth[tag] = 0
 334             self.depth[tag] += 1
 335
 336     def handle_endtag(self, tag):
 337         if self.started:
 338             if tag in self.depth: self.depth[tag] -= 1
 339             if self.depth[self.result[0]] == 0:
 340                 self.started = False
 341                 self.result.append(self.getpos())
 342
 343     def find_startpos(self, x):
 344         """Needed to put the start position of the result (self.result[1])
 345         after the opening tag with the requested id"""
 346         if self.watch_startpos:
 347             self.watch_startpos = False
 348             self.result.append(self.getpos())
 349     handle_entityref = handle_charref = handle_data = handle_comment = \
 350     handle_decl = handle_pi = unknown_decl = find_startpos
 351
 352     def get_result(self):
 353         if self.result is None:
 354             return None
 355         if len(self.result) != 3:
 356             return None
 357         lines = self.html.split('\n')
 358         lines = lines[self.result[1][0]-1:self.result[2][0]]
 359         lines[0] = lines[0][self.result[1][1]:]
 360         if len(lines) == 1:
 361             lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 362         lines[-1] = lines[-1][:self.result[2][1]]
 363         return '\n'.join(lines).strip()
 364 # Hack for https://github.com/rg3/youtube-dl/issues/662
 365 if sys.version_info < (2, 7, 3):
 366     AttrParser.parse_endtag = (lambda self, i:
 367         i + len("</scr'+'ipt>")
 368         if self.rawdata[i:].startswith("</scr'+'ipt>")
 369         else compat_html_parser.HTMLParser.parse_endtag(self, i))
 370
 371 def get_element_by_id(id, html):
 372     """Return the content of the tag with the specified ID in the passed HTML document"""
 373     return get_element_by_attribute("id", id, html)
 374
 375 def get_element_by_attribute(attribute, value, html):
 376     """Return the content of the tag with the specified attribute in the passed HTML document"""
 377     parser = AttrParser(attribute, value)
 378     try:
 379         parser.loads(html)
 380     except compat_html_parser.HTMLParseError:
 381         pass
 382     return parser.get_result()
 383
 384 class MetaParser(BaseHTMLParser):
 385     """
 386     Modified HTMLParser that isolates a meta tag with the specified name
 387     attribute.
 388     """
 389     def __init__(self, name):
 390         BaseHTMLParser.__init__(self)
 391         self.name = name
 392         self.content = None
 393         self.result = None
 394
 395     def handle_starttag(self, tag, attrs):
 396         if tag != 'meta':
 397             return
 398         attrs = dict(attrs)
 399         if attrs.get('name') == self.name:
 400             self.result = attrs.get('content')
 401
 402     def get_result(self):
 403         return self.result
 404
 405 def get_meta_content(name, html):
 406     """
 407     Return the content attribute from the meta tag with the given name attribute.
 408     """
 409     parser = MetaParser(name)
 410     try:
 411         parser.loads(html)
 412     except compat_html_parser.HTMLParseError:
 413         pass
 414     return parser.get_result()
 415
 416
 417 def clean_html(html):
 418     """Clean an HTML snippet into a readable string"""
 419     # Newline vs <br />
 420     html = html.replace('\n', ' ')
 421     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 422     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 423     # Strip html tags
 424     html = re.sub('<.*?>', '', html)
 425     # Replace html entities
 426     html = unescapeHTML(html)
 427     return html.strip()
 428
 429
 430 def sanitize_open(filename, open_mode):
 431     """Try to open the given filename, and slightly tweak it if this fails.
 432
 433     Attempts to open the given filename. If this fails, it tries to change
 434     the filename slightly, step by step, until it's either able to open it
 435     or it fails and raises a final exception, like the standard open()
 436     function.
 437
 438     It returns the tuple (stream, definitive_file_name).
 439     """
 440     try:
 441         if filename == u'-':
 442             if sys.platform == 'win32':
 443                 import msvcrt
 444                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 445             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 446         stream = open(encodeFilename(filename), open_mode)
 447         return (stream, filename)
 448     except (IOError, OSError) as err:
 449         if err.errno in (errno.EACCES,):
 450             raise
 451
 452         # In case of error, try to remove win32 forbidden chars
 453         alt_filename = os.path.join(
 454                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 455                         for path_part in os.path.split(filename)
 456                        )
 457         if alt_filename == filename:
 458             raise
 459         else:
 460             # An exception here should be caught in the caller
 461             stream = open(encodeFilename(filename), open_mode)
 462             return (stream, alt_filename)
 463
 464
 465 def timeconvert(timestr):
 466     """Convert RFC 2822 defined time string into system timestamp"""
 467     timestamp = None
 468     timetuple = email.utils.parsedate_tz(timestr)
 469     if timetuple is not None:
 470         timestamp = email.utils.mktime_tz(timetuple)
 471     return timestamp
 472
 473 def sanitize_filename(s, restricted=False, is_id=False):
 474     """Sanitizes a string so it could be used as part of a filename.
 475     If restricted is set, use a stricter subset of allowed characters.
 476     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 477     """
 478     def replace_insane(char):
 479         if char == '?' or ord(char) < 32 or ord(char) == 127:
 480             return ''
 481         elif char == '"':
 482             return '' if restricted else '\''
 483         elif char == ':':
 484             return '_-' if restricted else ' -'
 485         elif char in '\\/|*<>':
 486             return '_'
 487         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 488             return '_'
 489         if restricted and ord(char) > 127:
 490             return '_'
 491         return char
 492
 493     result = u''.join(map(replace_insane, s))
 494     if not is_id:
 495         while '__' in result:
 496             result = result.replace('__', '_')
 497         result = result.strip('_')
 498         # Common case of "Foreign band name - English song title"
 499         if restricted and result.startswith('-_'):
 500             result = result[2:]
 501         if not result:
 502             result = '_'
 503     return result
 504
 505 def orderedSet(iterable):
 506     """ Remove all duplicates from the input iterable """
 507     res = []
 508     for el in iterable:
 509         if el not in res:
 510             res.append(el)
 511     return res
 512
 513
 514 def unescapeHTML(s):
 515     if s is None:
 516         return None
 517     assert type(s) == compat_str
 518
 519     result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
 520     return result
 521
 522
 523 def encodeFilename(s, for_subprocess=False):
 524     """
 525     @param s The name of the file
 526     """
 527
 528     assert type(s) == compat_str
 529
 530     # Python 3 has a Unicode API
 531     if sys.version_info >= (3, 0):
 532         return s
 533
 534     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 535         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 536         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 537         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 538         if not for_subprocess:
 539             return s
 540         else:
 541             # For subprocess calls, encode with locale encoding
 542             # Refer to http://stackoverflow.com/a/9951851/35070
 543             encoding = preferredencoding()
 544     else:
 545         encoding = sys.getfilesystemencoding()
 546     if encoding is None:
 547         encoding = 'utf-8'
 548     return s.encode(encoding, 'ignore')
 549
 550
 551 def encodeArgument(s):
 552     if not isinstance(s, compat_str):
 553         # Legacy code that uses byte strings
 554         # Uncomment the following line after fixing all post processors
 555         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 556         s = s.decode('ascii')
 557     return encodeFilename(s, True)
 558
 559
 560 def decodeOption(optval):
 561     if optval is None:
 562         return optval
 563     if isinstance(optval, bytes):
 564         optval = optval.decode(preferredencoding())
 565
 566     assert isinstance(optval, compat_str)
 567     return optval
 568
 569 def formatSeconds(secs):
 570     if secs > 3600:
 571         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 572     elif secs > 60:
 573         return '%d:%02d' % (secs // 60, secs % 60)
 574     else:
 575         return '%d' % secs
 576
 577
 578 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 579     if sys.version_info < (3, 2):
 580         import httplib
 581
 582         class HTTPSConnectionV3(httplib.HTTPSConnection):
 583             def __init__(self, *args, **kwargs):
 584                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 585
 586             def connect(self):
 587                 sock = socket.create_connection((self.host, self.port), self.timeout)
 588                 if getattr(self, '_tunnel_host', False):
 589                     self.sock = sock
 590                     self._tunnel()
 591                 try:
 592                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 593                 except ssl.SSLError:
 594                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 595
 596         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 597             def https_open(self, req):
 598                 return self.do_open(HTTPSConnectionV3, req)
 599         return HTTPSHandlerV3(**kwargs)
 600     else:
 601         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 602         context.verify_mode = (ssl.CERT_NONE
 603                                if opts_no_check_certificate
 604                                else ssl.CERT_REQUIRED)
 605         context.set_default_verify_paths()
 606         try:
 607             context.load_default_certs()
 608         except AttributeError:
 609             pass  # Python < 3.4
 610         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 611
 612 class ExtractorError(Exception):
 613     """Error during info extraction."""
 614     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 615         """ tb, if given, is the original traceback (so that it can be printed out).
 616         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 617         """
 618
 619         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 620             expected = True
 621         if video_id is not None:
 622             msg = video_id + ': ' + msg
 623         if not expected:
 624             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 625         super(ExtractorError, self).__init__(msg)
 626
 627         self.traceback = tb
 628         self.exc_info = sys.exc_info()  # preserve original exception
 629         self.cause = cause
 630         self.video_id = video_id
 631
 632     def format_traceback(self):
 633         if self.traceback is None:
 634             return None
 635         return u''.join(traceback.format_tb(self.traceback))
 636
 637
 638 class RegexNotFoundError(ExtractorError):
 639     """Error when a regex didn't match"""
 640     pass
 641
 642
 643 class DownloadError(Exception):
 644     """Download Error exception.
 645
 646     This exception may be thrown by FileDownloader objects if they are not
 647     configured to continue on errors. They will contain the appropriate
 648     error message.
 649     """
 650     def __init__(self, msg, exc_info=None):
 651         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 652         super(DownloadError, self).__init__(msg)
 653         self.exc_info = exc_info
 654
 655
 656 class SameFileError(Exception):
 657     """Same File exception.
 658
 659     This exception will be thrown by FileDownloader objects if they detect
 660     multiple files would have to be downloaded to the same file on disk.
 661     """
 662     pass
 663
 664
 665 class PostProcessingError(Exception):
 666     """Post Processing exception.
 667
 668     This exception may be raised by PostProcessor's .run() method to
 669     indicate an error in the postprocessing task.
 670     """
 671     def __init__(self, msg):
 672         self.msg = msg
 673
 674 class MaxDownloadsReached(Exception):
 675     """ --max-downloads limit has been reached. """
 676     pass
 677
 678
 679 class UnavailableVideoError(Exception):
 680     """Unavailable Format exception.
 681
 682     This exception will be thrown when a video is requested
 683     in a format that is not available for that video.
 684     """
 685     pass
 686
 687
 688 class ContentTooShortError(Exception):
 689     """Content Too Short exception.
 690
 691     This exception may be raised by FileDownloader objects when a file they
 692     download is too small for what the server announced first, indicating
 693     the connection was probably interrupted.
 694     """
 695     # Both in bytes
 696     downloaded = None
 697     expected = None
 698
 699     def __init__(self, downloaded, expected):
 700         self.downloaded = downloaded
 701         self.expected = expected
 702
 703 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 704     """Handler for HTTP requests and responses.
 705
 706     This class, when installed with an OpenerDirector, automatically adds
 707     the standard headers to every HTTP request and handles gzipped and
 708     deflated responses from web servers. If compression is to be avoided in
 709     a particular request, the original request in the program code only has
 710     to include the HTTP header "Youtubedl-No-Compression", which will be
 711     removed before making the real request.
 712
 713     Part of this code was copied from:
 714
 715     http://techknack.net/python-urllib2-handlers/
 716
 717     Andrew Rowls, the author of that code, agreed to release it to the
 718     public domain.
 719     """
 720
 721     @staticmethod
 722     def deflate(data):
 723         try:
 724             return zlib.decompress(data, -zlib.MAX_WBITS)
 725         except zlib.error:
 726             return zlib.decompress(data)
 727
 728     @staticmethod
 729     def addinfourl_wrapper(stream, headers, url, code):
 730         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 731             return compat_urllib_request.addinfourl(stream, headers, url, code)
 732         ret = compat_urllib_request.addinfourl(stream, headers, url)
 733         ret.code = code
 734         return ret
 735
 736     def http_request(self, req):
 737         for h,v in std_headers.items():
 738             if h in req.headers:
 739                 del req.headers[h]
 740             req.add_header(h, v)
 741         if 'Youtubedl-no-compression' in req.headers:
 742             if 'Accept-encoding' in req.headers:
 743                 del req.headers['Accept-encoding']
 744             del req.headers['Youtubedl-no-compression']
 745         if 'Youtubedl-user-agent' in req.headers:
 746             if 'User-agent' in req.headers:
 747                 del req.headers['User-agent']
 748             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 749             del req.headers['Youtubedl-user-agent']
 750         return req
 751
 752     def http_response(self, req, resp):
 753         old_resp = resp
 754         # gzip
 755         if resp.headers.get('Content-encoding', '') == 'gzip':
 756             content = resp.read()
 757             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 758             try:
 759                 uncompressed = io.BytesIO(gz.read())
 760             except IOError as original_ioerror:
 761                 # There may be junk add the end of the file
 762                 # See http://stackoverflow.com/q/4928560/35070 for details
 763                 for i in range(1, 1024):
 764                     try:
 765                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 766                         uncompressed = io.BytesIO(gz.read())
 767                     except IOError:
 768                         continue
 769                     break
 770                 else:
 771                     raise original_ioerror
 772             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 773             resp.msg = old_resp.msg
 774         # deflate
 775         if resp.headers.get('Content-encoding', '') == 'deflate':
 776             gz = io.BytesIO(self.deflate(resp.read()))
 777             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 778             resp.msg = old_resp.msg
 779         return resp
 780
 781     https_request = http_request
 782     https_response = http_response
 783
 784
 785 def parse_iso8601(date_str, delimiter='T'):
 786     """ Return a UNIX timestamp from the given date """
 787
 788     if date_str is None:
 789         return None
 790
 791     m = re.search(
 792         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
 793         date_str)
 794     if not m:
 795         timezone = datetime.timedelta()
 796     else:
 797         date_str = date_str[:-len(m.group(0))]
 798         if not m.group('sign'):
 799             timezone = datetime.timedelta()
 800         else:
 801             sign = 1 if m.group('sign') == '+' else -1
 802             timezone = datetime.timedelta(
 803                 hours=sign * int(m.group('hours')),
 804                 minutes=sign * int(m.group('minutes')))
 805     date_format =  '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 806     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 807     return calendar.timegm(dt.timetuple())
 808
 809
 810 def unified_strdate(date_str):
 811     """Return a string with the date in the format YYYYMMDD"""
 812
 813     if date_str is None:
 814         return None
 815
 816     upload_date = None
 817     #Replace commas
 818     date_str = date_str.replace(',', ' ')
 819     # %z (UTC offset) is only supported in python>=3.2
 820     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 821     format_expressions = [
 822         '%d %B %Y',
 823         '%d %b %Y',
 824         '%B %d %Y',
 825         '%b %d %Y',
 826         '%b %dst %Y %I:%M%p',
 827         '%b %dnd %Y %I:%M%p',
 828         '%b %dth %Y %I:%M%p',
 829         '%Y-%m-%d',
 830         '%Y/%m/%d',
 831         '%d.%m.%Y',
 832         '%d/%m/%Y',
 833         '%Y/%m/%d %H:%M:%S',
 834         '%Y-%m-%d %H:%M:%S',
 835         '%d.%m.%Y %H:%M',
 836         '%d.%m.%Y %H.%M',
 837         '%Y-%m-%dT%H:%M:%SZ',
 838         '%Y-%m-%dT%H:%M:%S.%fZ',
 839         '%Y-%m-%dT%H:%M:%S.%f0Z',
 840         '%Y-%m-%dT%H:%M:%S',
 841         '%Y-%m-%dT%H:%M:%S.%f',
 842         '%Y-%m-%dT%H:%M',
 843     ]
 844     for expression in format_expressions:
 845         try:
 846             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 847         except ValueError:
 848             pass
 849     if upload_date is None:
 850         timetuple = email.utils.parsedate_tz(date_str)
 851         if timetuple:
 852             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 853     return upload_date
 854
 855 def determine_ext(url, default_ext=u'unknown_video'):
 856     if url is None:
 857         return default_ext
 858     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 859     if re.match(r'^[A-Za-z0-9]+$', guess):
 860         return guess
 861     else:
 862         return default_ext
 863
 864 def subtitles_filename(filename, sub_lang, sub_format):
 865     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 866
 867 def date_from_str(date_str):
 868     """
 869     Return a datetime object from a string in the format YYYYMMDD or
 870     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 871     today = datetime.date.today()
 872     if date_str == 'now'or date_str == 'today':
 873         return today
 874     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 875     if match is not None:
 876         sign = match.group('sign')
 877         time = int(match.group('time'))
 878         if sign == '-':
 879             time = -time
 880         unit = match.group('unit')
 881         #A bad aproximation?
 882         if unit == 'month':
 883             unit = 'day'
 884             time *= 30
 885         elif unit == 'year':
 886             unit = 'day'
 887             time *= 365
 888         unit += 's'
 889         delta = datetime.timedelta(**{unit: time})
 890         return today + delta
 891     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 892
 893 def hyphenate_date(date_str):
 894     """
 895     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 896     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 897     if match is not None:
 898         return '-'.join(match.groups())
 899     else:
 900         return date_str
 901
 902 class DateRange(object):
 903     """Represents a time interval between two dates"""
 904     def __init__(self, start=None, end=None):
 905         """start and end must be strings in the format accepted by date"""
 906         if start is not None:
 907             self.start = date_from_str(start)
 908         else:
 909             self.start = datetime.datetime.min.date()
 910         if end is not None:
 911             self.end = date_from_str(end)
 912         else:
 913             self.end = datetime.datetime.max.date()
 914         if self.start > self.end:
 915             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 916     @classmethod
 917     def day(cls, day):
 918         """Returns a range that only contains the given day"""
 919         return cls(day,day)
 920     def __contains__(self, date):
 921         """Check if the date is in the range"""
 922         if not isinstance(date, datetime.date):
 923             date = date_from_str(date)
 924         return self.start <= date <= self.end
 925     def __str__(self):
 926         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 927
 928
 929 def platform_name():
 930     """ Returns the platform name as a compat_str """
 931     res = platform.platform()
 932     if isinstance(res, bytes):
 933         res = res.decode(preferredencoding())
 934
 935     assert isinstance(res, compat_str)
 936     return res
 937
 938
 939 def _windows_write_string(s, out):
 940     """ Returns True if the string was written using special methods,
 941     False if it has yet to be written out."""
 942     # Adapted from http://stackoverflow.com/a/3259271/35070
 943
 944     import ctypes
 945     import ctypes.wintypes
 946
 947     WIN_OUTPUT_IDS = {
 948         1: -11,
 949         2: -12,
 950     }
 951
 952     try:
 953         fileno = out.fileno()
 954     except AttributeError:
 955         # If the output stream doesn't have a fileno, it's virtual
 956         return False
 957     if fileno not in WIN_OUTPUT_IDS:
 958         return False
 959
 960     GetStdHandle = ctypes.WINFUNCTYPE(
 961         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
 962         ("GetStdHandle", ctypes.windll.kernel32))
 963     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
 964
 965     WriteConsoleW = ctypes.WINFUNCTYPE(
 966         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
 967         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
 968         ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
 969     written = ctypes.wintypes.DWORD(0)
 970
 971     GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
 972     FILE_TYPE_CHAR = 0x0002
 973     FILE_TYPE_REMOTE = 0x8000
 974     GetConsoleMode = ctypes.WINFUNCTYPE(
 975         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
 976         ctypes.POINTER(ctypes.wintypes.DWORD))(
 977         ("GetConsoleMode", ctypes.windll.kernel32))
 978     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
 979
 980     def not_a_console(handle):
 981         if handle == INVALID_HANDLE_VALUE or handle is None:
 982             return True
 983         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
 984                 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
 985
 986     if not_a_console(h):
 987         return False
 988
 989     def next_nonbmp_pos(s):
 990         try:
 991             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
 992         except StopIteration:
 993             return len(s)
 994
 995     while s:
 996         count = min(next_nonbmp_pos(s), 1024)
 997
 998         ret = WriteConsoleW(
 999             h, s, count if count else 2, ctypes.byref(written), None)
1000         if ret == 0:
1001             raise OSError('Failed to write string')
1002         if not count:  # We just wrote a non-BMP character
1003             assert written.value == 2
1004             s = s[1:]
1005         else:
1006             assert written.value > 0
1007             s = s[written.value:]
1008     return True
1009
1010
1011 def write_string(s, out=None, encoding=None):
1012     if out is None:
1013         out = sys.stderr
1014     assert type(s) == compat_str
1015
1016     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1017         if _windows_write_string(s, out):
1018             return
1019
1020     if ('b' in getattr(out, 'mode', '') or
1021             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
1022         byt = s.encode(encoding or preferredencoding(), 'ignore')
1023         out.write(byt)
1024     elif hasattr(out, 'buffer'):
1025         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1026         byt = s.encode(enc, 'ignore')
1027         out.buffer.write(byt)
1028     else:
1029         out.write(s)
1030     out.flush()
1031
1032
1033 def bytes_to_intlist(bs):
1034     if not bs:
1035         return []
1036     if isinstance(bs[0], int):  # Python 3
1037         return list(bs)
1038     else:
1039         return [ord(c) for c in bs]
1040
1041
1042 def intlist_to_bytes(xs):
1043     if not xs:
1044         return b''
1045     if isinstance(chr(0), bytes):  # Python 2
1046         return ''.join([chr(x) for x in xs])
1047     else:
1048         return bytes(xs)
1049
1050
1051 def get_cachedir(params={}):
1052     cache_root = os.environ.get('XDG_CACHE_HOME',
1053                                 os.path.expanduser('~/.cache'))
1054     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1055
1056
1057 # Cross-platform file locking
1058 if sys.platform == 'win32':
1059     import ctypes.wintypes
1060     import msvcrt
1061
1062     class OVERLAPPED(ctypes.Structure):
1063         _fields_ = [
1064             ('Internal', ctypes.wintypes.LPVOID),
1065             ('InternalHigh', ctypes.wintypes.LPVOID),
1066             ('Offset', ctypes.wintypes.DWORD),
1067             ('OffsetHigh', ctypes.wintypes.DWORD),
1068             ('hEvent', ctypes.wintypes.HANDLE),
1069         ]
1070
1071     kernel32 = ctypes.windll.kernel32
1072     LockFileEx = kernel32.LockFileEx
1073     LockFileEx.argtypes = [
1074         ctypes.wintypes.HANDLE,     # hFile
1075         ctypes.wintypes.DWORD,      # dwFlags
1076         ctypes.wintypes.DWORD,      # dwReserved
1077         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1078         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1079         ctypes.POINTER(OVERLAPPED)  # Overlapped
1080     ]
1081     LockFileEx.restype = ctypes.wintypes.BOOL
1082     UnlockFileEx = kernel32.UnlockFileEx
1083     UnlockFileEx.argtypes = [
1084         ctypes.wintypes.HANDLE,     # hFile
1085         ctypes.wintypes.DWORD,      # dwReserved
1086         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1087         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1088         ctypes.POINTER(OVERLAPPED)  # Overlapped
1089     ]
1090     UnlockFileEx.restype = ctypes.wintypes.BOOL
1091     whole_low = 0xffffffff
1092     whole_high = 0x7fffffff
1093
1094     def _lock_file(f, exclusive):
1095         overlapped = OVERLAPPED()
1096         overlapped.Offset = 0
1097         overlapped.OffsetHigh = 0
1098         overlapped.hEvent = 0
1099         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1100         handle = msvcrt.get_osfhandle(f.fileno())
1101         if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1102                           whole_low, whole_high, f._lock_file_overlapped_p):
1103             raise OSError('Locking file failed: %r' % ctypes.FormatError())
1104
1105     def _unlock_file(f):
1106         assert f._lock_file_overlapped_p
1107         handle = msvcrt.get_osfhandle(f.fileno())
1108         if not UnlockFileEx(handle, 0,
1109                             whole_low, whole_high, f._lock_file_overlapped_p):
1110             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1111
1112 else:
1113     import fcntl
1114
1115     def _lock_file(f, exclusive):
1116         fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1117
1118     def _unlock_file(f):
1119         fcntl.lockf(f, fcntl.LOCK_UN)
1120
1121
1122 class locked_file(object):
1123     def __init__(self, filename, mode, encoding=None):
1124         assert mode in ['r', 'a', 'w']
1125         self.f = io.open(filename, mode, encoding=encoding)
1126         self.mode = mode
1127
1128     def __enter__(self):
1129         exclusive = self.mode != 'r'
1130         try:
1131             _lock_file(self.f, exclusive)
1132         except IOError:
1133             self.f.close()
1134             raise
1135         return self
1136
1137     def __exit__(self, etype, value, traceback):
1138         try:
1139             _unlock_file(self.f)
1140         finally:
1141             self.f.close()
1142
1143     def __iter__(self):
1144         return iter(self.f)
1145
1146     def write(self, *args):
1147         return self.f.write(*args)
1148
1149     def read(self, *args):
1150         return self.f.read(*args)
1151
1152
1153 def shell_quote(args):
1154     quoted_args = []
1155     encoding = sys.getfilesystemencoding()
1156     if encoding is None:
1157         encoding = 'utf-8'
1158     for a in args:
1159         if isinstance(a, bytes):
1160             # We may get a filename encoded with 'encodeFilename'
1161             a = a.decode(encoding)
1162         quoted_args.append(pipes.quote(a))
1163     return u' '.join(quoted_args)
1164
1165
1166 def takewhile_inclusive(pred, seq):
1167     """ Like itertools.takewhile, but include the latest evaluated element
1168         (the first element so that Not pred(e)) """
1169     for e in seq:
1170         yield e
1171         if not pred(e):
1172             return
1173
1174
1175 def smuggle_url(url, data):
1176     """ Pass additional data in a URL for internal use. """
1177
1178     sdata = compat_urllib_parse.urlencode(
1179         {u'__youtubedl_smuggle': json.dumps(data)})
1180     return url + u'#' + sdata
1181
1182
1183 def unsmuggle_url(smug_url, default=None):
1184     if not '#__youtubedl_smuggle' in smug_url:
1185         return smug_url, default
1186     url, _, sdata = smug_url.rpartition(u'#')
1187     jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1188     data = json.loads(jsond)
1189     return url, data
1190
1191
1192 def format_bytes(bytes):
1193     if bytes is None:
1194         return u'N/A'
1195     if type(bytes) is str:
1196         bytes = float(bytes)
1197     if bytes == 0.0:
1198         exponent = 0
1199     else:
1200         exponent = int(math.log(bytes, 1024.0))
1201     suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1202     converted = float(bytes) / float(1024 ** exponent)
1203     return u'%.2f%s' % (converted, suffix)
1204
1205
1206 def get_term_width():
1207     columns = os.environ.get('COLUMNS', None)
1208     if columns:
1209         return int(columns)
1210
1211     try:
1212         sp = subprocess.Popen(
1213             ['stty', 'size'],
1214             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1215         out, err = sp.communicate()
1216         return int(out.split()[1])
1217     except:
1218         pass
1219     return None
1220
1221
1222 def month_by_name(name):
1223     """ Return the number of a month by (locale-independently) English name """
1224
1225     ENGLISH_NAMES = [
1226         u'January', u'February', u'March', u'April', u'May', u'June',
1227         u'July', u'August', u'September', u'October', u'November', u'December']
1228     try:
1229         return ENGLISH_NAMES.index(name) + 1
1230     except ValueError:
1231         return None
1232
1233
1234 def fix_xml_ampersands(xml_str):
1235     """Replace all the '&' by '&amp;' in XML"""
1236     return re.sub(
1237         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1238         u'&amp;',
1239         xml_str)
1240
1241
1242 def setproctitle(title):
1243     assert isinstance(title, compat_str)
1244     try:
1245         libc = ctypes.cdll.LoadLibrary("libc.so.6")
1246     except OSError:
1247         return
1248     title_bytes = title.encode('utf-8')
1249     buf = ctypes.create_string_buffer(len(title_bytes))
1250     buf.value = title_bytes
1251     try:
1252         libc.prctl(15, buf, 0, 0, 0)
1253     except AttributeError:
1254         return  # Strange libc, just skip this
1255
1256
1257 def remove_start(s, start):
1258     if s.startswith(start):
1259         return s[len(start):]
1260     return s
1261
1262
1263 def url_basename(url):
1264     path = compat_urlparse.urlparse(url).path
1265     return path.strip(u'/').split(u'/')[-1]
1266
1267
1268 class HEADRequest(compat_urllib_request.Request):
1269     def get_method(self):
1270         return "HEAD"
1271
1272
1273 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1274     if get_attr:
1275         if v is not None:
1276             v = getattr(v, get_attr, None)
1277     if v == '':
1278         v = None
1279     return default if v is None else (int(v) * invscale // scale)
1280
1281
1282 def str_or_none(v, default=None):
1283     return default if v is None else compat_str(v)
1284
1285
1286 def str_to_int(int_str):
1287     if int_str is None:
1288         return None
1289     int_str = re.sub(r'[,\.]', u'', int_str)
1290     return int(int_str)
1291
1292
1293 def float_or_none(v, scale=1, invscale=1, default=None):
1294     return default if v is None else (float(v) * invscale / scale)
1295
1296
1297 def parse_duration(s):
1298     if s is None:
1299         return None
1300
1301     m = re.match(
1302         r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
1303     if not m:
1304         return None
1305     res = int(m.group('secs'))
1306     if m.group('mins'):
1307         res += int(m.group('mins')) * 60
1308         if m.group('hours'):
1309             res += int(m.group('hours')) * 60 * 60
1310     return res
1311
1312
1313 def prepend_extension(filename, ext):
1314     name, real_ext = os.path.splitext(filename)
1315     return u'{0}.{1}{2}'.format(name, ext, real_ext)
1316
1317
1318 def check_executable(exe, args=[]):
1319     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1320     args can be a list of arguments for a short output (like -version) """
1321     try:
1322         subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1323     except OSError:
1324         return False
1325     return exe
1326
1327
1328 class PagedList(object):
1329     def __init__(self, pagefunc, pagesize):
1330         self._pagefunc = pagefunc
1331         self._pagesize = pagesize
1332
1333     def __len__(self):
1334         # This is only useful for tests
1335         return len(self.getslice())
1336
1337     def getslice(self, start=0, end=None):
1338         res = []
1339         for pagenum in itertools.count(start // self._pagesize):
1340             firstid = pagenum * self._pagesize
1341             nextfirstid = pagenum * self._pagesize + self._pagesize
1342             if start >= nextfirstid:
1343                 continue
1344
1345             page_results = list(self._pagefunc(pagenum))
1346
1347             startv = (
1348                 start % self._pagesize
1349                 if firstid <= start < nextfirstid
1350                 else 0)
1351
1352             endv = (
1353                 ((end - 1) % self._pagesize) + 1
1354                 if (end is not None and firstid <= end <= nextfirstid)
1355                 else None)
1356
1357             if startv != 0 or endv is not None:
1358                 page_results = page_results[startv:endv]
1359             res.extend(page_results)
1360
1361             # A little optimization - if current page is not "full", ie. does
1362             # not contain page_size videos then we can assume that this page
1363             # is the last one - there are no more ids on further pages -
1364             # i.e. no need to query again.
1365             if len(page_results) + startv < self._pagesize:
1366                 break
1367
1368             # If we got the whole page, but the next page is not interesting,
1369             # break out early as well
1370             if end == nextfirstid:
1371                 break
1372         return res
1373
1374
1375 def uppercase_escape(s):
1376     unicode_escape = codecs.getdecoder('unicode_escape')
1377     return re.sub(
1378         r'\\U[0-9a-fA-F]{8}',
1379         lambda m: unicode_escape(m.group(0))[0],
1380         s)
1381
1382 try:
1383     struct.pack(u'!I', 0)
1384 except TypeError:
1385     # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1386     def struct_pack(spec, *args):
1387         if isinstance(spec, compat_str):
1388             spec = spec.encode('ascii')
1389         return struct.pack(spec, *args)
1390
1391     def struct_unpack(spec, *args):
1392         if isinstance(spec, compat_str):
1393             spec = spec.encode('ascii')
1394         return struct.unpack(spec, *args)
1395 else:
1396     struct_pack = struct.pack
1397     struct_unpack = struct.unpack
1398
1399
1400 def read_batch_urls(batch_fd):
1401     def fixup(url):
1402         if not isinstance(url, compat_str):
1403             url = url.decode('utf-8', 'replace')
1404         BOM_UTF8 = u'\xef\xbb\xbf'
1405         if url.startswith(BOM_UTF8):
1406             url = url[len(BOM_UTF8):]
1407         url = url.strip()
1408         if url.startswith(('#', ';', ']')):
1409             return False
1410         return url
1411
1412     with contextlib.closing(batch_fd) as fd:
1413         return [url for url in map(fixup, fd) if url]
1414
1415
1416 def urlencode_postdata(*args, **kargs):
1417     return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1418
1419
1420 def parse_xml(s):
1421     class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1422         def doctype(self, name, pubid, system):
1423             pass  # Ignore doctypes
1424
1425     parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1426     kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1427     return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1428
1429
1430 if sys.version_info < (3, 0) and sys.platform == 'win32':
1431     def compat_getpass(prompt, *args, **kwargs):
1432         if isinstance(prompt, compat_str):
1433             prompt = prompt.encode(preferredencoding())
1434         return getpass.getpass(prompt, *args, **kwargs)
1435 else:
1436     compat_getpass = getpass.getpass
1437
1438
1439 US_RATINGS = {
1440     'G': 0,
1441     'PG': 10,
1442     'PG-13': 13,
1443     'R': 16,
1444     'NC': 18,
1445 }
1446
1447
1448 def strip_jsonp(code):
1449     return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)
1450
1451
1452 def qualities(quality_ids):
1453     """ Get a numeric quality value out of a list of possible values """
1454     def q(qid):
1455         try:
1456             return quality_ids.index(qid)
1457         except ValueError:
1458             return -1
1459     return q
1460
1461
1462 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1463
1464 try:
1465     subprocess_check_output = subprocess.check_output
1466 except AttributeError:
1467     def subprocess_check_output(*args, **kwargs):
1468         assert 'input' not in kwargs
1469         p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1470         output, _ = p.communicate()
1471         ret = p.poll()
1472         if ret:
1473             raise subprocess.CalledProcessError(ret, p.args, output=output)
1474         return output