_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import ctypes
   5 import datetime
   6 import email.utils
   7 import errno
   8 import gzip
   9 import itertools
  10 import io
  11 import json
  12 import locale
  13 import math
  14 import os
  15 import pipes
  16 import platform
  17 import re
  18 import ssl
  19 import socket
  20 import struct
  21 import subprocess
  22 import sys
  23 import traceback
  24 import zlib
  25
  26 try:
  27     import urllib.request as compat_urllib_request
  28 except ImportError: # Python 2
  29     import urllib2 as compat_urllib_request
  30
  31 try:
  32     import urllib.error as compat_urllib_error
  33 except ImportError: # Python 2
  34     import urllib2 as compat_urllib_error
  35
  36 try:
  37     import urllib.parse as compat_urllib_parse
  38 except ImportError: # Python 2
  39     import urllib as compat_urllib_parse
  40
  41 try:
  42     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  43 except ImportError: # Python 2
  44     from urlparse import urlparse as compat_urllib_parse_urlparse
  45
  46 try:
  47     import urllib.parse as compat_urlparse
  48 except ImportError: # Python 2
  49     import urlparse as compat_urlparse
  50
  51 try:
  52     import http.cookiejar as compat_cookiejar
  53 except ImportError: # Python 2
  54     import cookielib as compat_cookiejar
  55
  56 try:
  57     import html.entities as compat_html_entities
  58 except ImportError: # Python 2
  59     import htmlentitydefs as compat_html_entities
  60
  61 try:
  62     import html.parser as compat_html_parser
  63 except ImportError: # Python 2
  64     import HTMLParser as compat_html_parser
  65
  66 try:
  67     import http.client as compat_http_client
  68 except ImportError: # Python 2
  69     import httplib as compat_http_client
  70
  71 try:
  72     from urllib.error import HTTPError as compat_HTTPError
  73 except ImportError:  # Python 2
  74     from urllib2 import HTTPError as compat_HTTPError
  75
  76 try:
  77     from urllib.request import urlretrieve as compat_urlretrieve
  78 except ImportError:  # Python 2
  79     from urllib import urlretrieve as compat_urlretrieve
  80
  81
  82 try:
  83     from subprocess import DEVNULL
  84     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  85 except ImportError:
  86     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  87
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Strings that are empty or contain no '%' are returned unchanged.
        Runs of adjacent escapes are accumulated as bytes and decoded
        together with the given encoding and error handler (None selects
        'utf-8' / 'replace').
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # NOTE(review): relies on the Python 2 'hex' codec; it may
                # signal malformed digits with TypeError rather than
                # ValueError - confirm before relying on the fallback below.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: keep the text literally, '%' included
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. Fields without '=' raise
        ValueError when strict_parsing is set; otherwise they are dropped,
        or kept with an empty value when keep_blank_values is set. Pairs
        with an empty value are only kept when keep_blank_values is set.
        '+' is turned into a space before percent-decoding.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        Values of a repeated name are collected in order of appearance.
        The arguments are forwarded to _parse_qsl().
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
 166
 167 try:
 168     compat_str = unicode # Python 2
 169 except NameError:
 170     compat_str = str
 171
 172 try:
 173     compat_chr = unichr # Python 2
 174 except NameError:
 175     compat_chr = chr
 176
 177 try:
 178     from xml.etree.ElementTree import ParseError as compat_xml_parse_error
 179 except ImportError:  # Python 2.6
 180     from xml.parsers.expat import ExpatError as compat_xml_parse_error
 181
 182 def compat_ord(c):
 183     if type(c) is int: return c
 184     else: return ord(c)
 185
 186 # This is not clearly defined otherwise
 187 compiled_regex_type = type(re.compile(''))
 188
 189 std_headers = {
 190     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
 191     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
 192     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 193     'Accept-Encoding': 'gzip, deflate',
 194     'Accept-Language': 'en-us,en;q=0.5',
 195 }
 196
 197 def preferredencoding():
 198     """Get preferred encoding.
 199
 200     Returns the best encoding scheme for the system, based on
 201     locale.getpreferredencoding() and some further tweaks.
 202     """
 203     try:
 204         pref = locale.getpreferredencoding()
 205         u'TEST'.encode(pref)
 206     except:
 207         pref = 'UTF-8'
 208
 209     return pref
 210
 211 if sys.version_info < (3,0):
 212     def compat_print(s):
 213         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 214 else:
 215     def compat_print(s):
 216         assert type(s) == type(u'')
 217         print(s)
 218
 219 # In Python 2.x, json.dump expects a bytestream.
 220 # In Python 3.x, it writes to a character stream
 221 if sys.version_info < (3,0):
 222     def write_json_file(obj, fn):
 223         with open(fn, 'wb') as f:
 224             json.dump(obj, f)
 225 else:
 226     def write_json_file(obj, fn):
 227         with open(fn, 'w', encoding='utf-8') as f:
 228             json.dump(obj, f)
 229
 230 if sys.version_info >= (2,7):
 231     def find_xpath_attr(node, xpath, key, val):
 232         """ Find the xpath xpath[@key=val] """
 233         assert re.match(r'^[a-zA-Z]+$', key)
 234         assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
 235         expr = xpath + u"[@%s='%s']" % (key, val)
 236         return node.find(expr)
 237 else:
 238     def find_xpath_attr(node, xpath, key, val):
 239         for f in node.findall(xpath):
 240             if f.attrib.get(key) == val:
 241                 return f
 242         return None
 243
 244 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 245 # the namespace parameter
 246 def xpath_with_ns(path, ns_map):
 247     components = [c.split(':') for c in path.split('/')]
 248     replaced = []
 249     for c in components:
 250         if len(c) == 1:
 251             replaced.append(c[0])
 252         else:
 253             ns, tag = c
 254             replaced.append('{%s}%s' % (ns_map[ns], tag))
 255     return '/'.join(replaced)
 256
 257 def htmlentity_transform(matchobj):
 258     """Transforms an HTML entity to a character.
 259
 260     This function receives a match object and is intended to be used with
 261     the re.sub() function.
 262     """
 263     entity = matchobj.group(1)
 264
 265     # Known non-numeric HTML entity
 266     if entity in compat_html_entities.name2codepoint:
 267         return compat_chr(compat_html_entities.name2codepoint[entity])
 268
 269     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 270     if mobj is not None:
 271         numstr = mobj.group(1)
 272         if numstr.startswith(u'x'):
 273             base = 16
 274             numstr = u'0%s' % numstr
 275         else:
 276             base = 10
 277         return compat_chr(int(numstr, base))
 278
 279     # Unknown entity in name, return its literal representation
 280     return (u'&%s;' % entity)
 281
 282 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 283 class BaseHTMLParser(compat_html_parser.HTMLParser):
 284     def __init(self):
 285         compat_html_parser.HTMLParser.__init__(self)
 286         self.html = None
 287
 288     def loads(self, html):
 289         self.html = html
 290         self.feed(html)
 291         self.close()
 292
 293 class AttrParser(BaseHTMLParser):
 294     """Modified HTMLParser that isolates a tag with the specified attribute"""
 295     def __init__(self, attribute, value):
 296         self.attribute = attribute
 297         self.value = value
 298         self.result = None
 299         self.started = False
 300         self.depth = {}
 301         self.watch_startpos = False
 302         self.error_count = 0
 303         BaseHTMLParser.__init__(self)
 304
 305     def error(self, message):
 306         if self.error_count > 10 or self.started:
 307             raise compat_html_parser.HTMLParseError(message, self.getpos())
 308         self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 309         self.error_count += 1
 310         self.goahead(1)
 311
 312     def handle_starttag(self, tag, attrs):
 313         attrs = dict(attrs)
 314         if self.started:
 315             self.find_startpos(None)
 316         if self.attribute in attrs and attrs[self.attribute] == self.value:
 317             self.result = [tag]
 318             self.started = True
 319             self.watch_startpos = True
 320         if self.started:
 321             if not tag in self.depth: self.depth[tag] = 0
 322             self.depth[tag] += 1
 323
 324     def handle_endtag(self, tag):
 325         if self.started:
 326             if tag in self.depth: self.depth[tag] -= 1
 327             if self.depth[self.result[0]] == 0:
 328                 self.started = False
 329                 self.result.append(self.getpos())
 330
 331     def find_startpos(self, x):
 332         """Needed to put the start position of the result (self.result[1])
 333         after the opening tag with the requested id"""
 334         if self.watch_startpos:
 335             self.watch_startpos = False
 336             self.result.append(self.getpos())
 337     handle_entityref = handle_charref = handle_data = handle_comment = \
 338     handle_decl = handle_pi = unknown_decl = find_startpos
 339
 340     def get_result(self):
 341         if self.result is None:
 342             return None
 343         if len(self.result) != 3:
 344             return None
 345         lines = self.html.split('\n')
 346         lines = lines[self.result[1][0]-1:self.result[2][0]]
 347         lines[0] = lines[0][self.result[1][1]:]
 348         if len(lines) == 1:
 349             lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 350         lines[-1] = lines[-1][:self.result[2][1]]
 351         return '\n'.join(lines).strip()
 352 # Hack for https://github.com/rg3/youtube-dl/issues/662
 353 if sys.version_info < (2, 7, 3):
 354     AttrParser.parse_endtag = (lambda self, i:
 355         i + len("</scr'+'ipt>")
 356         if self.rawdata[i:].startswith("</scr'+'ipt>")
 357         else compat_html_parser.HTMLParser.parse_endtag(self, i))
 358
 359 def get_element_by_id(id, html):
 360     """Return the content of the tag with the specified ID in the passed HTML document"""
 361     return get_element_by_attribute("id", id, html)
 362
 363 def get_element_by_attribute(attribute, value, html):
 364     """Return the content of the tag with the specified attribute in the passed HTML document"""
 365     parser = AttrParser(attribute, value)
 366     try:
 367         parser.loads(html)
 368     except compat_html_parser.HTMLParseError:
 369         pass
 370     return parser.get_result()
 371
 372 class MetaParser(BaseHTMLParser):
 373     """
 374     Modified HTMLParser that isolates a meta tag with the specified name
 375     attribute.
 376     """
 377     def __init__(self, name):
 378         BaseHTMLParser.__init__(self)
 379         self.name = name
 380         self.content = None
 381         self.result = None
 382
 383     def handle_starttag(self, tag, attrs):
 384         if tag != 'meta':
 385             return
 386         attrs = dict(attrs)
 387         if attrs.get('name') == self.name:
 388             self.result = attrs.get('content')
 389
 390     def get_result(self):
 391         return self.result
 392
 393 def get_meta_content(name, html):
 394     """
 395     Return the content attribute from the meta tag with the given name attribute.
 396     """
 397     parser = MetaParser(name)
 398     try:
 399         parser.loads(html)
 400     except compat_html_parser.HTMLParseError:
 401         pass
 402     return parser.get_result()
 403
 404
 405 def clean_html(html):
 406     """Clean an HTML snippet into a readable string"""
 407     # Newline vs <br />
 408     html = html.replace('\n', ' ')
 409     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 410     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 411     # Strip html tags
 412     html = re.sub('<.*?>', '', html)
 413     # Replace html entities
 414     html = unescapeHTML(html)
 415     return html.strip()
 416
 417
 418 def sanitize_open(filename, open_mode):
 419     """Try to open the given filename, and slightly tweak it if this fails.
 420
 421     Attempts to open the given filename. If this fails, it tries to change
 422     the filename slightly, step by step, until it's either able to open it
 423     or it fails and raises a final exception, like the standard open()
 424     function.
 425
 426     It returns the tuple (stream, definitive_file_name).
 427     """
 428     try:
 429         if filename == u'-':
 430             if sys.platform == 'win32':
 431                 import msvcrt
 432                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 433             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 434         stream = open(encodeFilename(filename), open_mode)
 435         return (stream, filename)
 436     except (IOError, OSError) as err:
 437         if err.errno in (errno.EACCES,):
 438             raise
 439
 440         # In case of error, try to remove win32 forbidden chars
 441         alt_filename = os.path.join(
 442                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 443                         for path_part in os.path.split(filename)
 444                        )
 445         if alt_filename == filename:
 446             raise
 447         else:
 448             # An exception here should be caught in the caller
 449             stream = open(encodeFilename(filename), open_mode)
 450             return (stream, alt_filename)
 451
 452
 453 def timeconvert(timestr):
 454     """Convert RFC 2822 defined time string into system timestamp"""
 455     timestamp = None
 456     timetuple = email.utils.parsedate_tz(timestr)
 457     if timetuple is not None:
 458         timestamp = email.utils.mktime_tz(timetuple)
 459     return timestamp
 460
 461 def sanitize_filename(s, restricted=False, is_id=False):
 462     """Sanitizes a string so it could be used as part of a filename.
 463     If restricted is set, use a stricter subset of allowed characters.
 464     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 465     """
 466     def replace_insane(char):
 467         if char == '?' or ord(char) < 32 or ord(char) == 127:
 468             return ''
 469         elif char == '"':
 470             return '' if restricted else '\''
 471         elif char == ':':
 472             return '_-' if restricted else ' -'
 473         elif char in '\\/|*<>':
 474             return '_'
 475         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 476             return '_'
 477         if restricted and ord(char) > 127:
 478             return '_'
 479         return char
 480
 481     result = u''.join(map(replace_insane, s))
 482     if not is_id:
 483         while '__' in result:
 484             result = result.replace('__', '_')
 485         result = result.strip('_')
 486         # Common case of "Foreign band name - English song title"
 487         if restricted and result.startswith('-_'):
 488             result = result[2:]
 489         if not result:
 490             result = '_'
 491     return result
 492
 493 def orderedSet(iterable):
 494     """ Remove all duplicates from the input iterable """
 495     res = []
 496     for el in iterable:
 497         if el not in res:
 498             res.append(el)
 499     return res
 500
 501 def unescapeHTML(s):
 502     """
 503     @param s a string
 504     """
 505     assert type(s) == type(u'')
 506
 507     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 508     return result
 509
 510
 511 def encodeFilename(s, for_subprocess=False):
 512     """
 513     @param s The name of the file
 514     """
 515
 516     assert type(s) == compat_str
 517
 518     # Python 3 has a Unicode API
 519     if sys.version_info >= (3, 0):
 520         return s
 521
 522     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 523         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 524         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 525         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 526         if not for_subprocess:
 527             return s
 528         else:
 529             # For subprocess calls, encode with locale encoding
 530             # Refer to http://stackoverflow.com/a/9951851/35070
 531             encoding = preferredencoding()
 532     else:
 533         encoding = sys.getfilesystemencoding()
 534     if encoding is None:
 535         encoding = 'utf-8'
 536     return s.encode(encoding, 'ignore')
 537
 538
 539 def decodeOption(optval):
 540     if optval is None:
 541         return optval
 542     if isinstance(optval, bytes):
 543         optval = optval.decode(preferredencoding())
 544
 545     assert isinstance(optval, compat_str)
 546     return optval
 547
 548 def formatSeconds(secs):
 549     if secs > 3600:
 550         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 551     elif secs > 60:
 552         return '%d:%02d' % (secs // 60, secs % 60)
 553     else:
 554         return '%d' % secs
 555
 556
 557 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 558     if sys.version_info < (3, 2):
 559         import httplib
 560
 561         class HTTPSConnectionV3(httplib.HTTPSConnection):
 562             def __init__(self, *args, **kwargs):
 563                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 564
 565             def connect(self):
 566                 sock = socket.create_connection((self.host, self.port), self.timeout)
 567                 if getattr(self, '_tunnel_host', False):
 568                     self.sock = sock
 569                     self._tunnel()
 570                 try:
 571                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 572                 except ssl.SSLError:
 573                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 574
 575         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 576             def https_open(self, req):
 577                 return self.do_open(HTTPSConnectionV3, req)
 578         return HTTPSHandlerV3(**kwargs)
 579     else:
 580         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 581         context.verify_mode = (ssl.CERT_NONE
 582                                if opts_no_check_certificate
 583                                else ssl.CERT_REQUIRED)
 584         context.set_default_verify_paths()
 585         try:
 586             context.load_default_certs()
 587         except AttributeError:
 588             pass  # Python < 3.4
 589         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 590
 591 class ExtractorError(Exception):
 592     """Error during info extraction."""
 593     def __init__(self, msg, tb=None, expected=False, cause=None):
 594         """ tb, if given, is the original traceback (so that it can be printed out).
 595         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 596         """
 597
 598         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 599             expected = True
 600         if not expected:
 601             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 602         super(ExtractorError, self).__init__(msg)
 603
 604         self.traceback = tb
 605         self.exc_info = sys.exc_info()  # preserve original exception
 606         self.cause = cause
 607
 608     def format_traceback(self):
 609         if self.traceback is None:
 610             return None
 611         return u''.join(traceback.format_tb(self.traceback))
 612
 613
 614 class RegexNotFoundError(ExtractorError):
 615     """Error when a regex didn't match"""
 616     pass
 617
 618
 619 class DownloadError(Exception):
 620     """Download Error exception.
 621
 622     This exception may be thrown by FileDownloader objects if they are not
 623     configured to continue on errors. They will contain the appropriate
 624     error message.
 625     """
 626     def __init__(self, msg, exc_info=None):
 627         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 628         super(DownloadError, self).__init__(msg)
 629         self.exc_info = exc_info
 630
 631
 632 class SameFileError(Exception):
 633     """Same File exception.
 634
 635     This exception will be thrown by FileDownloader objects if they detect
 636     multiple files would have to be downloaded to the same file on disk.
 637     """
 638     pass
 639
 640
 641 class PostProcessingError(Exception):
 642     """Post Processing exception.
 643
 644     This exception may be raised by PostProcessor's .run() method to
 645     indicate an error in the postprocessing task.
 646     """
 647     def __init__(self, msg):
 648         self.msg = msg
 649
 650 class MaxDownloadsReached(Exception):
 651     """ --max-downloads limit has been reached. """
 652     pass
 653
 654
 655 class UnavailableVideoError(Exception):
 656     """Unavailable Format exception.
 657
 658     This exception will be thrown when a video is requested
 659     in a format that is not available for that video.
 660     """
 661     pass
 662
 663
 664 class ContentTooShortError(Exception):
 665     """Content Too Short exception.
 666
 667     This exception may be raised by FileDownloader objects when a file they
 668     download is too small for what the server announced first, indicating
 669     the connection was probably interrupted.
 670     """
 671     # Both in bytes
 672     downloaded = None
 673     expected = None
 674
 675     def __init__(self, downloaded, expected):
 676         self.downloaded = downloaded
 677         self.expected = expected
 678
 679 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 680     """Handler for HTTP requests and responses.
 681
 682     This class, when installed with an OpenerDirector, automatically adds
 683     the standard headers to every HTTP request and handles gzipped and
 684     deflated responses from web servers. If compression is to be avoided in
 685     a particular request, the original request in the program code only has
 686     to include the HTTP header "Youtubedl-No-Compression", which will be
 687     removed before making the real request.
 688
 689     Part of this code was copied from:
 690
 691     http://techknack.net/python-urllib2-handlers/
 692
 693     Andrew Rowls, the author of that code, agreed to release it to the
 694     public domain.
 695     """
 696
 697     @staticmethod
 698     def deflate(data):
 699         try:
 700             return zlib.decompress(data, -zlib.MAX_WBITS)
 701         except zlib.error:
 702             return zlib.decompress(data)
 703
 704     @staticmethod
 705     def addinfourl_wrapper(stream, headers, url, code):
 706         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 707             return compat_urllib_request.addinfourl(stream, headers, url, code)
 708         ret = compat_urllib_request.addinfourl(stream, headers, url)
 709         ret.code = code
 710         return ret
 711
 712     def http_request(self, req):
 713         for h,v in std_headers.items():
 714             if h in req.headers:
 715                 del req.headers[h]
 716             req.add_header(h, v)
 717         if 'Youtubedl-no-compression' in req.headers:
 718             if 'Accept-encoding' in req.headers:
 719                 del req.headers['Accept-encoding']
 720             del req.headers['Youtubedl-no-compression']
 721         if 'Youtubedl-user-agent' in req.headers:
 722             if 'User-agent' in req.headers:
 723                 del req.headers['User-agent']
 724             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 725             del req.headers['Youtubedl-user-agent']
 726         return req
 727
 728     def http_response(self, req, resp):
 729         old_resp = resp
 730         # gzip
 731         if resp.headers.get('Content-encoding', '') == 'gzip':
 732             content = resp.read()
 733             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 734             try:
 735                 uncompressed = io.BytesIO(gz.read())
 736             except IOError as original_ioerror:
 737                 # There may be junk add the end of the file
 738                 # See http://stackoverflow.com/q/4928560/35070 for details
 739                 for i in range(1, 1024):
 740                     try:
 741                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 742                         uncompressed = io.BytesIO(gz.read())
 743                     except IOError:
 744                         continue
 745                     break
 746                 else:
 747                     raise original_ioerror
 748             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 749             resp.msg = old_resp.msg
 750         # deflate
 751         if resp.headers.get('Content-encoding', '') == 'deflate':
 752             gz = io.BytesIO(self.deflate(resp.read()))
 753             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 754             resp.msg = old_resp.msg
 755         return resp
 756
 757     https_request = http_request
 758     https_response = http_response
 759
 760
 761 def unified_strdate(date_str):
 762     """Return a string with the date in the format YYYYMMDD"""
 763     upload_date = None
 764     #Replace commas
 765     date_str = date_str.replace(',', ' ')
 766     # %z (UTC offset) is only supported in python>=3.2
 767     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 768     format_expressions = [
 769         '%d %B %Y',
 770         '%d %b %Y',
 771         '%B %d %Y',
 772         '%b %d %Y',
 773         '%Y-%m-%d',
 774         '%d/%m/%Y',
 775         '%Y/%m/%d %H:%M:%S',
 776         '%Y-%m-%d %H:%M:%S',
 777         '%d.%m.%Y %H:%M',
 778         '%Y-%m-%dT%H:%M:%SZ',
 779         '%Y-%m-%dT%H:%M:%S.%fZ',
 780         '%Y-%m-%dT%H:%M:%S.%f0Z',
 781         '%Y-%m-%dT%H:%M:%S',
 782         '%Y-%m-%dT%H:%M',
 783     ]
 784     for expression in format_expressions:
 785         try:
 786             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 787         except ValueError:
 788             pass
 789     if upload_date is None:
 790         timetuple = email.utils.parsedate_tz(date_str)
 791         if timetuple:
 792             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 793     return upload_date
 794
 795 def determine_ext(url, default_ext=u'unknown_video'):
 796     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 797     if re.match(r'^[A-Za-z0-9]+$', guess):
 798         return guess
 799     else:
 800         return default_ext
 801
 802 def subtitles_filename(filename, sub_lang, sub_format):
 803     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 804
 805 def date_from_str(date_str):
 806     """
 807     Return a datetime object from a string in the format YYYYMMDD or
 808     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 809     today = datetime.date.today()
 810     if date_str == 'now'or date_str == 'today':
 811         return today
 812     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 813     if match is not None:
 814         sign = match.group('sign')
 815         time = int(match.group('time'))
 816         if sign == '-':
 817             time = -time
 818         unit = match.group('unit')
 819         #A bad aproximation?
 820         if unit == 'month':
 821             unit = 'day'
 822             time *= 30
 823         elif unit == 'year':
 824             unit = 'day'
 825             time *= 365
 826         unit += 's'
 827         delta = datetime.timedelta(**{unit: time})
 828         return today + delta
 829     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 830
 831 def hyphenate_date(date_str):
 832     """
 833     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 834     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 835     if match is not None:
 836         return '-'.join(match.groups())
 837     else:
 838         return date_str
 839
 840 class DateRange(object):
 841     """Represents a time interval between two dates"""
 842     def __init__(self, start=None, end=None):
 843         """start and end must be strings in the format accepted by date"""
 844         if start is not None:
 845             self.start = date_from_str(start)
 846         else:
 847             self.start = datetime.datetime.min.date()
 848         if end is not None:
 849             self.end = date_from_str(end)
 850         else:
 851             self.end = datetime.datetime.max.date()
 852         if self.start > self.end:
 853             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 854     @classmethod
 855     def day(cls, day):
 856         """Returns a range that only contains the given day"""
 857         return cls(day,day)
 858     def __contains__(self, date):
 859         """Check if the date is in the range"""
 860         if not isinstance(date, datetime.date):
 861             date = date_from_str(date)
 862         return self.start <= date <= self.end
 863     def __str__(self):
 864         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 865
 866
 867 def platform_name():
 868     """ Returns the platform name as a compat_str """
 869     res = platform.platform()
 870     if isinstance(res, bytes):
 871         res = res.decode(preferredencoding())
 872
 873     assert isinstance(res, compat_str)
 874     return res
 875
 876
 877 def write_string(s, out=None):
 878     if out is None:
 879         out = sys.stderr
 880     assert type(s) == compat_str
 881
 882     if ('b' in getattr(out, 'mode', '') or
 883             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 884         s = s.encode(preferredencoding(), 'ignore')
 885     try:
 886         out.write(s)
 887     except UnicodeEncodeError:
 888         # In Windows shells, this can fail even when the codec is just charmap!?
 889         # See https://wiki.python.org/moin/PrintFails#Issue
 890         if sys.platform == 'win32' and hasattr(out, 'encoding'):
 891             s = s.encode(out.encoding, 'ignore').decode(out.encoding)
 892             out.write(s)
 893         else:
 894             raise
 895
 896     out.flush()
 897
 898
 899 def bytes_to_intlist(bs):
 900     if not bs:
 901         return []
 902     if isinstance(bs[0], int):  # Python 3
 903         return list(bs)
 904     else:
 905         return [ord(c) for c in bs]
 906
 907
 908 def intlist_to_bytes(xs):
 909     if not xs:
 910         return b''
 911     if isinstance(chr(0), bytes):  # Python 2
 912         return ''.join([chr(x) for x in xs])
 913     else:
 914         return bytes(xs)
 915
 916
 917 def get_cachedir(params={}):
 918     cache_root = os.environ.get('XDG_CACHE_HOME',
 919                                 os.path.expanduser('~/.cache'))
 920     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
 921
 922
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the running
# platform: LockFileEx/UnlockFileEx called through ctypes on Windows,
# fcntl.lockf everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the OVERLAPPED structure that LockFileEx and
    # UnlockFileEx take as their last argument
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    # Declare the prototypes explicitly so ctypes converts the arguments
    # and the BOOL result correctly
    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count passed as
    # nNumberOfBytesToLock{Low,High}; together with offset 0 they are meant
    # to span the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f from offset 0 via LockFileEx.

        exclusive selects flag 0x2 instead of 0x0 (presumably
        LOCKFILE_EXCLUSIVE_LOCK vs. a shared lock -- verify against the
        Win32 documentation).
        Raises OSError if LockFileEx reports failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can hand the very
        # same structure to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on the file object f.

        Raises OSError if UnlockFileEx reports failure."""
        # _lock_file must have run on this file object before
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f with fcntl.lockf: LOCK_EX if exclusive, else LOCK_SH.

        LOCK_NB is not passed."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the fcntl.lockf lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
 986
 987
 988 class locked_file(object):
 989     def __init__(self, filename, mode, encoding=None):
 990         assert mode in ['r', 'a', 'w']
 991         self.f = io.open(filename, mode, encoding=encoding)
 992         self.mode = mode
 993
 994     def __enter__(self):
 995         exclusive = self.mode != 'r'
 996         try:
 997             _lock_file(self.f, exclusive)
 998         except IOError:
 999             self.f.close()
1000             raise
1001         return self
1002
1003     def __exit__(self, etype, value, traceback):
1004         try:
1005             _unlock_file(self.f)
1006         finally:
1007             self.f.close()
1008
1009     def __iter__(self):
1010         return iter(self.f)
1011
1012     def write(self, *args):
1013         return self.f.write(*args)
1014
1015     def read(self, *args):
1016         return self.f.read(*args)
1017
1018
def shell_quote(args):
    """Return the arguments in args as one shell-escaped command line.

    Byte strings (e.g. file names produced by 'encodeFilename') are decoded
    with the file system encoding, falling back to UTF-8 when that is
    unknown. The quoted arguments are joined with single spaces.
    """
    # pipes.quote is an undocumented alias and the pipes module is gone in
    # Python 3.13; shlex.quote is the supported spelling where it exists
    try:
        from shlex import quote as quote_arg
    except ImportError:  # Python 2
        from pipes import quote as quote_arg

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote_arg(a))
    return u' '.join(quoted_args)
1030
1031
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that stops the
        iteration (the first element e for which pred(e) is false) """
    for element in seq:
        yield element
        if pred(element):
            continue
        return
1039
1040
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as a URL-encoded
    '__youtubedl_smuggle' fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return u'#'.join((url, fragment))
1047
1048
def unsmuggle_url(smug_url, default=None):
    """ Split a URL into the plain URL and the data smuggled into it.

    Returns (smug_url, default) when the URL carries no
    '__youtubedl_smuggle' fragment. """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition(u'#')
        smuggled = compat_parse_qs(sdata)[u'__youtubedl_smuggle']
        return url, json.loads(smuggled[0])
    return smug_url, default
1056
1057
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. u'1.50KiB'.

    bytes -- number of bytes as a number or numeric string; None yields
             u'N/A'

    Counts beyond the largest known unit are expressed in that unit (YiB)
    and counts below one byte in B.
    """
    SUFFIXES = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: huge values would raise IndexError and
        # tiny fractions would wrap around to a negative index
        exponent = max(0, min(exponent, len(SUFFIXES) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, SUFFIXES[exponent])
1070
1071
def str_to_int(int_str):
    """Parse an integer written with ',' or '.' between its digits."""
    return int(re.sub(r'[,\.]', u'', int_str))
1075
1076
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    A numeric COLUMNS environment variable takes precedence; otherwise the
    second field of the output of 'stty size' is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # Unusable COLUMNS value: fall through to asking stty
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except (OSError, ValueError, IndexError):
        # stty could not be run or printed nothing usable. Best effort
        # only, but KeyboardInterrupt/SystemExit are no longer swallowed
        return None
1091
1092
def month_by_name(name):
    """ Return the number (1-12) of the month with the given English name,
    independently of the locale; None if name is no such month """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1103
1104
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity or a character
    reference by '&amp;' in XML"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1111
1112
def setproctitle(title):
    """Set the name of the current process to title (best effort).

    title must be a compat_str. Nothing happens when libc.so.6 cannot be
    loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer from the encoded bytes, not from the character count:
    # non-ASCII titles take more than one byte per character in UTF-8.
    # create_string_buffer appends the terminating NUL itself.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # 15 is presumably PR_SET_NAME -- see prctl(2)
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1126
1127
def remove_start(s, start):
    """Return s without its leading start, or s itself when it does not
    begin with start."""
    return s[len(start):] if s.startswith(start) else s
1132
1133
def url_basename(url):
    """Return the last segment of the path of url, ignoring leading and
    trailing slashes of the path."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed_path.rpartition(u'/')[2]
1137
1138
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always reported as HEAD"""

    def get_method(self):
        return 'HEAD'
1142
1143
def int_or_none(v, scale=1):
    """Return int(v) floor-divided by scale, or None when v is None."""
    if v is None:
        return None
    return int(v) // scale
1146
1147
def parse_duration(s):
    """Convert a duration such as '43', '12:43', '9:12:43' or '3h11m53s'
    into a number of seconds.

    Returns None when s is None or not in a recognized format."""
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if not m:
        return None

    hours, mins, secs = m.group('hours', 'mins', 'secs')
    total = int(secs)
    if mins:
        total += 60 * int(mins)
        # The pattern only allows hours in front of minutes
        if hours:
            total += 60 * 60 * int(hours)
    return total
1162
1163
def prepend_extension(filename, ext):
    """Insert ext in front of the real extension of filename, e.g.
    'video.mp4' with ext 'temp' becomes 'video.temp.mp4'."""
    stem, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (stem, ext, real_ext)
1167
1168
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Wait for the program to finish; its output is discarded
        process.communicate()
    except OSError:
        return False
    return exe
1177
1178
class PagedList(object):
    """List-like view on results that are fetched one page at a time.

    pagefunc -- callable taking a zero-based page number and returning an
                iterable with the entries of that page
    pagesize -- number of entries on every page except possibly the last
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries from index start (inclusive) up to index end
        (exclusive, None meaning "until the last entry") as a list.

        Only the pages overlapping the requested range are fetched; an
        empty range (end <= start) fetches nothing. Indices are expected to
        be non-negative."""
        res = []
        if end is not None and end <= start:
            # Nothing requested; without this guard a range ending on a
            # page boundary would return that whole page
            return res

        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry within this page, or
            # None when the range extends beyond this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1224
1225
def uppercase_escape(s):
    """Replace each backslash-U escape followed by eight hex digits in s
    with the character that has this code point."""
    def _to_char(match):
        return compat_chr(int(match.group(1), base=16))

    return re.sub(r'\\U([0-9a-fA-F]{8})', _to_char, s)
1230
# struct_pack / struct_unpack: like struct.pack / struct.unpack, but always
# usable with a text format specification
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        encoded = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(encoded, *args)

    def struct_unpack(spec, *args):
        encoded = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(encoded, *args)
else:
    # Text format specifications work natively; use the originals directly
    struct_pack, struct_unpack = struct.pack, struct.unpack