_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import calendar
   5 import contextlib
   6 import ctypes
   7 import datetime
   8 import email.utils
   9 import errno
  10 import getpass
  11 import gzip
  12 import itertools
  13 import io
  14 import json
  15 import locale
  16 import math
  17 import os
  18 import pipes
  19 import platform
  20 import re
  21 import ssl
  22 import socket
  23 import struct
  24 import subprocess
  25 import sys
  26 import traceback
  27 import xml.etree.ElementTree
  28 import zlib
  29
  30 try:
  31     import urllib.request as compat_urllib_request
  32 except ImportError: # Python 2
  33     import urllib2 as compat_urllib_request
  34
  35 try:
  36     import urllib.error as compat_urllib_error
  37 except ImportError: # Python 2
  38     import urllib2 as compat_urllib_error
  39
  40 try:
  41     import urllib.parse as compat_urllib_parse
  42 except ImportError: # Python 2
  43     import urllib as compat_urllib_parse
  44
  45 try:
  46     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  47 except ImportError: # Python 2
  48     from urlparse import urlparse as compat_urllib_parse_urlparse
  49
  50 try:
  51     import urllib.parse as compat_urlparse
  52 except ImportError: # Python 2
  53     import urlparse as compat_urlparse
  54
  55 try:
  56     import http.cookiejar as compat_cookiejar
  57 except ImportError: # Python 2
  58     import cookielib as compat_cookiejar
  59
  60 try:
  61     import html.entities as compat_html_entities
  62 except ImportError: # Python 2
  63     import htmlentitydefs as compat_html_entities
  64
  65 try:
  66     import html.parser as compat_html_parser
  67 except ImportError: # Python 2
  68     import HTMLParser as compat_html_parser
  69
  70 try:
  71     import http.client as compat_http_client
  72 except ImportError: # Python 2
  73     import httplib as compat_http_client
  74
  75 try:
  76     from urllib.error import HTTPError as compat_HTTPError
  77 except ImportError:  # Python 2
  78     from urllib2 import HTTPError as compat_HTTPError
  79
  80 try:
  81     from urllib.request import urlretrieve as compat_urlretrieve
  82 except ImportError:  # Python 2
  83     from urllib import urlretrieve as compat_urlretrieve
  84
  85
# compat_subprocess_get_DEVNULL() returns something usable as a "discard
# output" target for subprocess calls.
try:
    # Use subprocess.DEVNULL when the running interpreter provides it.
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    # Fallback: open the platform null device for writing.  Every call opens
    # a new file object.
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  91
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in *string* and return the decoded text.

        Runs of consecutive escapes are collected as bytes and decoded
        together with *encoding* / *errors* (None selects 'utf-8' /
        'replace').  A '%' that is not followed by a decodable pair is
        kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 only: str.decode('hex') turns two hex digits into one byte
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: restore the '%' that split() removed
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse query string *qs* into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'.  Fields with an empty value are
        dropped unless *keep_blank_values* is true.  With *strict_parsing*,
        a field without '=' raises ValueError instead of being skipped.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' stands for a space; it is substituted before unquoting
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse query string *qs* into a dict mapping each name to a list of values.

        Values of a repeated name are appended in the order they appear.
        All keyword arguments are forwarded to _parse_qsl().
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
 170
# Text type: `unicode` where that builtin exists, otherwise `str`.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> character function: `unichr` where it exists, otherwise `chr`.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# XML parse exception: ElementTree's ParseError when importable, otherwise
# expat's ExpatError.
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
 185
 186 def compat_ord(c):
 187     if type(c) is int: return c
 188     else: return ord(c)
 189
# This is not clearly defined otherwise
# (obtained by compiling an empty pattern and taking the type of the result)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers.  YoutubeDLHandler.http_request() adds every
# entry of this dict to each request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 200
 201 def preferredencoding():
 202     """Get preferred encoding.
 203
 204     Returns the best encoding scheme for the system, based on
 205     locale.getpreferredencoding() and some further tweaks.
 206     """
 207     try:
 208         pref = locale.getpreferredencoding()
 209         u'TEST'.encode(pref)
 210     except:
 211         pref = 'UTF-8'
 212
 213     return pref
 214
 215 if sys.version_info < (3,0):
 216     def compat_print(s):
 217         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 218 else:
 219     def compat_print(s):
 220         assert type(s) == type(u'')
 221         print(s)
 222
 223 # In Python 2.x, json.dump expects a bytestream.
 224 # In Python 3.x, it writes to a character stream
 225 if sys.version_info < (3,0):
 226     def write_json_file(obj, fn):
 227         with open(fn, 'wb') as f:
 228             json.dump(obj, f)
 229 else:
 230     def write_json_file(obj, fn):
 231         with open(fn, 'w', encoding='utf-8') as f:
 232             json.dump(obj, f)
 233
 234 if sys.version_info >= (2,7):
 235     def find_xpath_attr(node, xpath, key, val):
 236         """ Find the xpath xpath[@key=val] """
 237         assert re.match(r'^[a-zA-Z]+$', key)
 238         assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
 239         expr = xpath + u"[@%s='%s']" % (key, val)
 240         return node.find(expr)
 241 else:
 242     def find_xpath_attr(node, xpath, key, val):
 243         for f in node.findall(xpath):
 244             if f.attrib.get(key) == val:
 245                 return f
 246         return None
 247
 248 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 249 # the namespace parameter
 250 def xpath_with_ns(path, ns_map):
 251     components = [c.split(':') for c in path.split('/')]
 252     replaced = []
 253     for c in components:
 254         if len(c) == 1:
 255             replaced.append(c[0])
 256         else:
 257             ns, tag = c
 258             replaced.append('{%s}%s' % (ns_map[ns], tag))
 259     return '/'.join(replaced)
 260
 261 def htmlentity_transform(matchobj):
 262     """Transforms an HTML entity to a character.
 263
 264     This function receives a match object and is intended to be used with
 265     the re.sub() function.
 266     """
 267     entity = matchobj.group(1)
 268
 269     # Known non-numeric HTML entity
 270     if entity in compat_html_entities.name2codepoint:
 271         return compat_chr(compat_html_entities.name2codepoint[entity])
 272
 273     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 274     if mobj is not None:
 275         numstr = mobj.group(1)
 276         if numstr.startswith(u'x'):
 277             base = 16
 278             numstr = u'0%s' % numstr
 279         else:
 280             base = 10
 281         return compat_chr(int(numstr, base))
 282
 283     # Unknown entity in name, return its literal representation
 284     return (u'&%s;' % entity)
 285
# Replace the HTML parser module's module-level start-tag pattern with the
# regex below (per the trailing note, a backported bugfix).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 287 class BaseHTMLParser(compat_html_parser.HTMLParser):
 288     def __init(self):
 289         compat_html_parser.HTMLParser.__init__(self)
 290         self.html = None
 291
 292     def loads(self, html):
 293         self.html = html
 294         self.feed(html)
 295         self.close()
 296
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    While parsing, ``self.result`` grows into ``[tag, start_pos, end_pos]``
    where both positions are getpos() values.  get_result() uses them to cut
    the element's content out of ``self.html``.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None  # becomes [tag, start_pos, end_pos]
        self.started = False  # True while inside the matched element
        self.depth = {}  # tag name -> count of currently open tags since the match
        self.watch_startpos = False  # True until the start position has been stored
        self.error_count = 0  # parse errors skipped so far
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Handle a parse error by skipping ahead in the input.

        Raises HTMLParseError instead once more than 10 errors have been
        skipped, or when the matched element has already been entered.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        """Detect the matching tag and count open tags once inside it."""
        attrs = dict(attrs)
        if self.started:
            # A nested tag also marks the point where the content begins
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Close one level of *tag*; store the end position once the matched
        tag's open count is back to zero."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event is routed to find_startpos as well, so the
    # first event after the opening tag records the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None when no complete [tag, start, end] was recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # result[1] / result[2] are (line, column) pairs; the line number is
        # used minus one as an index into the list of lines
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line, and the start column was
            # already cut off above, so the end column is shifted by it
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, AttrParser steps over the literal text
# "</scr'+'ipt>" itself and hands every other end tag to the stock
# HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 362
 363 def get_element_by_id(id, html):
 364     """Return the content of the tag with the specified ID in the passed HTML document"""
 365     return get_element_by_attribute("id", id, html)
 366
 367 def get_element_by_attribute(attribute, value, html):
 368     """Return the content of the tag with the specified attribute in the passed HTML document"""
 369     parser = AttrParser(attribute, value)
 370     try:
 371         parser.loads(html)
 372     except compat_html_parser.HTMLParseError:
 373         pass
 374     return parser.get_result()
 375
 376 class MetaParser(BaseHTMLParser):
 377     """
 378     Modified HTMLParser that isolates a meta tag with the specified name
 379     attribute.
 380     """
 381     def __init__(self, name):
 382         BaseHTMLParser.__init__(self)
 383         self.name = name
 384         self.content = None
 385         self.result = None
 386
 387     def handle_starttag(self, tag, attrs):
 388         if tag != 'meta':
 389             return
 390         attrs = dict(attrs)
 391         if attrs.get('name') == self.name:
 392             self.result = attrs.get('content')
 393
 394     def get_result(self):
 395         return self.result
 396
 397 def get_meta_content(name, html):
 398     """
 399     Return the content attribute from the meta tag with the given name attribute.
 400     """
 401     parser = MetaParser(name)
 402     try:
 403         parser.loads(html)
 404     except compat_html_parser.HTMLParseError:
 405         pass
 406     return parser.get_result()
 407
 408
 409 def clean_html(html):
 410     """Clean an HTML snippet into a readable string"""
 411     # Newline vs <br />
 412     html = html.replace('\n', ' ')
 413     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 414     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 415     # Strip html tags
 416     html = re.sub('<.*?>', '', html)
 417     # Replace html entities
 418     html = unescapeHTML(html)
 419     return html.strip()
 420
 421
 422 def sanitize_open(filename, open_mode):
 423     """Try to open the given filename, and slightly tweak it if this fails.
 424
 425     Attempts to open the given filename. If this fails, it tries to change
 426     the filename slightly, step by step, until it's either able to open it
 427     or it fails and raises a final exception, like the standard open()
 428     function.
 429
 430     It returns the tuple (stream, definitive_file_name).
 431     """
 432     try:
 433         if filename == u'-':
 434             if sys.platform == 'win32':
 435                 import msvcrt
 436                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 437             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 438         stream = open(encodeFilename(filename), open_mode)
 439         return (stream, filename)
 440     except (IOError, OSError) as err:
 441         if err.errno in (errno.EACCES,):
 442             raise
 443
 444         # In case of error, try to remove win32 forbidden chars
 445         alt_filename = os.path.join(
 446                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 447                         for path_part in os.path.split(filename)
 448                        )
 449         if alt_filename == filename:
 450             raise
 451         else:
 452             # An exception here should be caught in the caller
 453             stream = open(encodeFilename(filename), open_mode)
 454             return (stream, alt_filename)
 455
 456
 457 def timeconvert(timestr):
 458     """Convert RFC 2822 defined time string into system timestamp"""
 459     timestamp = None
 460     timetuple = email.utils.parsedate_tz(timestr)
 461     if timetuple is not None:
 462         timestamp = email.utils.mktime_tz(timetuple)
 463     return timestamp
 464
 465 def sanitize_filename(s, restricted=False, is_id=False):
 466     """Sanitizes a string so it could be used as part of a filename.
 467     If restricted is set, use a stricter subset of allowed characters.
 468     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 469     """
 470     def replace_insane(char):
 471         if char == '?' or ord(char) < 32 or ord(char) == 127:
 472             return ''
 473         elif char == '"':
 474             return '' if restricted else '\''
 475         elif char == ':':
 476             return '_-' if restricted else ' -'
 477         elif char in '\\/|*<>':
 478             return '_'
 479         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 480             return '_'
 481         if restricted and ord(char) > 127:
 482             return '_'
 483         return char
 484
 485     result = u''.join(map(replace_insane, s))
 486     if not is_id:
 487         while '__' in result:
 488             result = result.replace('__', '_')
 489         result = result.strip('_')
 490         # Common case of "Foreign band name - English song title"
 491         if restricted and result.startswith('-_'):
 492             result = result[2:]
 493         if not result:
 494             result = '_'
 495     return result
 496
 497 def orderedSet(iterable):
 498     """ Remove all duplicates from the input iterable """
 499     res = []
 500     for el in iterable:
 501         if el not in res:
 502             res.append(el)
 503     return res
 504
 505
 506 def unescapeHTML(s):
 507     if s is None:
 508         return None
 509     assert type(s) == compat_str
 510
 511     result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
 512     return result
 513
 514
 515 def encodeFilename(s, for_subprocess=False):
 516     """
 517     @param s The name of the file
 518     """
 519
 520     assert type(s) == compat_str
 521
 522     # Python 3 has a Unicode API
 523     if sys.version_info >= (3, 0):
 524         return s
 525
 526     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 527         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 528         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 529         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 530         if not for_subprocess:
 531             return s
 532         else:
 533             # For subprocess calls, encode with locale encoding
 534             # Refer to http://stackoverflow.com/a/9951851/35070
 535             encoding = preferredencoding()
 536     else:
 537         encoding = sys.getfilesystemencoding()
 538     if encoding is None:
 539         encoding = 'utf-8'
 540     return s.encode(encoding, 'ignore')
 541
 542 def decodeOption(optval):
 543     if optval is None:
 544         return optval
 545     if isinstance(optval, bytes):
 546         optval = optval.decode(preferredencoding())
 547
 548     assert isinstance(optval, compat_str)
 549     return optval
 550
 551 def formatSeconds(secs):
 552     if secs > 3600:
 553         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 554     elif secs > 60:
 555         return '%d:%02d' % (secs // 60, secs % 60)
 556     else:
 557         return '%d' % secs
 558
 559
 560 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 561     if sys.version_info < (3, 2):
 562         import httplib
 563
 564         class HTTPSConnectionV3(httplib.HTTPSConnection):
 565             def __init__(self, *args, **kwargs):
 566                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 567
 568             def connect(self):
 569                 sock = socket.create_connection((self.host, self.port), self.timeout)
 570                 if getattr(self, '_tunnel_host', False):
 571                     self.sock = sock
 572                     self._tunnel()
 573                 try:
 574                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 575                 except ssl.SSLError:
 576                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 577
 578         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 579             def https_open(self, req):
 580                 return self.do_open(HTTPSConnectionV3, req)
 581         return HTTPSHandlerV3(**kwargs)
 582     else:
 583         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 584         context.verify_mode = (ssl.CERT_NONE
 585                                if opts_no_check_certificate
 586                                else ssl.CERT_REQUIRED)
 587         context.set_default_verify_paths()
 588         try:
 589             context.load_default_certs()
 590         except AttributeError:
 591             pass  # Python < 3.4
 592         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 593
 594 class ExtractorError(Exception):
 595     """Error during info extraction."""
 596     def __init__(self, msg, tb=None, expected=False, cause=None):
 597         """ tb, if given, is the original traceback (so that it can be printed out).
 598         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 599         """
 600
 601         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 602             expected = True
 603         if not expected:
 604             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 605         super(ExtractorError, self).__init__(msg)
 606
 607         self.traceback = tb
 608         self.exc_info = sys.exc_info()  # preserve original exception
 609         self.cause = cause
 610
 611     def format_traceback(self):
 612         if self.traceback is None:
 613             return None
 614         return u''.join(traceback.format_tb(self.traceback))
 615
 616
 617 class RegexNotFoundError(ExtractorError):
 618     """Error when a regex didn't match"""
 619     pass
 620
 621
 622 class DownloadError(Exception):
 623     """Download Error exception.
 624
 625     This exception may be thrown by FileDownloader objects if they are not
 626     configured to continue on errors. They will contain the appropriate
 627     error message.
 628     """
 629     def __init__(self, msg, exc_info=None):
 630         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 631         super(DownloadError, self).__init__(msg)
 632         self.exc_info = exc_info
 633
 634
 635 class SameFileError(Exception):
 636     """Same File exception.
 637
 638     This exception will be thrown by FileDownloader objects if they detect
 639     multiple files would have to be downloaded to the same file on disk.
 640     """
 641     pass
 642
 643
 644 class PostProcessingError(Exception):
 645     """Post Processing exception.
 646
 647     This exception may be raised by PostProcessor's .run() method to
 648     indicate an error in the postprocessing task.
 649     """
 650     def __init__(self, msg):
 651         self.msg = msg
 652
 653 class MaxDownloadsReached(Exception):
 654     """ --max-downloads limit has been reached. """
 655     pass
 656
 657
 658 class UnavailableVideoError(Exception):
 659     """Unavailable Format exception.
 660
 661     This exception will be thrown when a video is requested
 662     in a format that is not available for that video.
 663     """
 664     pass
 665
 666
 667 class ContentTooShortError(Exception):
 668     """Content Too Short exception.
 669
 670     This exception may be raised by FileDownloader objects when a file they
 671     download is too small for what the server announced first, indicating
 672     the connection was probably interrupted.
 673     """
 674     # Both in bytes
 675     downloaded = None
 676     expected = None
 677
 678     def __init__(self, downloaded, expected):
 679         self.downloaded = downloaded
 680         self.expected = expected
 681
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded *data*.

        Decompression is first attempted with a negative window size and,
        if zlib rejects that, again with zlib's default settings.
        """
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status *code*.

        When addinfourl has no getcode attribute, the object is created
        without the code and ``code`` is set on it afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to *req* and process the internal
        "Youtubedl-..." control headers; returns the same request object."""
        # Every std_headers entry is (re)set on the request
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Control header: do not ask the server for compressed content
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Control header: its value replaces the User-agent header
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return *resp*, replaced by a decompressed copy when its
        Content-encoding header is 'gzip' or 'deflate'."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; the first
                # successful read wins.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic goes through the same request/response processing
    https_request = http_request
    https_response = http_response
 762
 763
 764 def parse_iso8601(date_str):
 765     """ Return a UNIX timestamp from the given date """
 766
 767     if date_str is None:
 768         return None
 769
 770     m = re.search(
 771         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
 772         date_str)
 773     if not m:
 774         timezone = datetime.timedelta()
 775     else:
 776         date_str = date_str[:-len(m.group(0))]
 777         if not m.group('sign'):
 778             timezone = datetime.timedelta()
 779         else:
 780             sign = 1 if m.group('sign') == '+' else -1
 781             timezone = datetime.timedelta(
 782                 hours=sign * int(m.group('hours')),
 783                 minutes=sign * int(m.group('minutes')))
 784
 785     dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
 786     return calendar.timegm(dt.timetuple())
 787
 788
 789 def unified_strdate(date_str):
 790     """Return a string with the date in the format YYYYMMDD"""
 791
 792     if date_str is None:
 793         return None
 794
 795     upload_date = None
 796     #Replace commas
 797     date_str = date_str.replace(',', ' ')
 798     # %z (UTC offset) is only supported in python>=3.2
 799     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 800     format_expressions = [
 801         '%d %B %Y',
 802         '%d %b %Y',
 803         '%B %d %Y',
 804         '%b %d %Y',
 805         '%Y-%m-%d',
 806         '%d.%m.%Y',
 807         '%d/%m/%Y',
 808         '%Y/%m/%d %H:%M:%S',
 809         '%Y-%m-%d %H:%M:%S',
 810         '%d.%m.%Y %H:%M',
 811         '%d.%m.%Y %H.%M',
 812         '%Y-%m-%dT%H:%M:%SZ',
 813         '%Y-%m-%dT%H:%M:%S.%fZ',
 814         '%Y-%m-%dT%H:%M:%S.%f0Z',
 815         '%Y-%m-%dT%H:%M:%S',
 816         '%Y-%m-%dT%H:%M:%S.%f',
 817         '%Y-%m-%dT%H:%M',
 818     ]
 819     for expression in format_expressions:
 820         try:
 821             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 822         except ValueError:
 823             pass
 824     if upload_date is None:
 825         timetuple = email.utils.parsedate_tz(date_str)
 826         if timetuple:
 827             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 828     return upload_date
 829
 830 def determine_ext(url, default_ext=u'unknown_video'):
 831     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 832     if re.match(r'^[A-Za-z0-9]+$', guess):
 833         return guess
 834     else:
 835         return default_ext
 836
 837 def subtitles_filename(filename, sub_lang, sub_format):
 838     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 839
 840 def date_from_str(date_str):
 841     """
 842     Return a datetime object from a string in the format YYYYMMDD or
 843     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 844     today = datetime.date.today()
 845     if date_str == 'now'or date_str == 'today':
 846         return today
 847     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 848     if match is not None:
 849         sign = match.group('sign')
 850         time = int(match.group('time'))
 851         if sign == '-':
 852             time = -time
 853         unit = match.group('unit')
 854         #A bad aproximation?
 855         if unit == 'month':
 856             unit = 'day'
 857             time *= 30
 858         elif unit == 'year':
 859             unit = 'day'
 860             time *= 365
 861         unit += 's'
 862         delta = datetime.timedelta(**{unit: time})
 863         return today + delta
 864     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 865
 866 def hyphenate_date(date_str):
 867     """
 868     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 869     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 870     if match is not None:
 871         return '-'.join(match.groups())
 872     else:
 873         return date_str
 874
 875 class DateRange(object):
 876     """Represents a time interval between two dates"""
 877     def __init__(self, start=None, end=None):
 878         """start and end must be strings in the format accepted by date"""
 879         if start is not None:
 880             self.start = date_from_str(start)
 881         else:
 882             self.start = datetime.datetime.min.date()
 883         if end is not None:
 884             self.end = date_from_str(end)
 885         else:
 886             self.end = datetime.datetime.max.date()
 887         if self.start > self.end:
 888             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 889     @classmethod
 890     def day(cls, day):
 891         """Returns a range that only contains the given day"""
 892         return cls(day,day)
 893     def __contains__(self, date):
 894         """Check if the date is in the range"""
 895         if not isinstance(date, datetime.date):
 896             date = date_from_str(date)
 897         return self.start <= date <= self.end
 898     def __str__(self):
 899         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 900
 901
 902 def platform_name():
 903     """ Returns the platform name as a compat_str """
 904     res = platform.platform()
 905     if isinstance(res, bytes):
 906         res = res.decode(preferredencoding())
 907
 908     assert isinstance(res, compat_str)
 909     return res
 910
 911
 912 def write_string(s, out=None):
 913     if out is None:
 914         out = sys.stderr
 915     assert type(s) == compat_str
 916
 917     if ('b' in getattr(out, 'mode', '') or
 918             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 919         s = s.encode(preferredencoding(), 'ignore')
 920     try:
 921         out.write(s)
 922     except UnicodeEncodeError:
 923         # In Windows shells, this can fail even when the codec is just charmap!?
 924         # See https://wiki.python.org/moin/PrintFails#Issue
 925         if sys.platform == 'win32' and hasattr(out, 'encoding'):
 926             s = s.encode(out.encoding, 'ignore').decode(out.encoding)
 927             out.write(s)
 928         else:
 929             raise
 930
 931     out.flush()
 932
 933
 934 def bytes_to_intlist(bs):
 935     if not bs:
 936         return []
 937     if isinstance(bs[0], int):  # Python 3
 938         return list(bs)
 939     else:
 940         return [ord(c) for c in bs]
 941
 942
 943 def intlist_to_bytes(xs):
 944     if not xs:
 945         return b''
 946     if isinstance(chr(0), bytes):  # Python 2
 947         return ''.join([chr(x) for x in xs])
 948     else:
 949         return bytes(xs)
 950
 951
 952 def get_cachedir(params={}):
 953     cache_root = os.environ.get('XDG_CACHE_HOME',
 954                                 os.path.expanduser('~/.cache'))
 955     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
 956
 957
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx through ctypes on Windows, fcntl.lockf
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes description of the structure passed as the last argument of
    # LockFileEx/UnlockFileEx (see the argtypes below)
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    # Declare the signatures explicitly so ctypes converts the arguments
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f with LockFileEx.

        Raises OSError if the call reports failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same
        # pointer to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock and 0x0 otherwise
        # (0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API)
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken on f by _lock_file.

        Raises OSError if the call reports failure."""
        # Set by _lock_file; unlocking a file that was never locked is a
        # programming error
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f with fcntl.lockf: LOCK_EX if exclusive,
        LOCK_SH otherwise."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock held on f (fcntl.LOCK_UN)."""
        fcntl.lockf(f, fcntl.LOCK_UN)
1021
1022
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened in the constructor with io.open.  Entering the
    context takes a shared lock for mode 'r' and an exclusive lock for 'a'
    and 'w'; leaving it releases the lock and closes the file.
    """
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the file never stays open when
            # locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1052
1053
def shell_quote(args):
    """Return the arguments in args as one shell-quoted string.

    Byte string arguments are decoded with the filesystem encoding
    (utf-8 if that is unknown) before quoting.
    """
    # pipes.quote is undocumented and the pipes module is gone from recent
    # Python versions; shlex.quote is the supported equivalent
    try:
        from shlex import quote
    except ImportError:  # Python < 3.3
        from pipes import quote

    quoted_args = []
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1065
1066
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ends the
        run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        return
1074
1075
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a URL-encoded fragment
    under the key __youtubedl_smuggle. """

    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1082
1083
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default). """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
1091
1092
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. u'1.50KiB'.

    bytes may be a number, a numeric string or None (rendered as u'N/A').
    Values smaller than one byte are shown in B and values beyond the
    largest known unit are shown in YiB.  Negative values raise ValueError
    (from math.log).
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix table: a negative exponent (tiny
        # values) would silently pick a suffix from the end, a too large
        # one would raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1105
1106
def str_to_int(int_str):
    """Parse an integer written with ',' or '.' as digit group separators"""
    return int(re.sub(r'[,\.]', u'', int_str))
1110
1111
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the output of `stty size` is consulted on a best-effort basis.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # Unusable COLUMNS value, ask the terminal instead
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort: stty may be missing, fail or print nothing usable.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        pass
    return None
1126
1127
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
        independently of the locale; None for anything else """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1138
1139
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity or a numeric
    character reference by '&amp;' in XML"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub(u'&amp;', xml_str)
1146
1147
def setproctitle(title):
    """Set the process name to title via prctl from libc.so.6.

    title must be a compat_str.  Nothing happens (and nothing is raised)
    when libc.so.6 cannot be loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title, so the C string handed to prctl is NUL-terminated.  A
    # buffer of exactly len(title_bytes) has no room for the terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1161
1162
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it
    does not begin with start"""
    return s[len(start):] if s.startswith(start) else s
1167
1168
def url_basename(url):
    """Return the last component of the path of url (query string and
    surrounding slashes are not part of it)"""
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed_path.rpartition(u'/')[2]
1172
1173
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD"""

    def get_method(self):
        return 'HEAD'
1177
1178
def int_or_none(v, scale=1, default=None):
    """Return int(v) floor-divided by scale, or default if v is None"""
    if v is None:
        return default
    return int(v) // scale
1181
1182
def float_or_none(v, scale=1, default=None):
    """Return float(v) divided by scale, or default if v is None"""
    if v is None:
        return default
    return float(v) / scale
1185
1186
def parse_duration(s):
    """Parse a duration such as '9:12:43', '62m45s' or '53' into a number
    of seconds; returns None for None or for strings that do not match"""
    if s is None:
        return None

    parsed = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if parsed is None:
        return None

    hours, mins, secs = parsed.group('hours', 'mins', 'secs')
    total = int(secs)
    if mins:
        total += 60 * int(mins)
        # The pattern only allows hours in front of minutes
        if hours:
            total += 60 * 60 * int(hours)
    return total
1201
1202
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename,
    e.g. 'abc.ext' with 'temp' becomes 'abc.temp.ext'"""
    stem, original_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, original_ext)
1206
1207
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary could not be started. """
    # None instead of a shared mutable [] default; list() also lets callers
    # pass any sequence, not just a list
    command = [exe] + list(args or [])
    try:
        subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1216
1217
class PagedList(object):
    """Lazily paged sequence.

    pagefunc(pagenum) must return an iterable with the entries of the page
    with that zero-based number; every page except the last one is expected
    to hold pagesize entries.
    """
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list,
        fetching only the pages that are needed"""
        size = self._pagesize
        collected = []
        for page_no in itertools.count(start // size):
            page_begin = page_no * size
            page_end = page_begin + size
            if start >= page_end:
                continue

            entries = list(self._pagefunc(page_no))

            # Offset of the first wanted entry inside this page
            if page_begin <= start < page_end:
                lo = start % size
            else:
                lo = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and page_begin <= end <= page_end:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_end:
                break
        return collected
1263
1264
def uppercase_escape(s):
    """Replace literal \\UXXXXXXXX escape sequences (backslash, capital U,
    eight hex digits) in s by the characters they denote"""
    # str has no .decode() on Python 3; the codec function works on both
    # Python 2 and 3
    import codecs
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0], s)
1269
# Probe whether struct accepts a text format string on this interpreter
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _struct_spec_to_bytes(spec):
        # Text format strings are ASCII-encoded, anything else is passed on
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_struct_spec_to_bytes(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_struct_spec_to_bytes(spec), *args)
else:
    # Text format strings work, use the struct functions directly
    struct_pack, struct_unpack = struct.pack, struct.unpack
1286
1287
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and close it.

    batch_fd is an iterable of lines (text or UTF-8 encoded bytes) with a
    close() method.  Surrounding whitespace and a leading byte order mark
    are stripped; empty lines and lines starting with '#', ';' or ']' are
    skipped.  Returns the remaining lines as a list.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # u'\ufeff' is a correctly decoded BOM; u'\xef\xbb\xbf' is what the
        # same three bytes look like when they were decoded one-to-one
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1302
1303
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode does and return the result
    as ASCII bytes, ready to be used as a POST body"""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1306
1307
def parse_xml(s):
    """Parse the XML document in the text s and return its root element;
    doctype declarations are ignored"""
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    builder = _DoctypeIgnoringBuilder()
    parser = xml.etree.ElementTree.XMLParser(target=builder)
    # The parser is only handed over on Python >= 2.7
    if sys.version_info >= (2, 7):
        return xml.etree.ElementTree.XML(s.encode('utf-8'), parser=parser)
    return xml.etree.ElementTree.XML(s.encode('utf-8'))
1316
1317
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    # Python 2 on Windows: a text prompt is encoded with the preferred
    # encoding before it is handed to getpass
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1325
1326
# Maps US content rating labels to an integer.
# NOTE(review): the values look like minimum viewer ages - confirm against
# the callers before relying on that.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1334
1335
def strip_jsonp(code):
    """Return the argument of a JSONP call of the form `callback(...);`;
    text that does not have that shape is returned unchanged"""
    jsonp_wrapper = re.compile(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$')
    return jsonp_wrapper.sub(r'\1', code)