_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import ctypes
   5 import datetime
   6 import email.utils
   7 import errno
   8 import gzip
   9 import itertools
  10 import io
  11 import json
  12 import locale
  13 import math
  14 import os
  15 import pipes
  16 import platform
  17 import re
  18 import ssl
  19 import socket
  20 import subprocess
  21 import sys
  22 import traceback
  23 import zlib
  24
  25 try:
  26     import urllib.request as compat_urllib_request
  27 except ImportError: # Python 2
  28     import urllib2 as compat_urllib_request
  29
  30 try:
  31     import urllib.error as compat_urllib_error
  32 except ImportError: # Python 2
  33     import urllib2 as compat_urllib_error
  34
  35 try:
  36     import urllib.parse as compat_urllib_parse
  37 except ImportError: # Python 2
  38     import urllib as compat_urllib_parse
  39
  40 try:
  41     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  42 except ImportError: # Python 2
  43     from urlparse import urlparse as compat_urllib_parse_urlparse
  44
  45 try:
  46     import urllib.parse as compat_urlparse
  47 except ImportError: # Python 2
  48     import urlparse as compat_urlparse
  49
  50 try:
  51     import http.cookiejar as compat_cookiejar
  52 except ImportError: # Python 2
  53     import cookielib as compat_cookiejar
  54
  55 try:
  56     import html.entities as compat_html_entities
  57 except ImportError: # Python 2
  58     import htmlentitydefs as compat_html_entities
  59
  60 try:
  61     import html.parser as compat_html_parser
  62 except ImportError: # Python 2
  63     import HTMLParser as compat_html_parser
  64
  65 try:
  66     import http.client as compat_http_client
  67 except ImportError: # Python 2
  68     import httplib as compat_http_client
  69
  70 try:
  71     from urllib.error import HTTPError as compat_HTTPError
  72 except ImportError:  # Python 2
  73     from urllib2 import HTTPError as compat_HTTPError
  74
  75 try:
  76     from urllib.request import urlretrieve as compat_urlretrieve
  77 except ImportError:  # Python 2
  78     from urllib import urlretrieve as compat_urlretrieve
  79
  80
# compat_subprocess_get_DEVNULL() returns something usable as a "discard"
# target: subprocess.DEVNULL when the subprocess module provides it, otherwise
# a freshly opened writable handle on os.path.devnull.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    # NOTE: each call opens a new file object; nothing here closes it.
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  86
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with the given encoding/errors policy (None selects
        'utf-8' / 'replace'). Strings without any '%' are returned as is.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # NOTE(review): only ValueError is handled below; confirm which
                # exception .decode('hex') raises for non-hex input.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: keep the text literally, '%' included
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. Fields without '=' raise
        ValueError when strict_parsing is set; otherwise they are dropped
        unless keep_blank_values is set, in which case the value is ''.
        Pairs with an empty value are only kept when keep_blank_values is
        set. '+' is turned into a space before the %xx escapes are decoded.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to the list of
        its values, in order of appearance. Arguments are passed through to
        _parse_qsl()."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
 165
# compat_str: the text (unicode) string type of the running interpreter.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: turns a code point into a one-character text string.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 175
 176 def compat_ord(c):
 177     if type(c) is int: return c
 178     else: return ord(c)
 179
# This is not clearly defined otherwise
# (the type of compiled pattern objects, obtained by compiling an empty regex)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers; YoutubeDLHandler.http_request() adds every
# entry of this dict to the requests it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 190
 191 def preferredencoding():
 192     """Get preferred encoding.
 193
 194     Returns the best encoding scheme for the system, based on
 195     locale.getpreferredencoding() and some further tweaks.
 196     """
 197     try:
 198         pref = locale.getpreferredencoding()
 199         u'TEST'.encode(pref)
 200     except:
 201         pref = 'UTF-8'
 202
 203     return pref
 204
 205 if sys.version_info < (3,0):
 206     def compat_print(s):
 207         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 208 else:
 209     def compat_print(s):
 210         assert type(s) == type(u'')
 211         print(s)
 212
 213 # In Python 2.x, json.dump expects a bytestream.
 214 # In Python 3.x, it writes to a character stream
 215 if sys.version_info < (3,0):
 216     def write_json_file(obj, fn):
 217         with open(fn, 'wb') as f:
 218             json.dump(obj, f)
 219 else:
 220     def write_json_file(obj, fn):
 221         with open(fn, 'w', encoding='utf-8') as f:
 222             json.dump(obj, f)
 223
 224 if sys.version_info >= (2,7):
 225     def find_xpath_attr(node, xpath, key, val):
 226         """ Find the xpath xpath[@key=val] """
 227         assert re.match(r'^[a-zA-Z]+$', key)
 228         assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
 229         expr = xpath + u"[@%s='%s']" % (key, val)
 230         return node.find(expr)
 231 else:
 232     def find_xpath_attr(node, xpath, key, val):
 233         for f in node.findall(xpath):
 234             if f.attrib.get(key) == val:
 235                 return f
 236         return None
 237
 238 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 239 # the namespace parameter
 240 def xpath_with_ns(path, ns_map):
 241     components = [c.split(':') for c in path.split('/')]
 242     replaced = []
 243     for c in components:
 244         if len(c) == 1:
 245             replaced.append(c[0])
 246         else:
 247             ns, tag = c
 248             replaced.append('{%s}%s' % (ns_map[ns], tag))
 249     return '/'.join(replaced)
 250
 251 def htmlentity_transform(matchobj):
 252     """Transforms an HTML entity to a character.
 253
 254     This function receives a match object and is intended to be used with
 255     the re.sub() function.
 256     """
 257     entity = matchobj.group(1)
 258
 259     # Known non-numeric HTML entity
 260     if entity in compat_html_entities.name2codepoint:
 261         return compat_chr(compat_html_entities.name2codepoint[entity])
 262
 263     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 264     if mobj is not None:
 265         numstr = mobj.group(1)
 266         if numstr.startswith(u'x'):
 267             base = 16
 268             numstr = u'0%s' % numstr
 269         else:
 270             base = 10
 271         return compat_chr(int(numstr, base))
 272
 273     # Unknown entity in name, return its literal representation
 274     return (u'&%s;' % entity)
 275
# Replace the start-tag matching regex of the HTML parser module with a
# backported version.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 277 class BaseHTMLParser(compat_html_parser.HTMLParser):
 278     def __init(self):
 279         compat_html_parser.HTMLParser.__init__(self)
 280         self.html = None
 281
 282     def loads(self, html):
 283         self.html = html
 284         self.feed(html)
 285         self.close()
 286
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # result grows to [tag, start_pos, end_pos]; positions are the
        # (line, column) tuples returned by getpos()
        self.result = None
        # True while inside the element being captured
        self.started = False
        # Per-tag-name count of currently open tags inside the captured element
        self.depth = {}
        # True between the matching start tag and the next parser event
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Give up after too many errors, or if the error happens inside the
        # element being captured; otherwise skip ahead and keep parsing.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is also the first event after the opening
            # tag, so it may fix the start position
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The captured element ends when the count for its own tag name
            # drops back to zero
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event records the start position the same way
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped source text between the recorded start and
        end positions, or None if no complete element was captured."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() line numbers are 1-based
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column has to be
            # shifted by what was just cut off the front
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, step over the literal text
# "</scr'+'ipt>" instead of handing it to HTMLParser.parse_endtag().
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 352
 353 def get_element_by_id(id, html):
 354     """Return the content of the tag with the specified ID in the passed HTML document"""
 355     return get_element_by_attribute("id", id, html)
 356
 357 def get_element_by_attribute(attribute, value, html):
 358     """Return the content of the tag with the specified attribute in the passed HTML document"""
 359     parser = AttrParser(attribute, value)
 360     try:
 361         parser.loads(html)
 362     except compat_html_parser.HTMLParseError:
 363         pass
 364     return parser.get_result()
 365
 366 class MetaParser(BaseHTMLParser):
 367     """
 368     Modified HTMLParser that isolates a meta tag with the specified name
 369     attribute.
 370     """
 371     def __init__(self, name):
 372         BaseHTMLParser.__init__(self)
 373         self.name = name
 374         self.content = None
 375         self.result = None
 376
 377     def handle_starttag(self, tag, attrs):
 378         if tag != 'meta':
 379             return
 380         attrs = dict(attrs)
 381         if attrs.get('name') == self.name:
 382             self.result = attrs.get('content')
 383
 384     def get_result(self):
 385         return self.result
 386
 387 def get_meta_content(name, html):
 388     """
 389     Return the content attribute from the meta tag with the given name attribute.
 390     """
 391     parser = MetaParser(name)
 392     try:
 393         parser.loads(html)
 394     except compat_html_parser.HTMLParseError:
 395         pass
 396     return parser.get_result()
 397
 398
 399 def clean_html(html):
 400     """Clean an HTML snippet into a readable string"""
 401     # Newline vs <br />
 402     html = html.replace('\n', ' ')
 403     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 404     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 405     # Strip html tags
 406     html = re.sub('<.*?>', '', html)
 407     # Replace html entities
 408     html = unescapeHTML(html)
 409     return html.strip()
 410
 411
 412 def sanitize_open(filename, open_mode):
 413     """Try to open the given filename, and slightly tweak it if this fails.
 414
 415     Attempts to open the given filename. If this fails, it tries to change
 416     the filename slightly, step by step, until it's either able to open it
 417     or it fails and raises a final exception, like the standard open()
 418     function.
 419
 420     It returns the tuple (stream, definitive_file_name).
 421     """
 422     try:
 423         if filename == u'-':
 424             if sys.platform == 'win32':
 425                 import msvcrt
 426                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 427             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 428         stream = open(encodeFilename(filename), open_mode)
 429         return (stream, filename)
 430     except (IOError, OSError) as err:
 431         if err.errno in (errno.EACCES,):
 432             raise
 433
 434         # In case of error, try to remove win32 forbidden chars
 435         alt_filename = os.path.join(
 436                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 437                         for path_part in os.path.split(filename)
 438                        )
 439         if alt_filename == filename:
 440             raise
 441         else:
 442             # An exception here should be caught in the caller
 443             stream = open(encodeFilename(filename), open_mode)
 444             return (stream, alt_filename)
 445
 446
 447 def timeconvert(timestr):
 448     """Convert RFC 2822 defined time string into system timestamp"""
 449     timestamp = None
 450     timetuple = email.utils.parsedate_tz(timestr)
 451     if timetuple is not None:
 452         timestamp = email.utils.mktime_tz(timetuple)
 453     return timestamp
 454
 455 def sanitize_filename(s, restricted=False, is_id=False):
 456     """Sanitizes a string so it could be used as part of a filename.
 457     If restricted is set, use a stricter subset of allowed characters.
 458     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 459     """
 460     def replace_insane(char):
 461         if char == '?' or ord(char) < 32 or ord(char) == 127:
 462             return ''
 463         elif char == '"':
 464             return '' if restricted else '\''
 465         elif char == ':':
 466             return '_-' if restricted else ' -'
 467         elif char in '\\/|*<>':
 468             return '_'
 469         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 470             return '_'
 471         if restricted and ord(char) > 127:
 472             return '_'
 473         return char
 474
 475     result = u''.join(map(replace_insane, s))
 476     if not is_id:
 477         while '__' in result:
 478             result = result.replace('__', '_')
 479         result = result.strip('_')
 480         # Common case of "Foreign band name - English song title"
 481         if restricted and result.startswith('-_'):
 482             result = result[2:]
 483         if not result:
 484             result = '_'
 485     return result
 486
 487 def orderedSet(iterable):
 488     """ Remove all duplicates from the input iterable """
 489     res = []
 490     for el in iterable:
 491         if el not in res:
 492             res.append(el)
 493     return res
 494
 495 def unescapeHTML(s):
 496     """
 497     @param s a string
 498     """
 499     assert type(s) == type(u'')
 500
 501     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 502     return result
 503
 504
 505 def encodeFilename(s, for_subprocess=False):
 506     """
 507     @param s The name of the file
 508     """
 509
 510     assert type(s) == compat_str
 511
 512     # Python 3 has a Unicode API
 513     if sys.version_info >= (3, 0):
 514         return s
 515
 516     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 517         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 518         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 519         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 520         if not for_subprocess:
 521             return s
 522         else:
 523             # For subprocess calls, encode with locale encoding
 524             # Refer to http://stackoverflow.com/a/9951851/35070
 525             encoding = preferredencoding()
 526     else:
 527         encoding = sys.getfilesystemencoding()
 528     if encoding is None:
 529         encoding = 'utf-8'
 530     return s.encode(encoding, 'ignore')
 531
 532
 533 def decodeOption(optval):
 534     if optval is None:
 535         return optval
 536     if isinstance(optval, bytes):
 537         optval = optval.decode(preferredencoding())
 538
 539     assert isinstance(optval, compat_str)
 540     return optval
 541
 542 def formatSeconds(secs):
 543     if secs > 3600:
 544         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 545     elif secs > 60:
 546         return '%d:%02d' % (secs // 60, secs % 60)
 547     else:
 548         return '%d' % secs
 549
 550
 551 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 552     if sys.version_info < (3, 2):
 553         import httplib
 554
 555         class HTTPSConnectionV3(httplib.HTTPSConnection):
 556             def __init__(self, *args, **kwargs):
 557                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 558
 559             def connect(self):
 560                 sock = socket.create_connection((self.host, self.port), self.timeout)
 561                 if getattr(self, '_tunnel_host', False):
 562                     self.sock = sock
 563                     self._tunnel()
 564                 try:
 565                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 566                 except ssl.SSLError:
 567                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 568
 569         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 570             def https_open(self, req):
 571                 return self.do_open(HTTPSConnectionV3, req)
 572         return HTTPSHandlerV3(**kwargs)
 573     else:
 574         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 575         context.verify_mode = (ssl.CERT_NONE
 576                                if opts_no_check_certificate
 577                                else ssl.CERT_REQUIRED)
 578         context.set_default_verify_paths()
 579         try:
 580             context.load_default_certs()
 581         except AttributeError:
 582             pass  # Python < 3.4
 583         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 584
 585 class ExtractorError(Exception):
 586     """Error during info extraction."""
 587     def __init__(self, msg, tb=None, expected=False, cause=None):
 588         """ tb, if given, is the original traceback (so that it can be printed out).
 589         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 590         """
 591
 592         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 593             expected = True
 594         if not expected:
 595             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 596         super(ExtractorError, self).__init__(msg)
 597
 598         self.traceback = tb
 599         self.exc_info = sys.exc_info()  # preserve original exception
 600         self.cause = cause
 601
 602     def format_traceback(self):
 603         if self.traceback is None:
 604             return None
 605         return u''.join(traceback.format_tb(self.traceback))
 606
 607
 608 class RegexNotFoundError(ExtractorError):
 609     """Error when a regex didn't match"""
 610     pass
 611
 612
 613 class DownloadError(Exception):
 614     """Download Error exception.
 615
 616     This exception may be thrown by FileDownloader objects if they are not
 617     configured to continue on errors. They will contain the appropriate
 618     error message.
 619     """
 620     def __init__(self, msg, exc_info=None):
 621         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 622         super(DownloadError, self).__init__(msg)
 623         self.exc_info = exc_info
 624
 625
 626 class SameFileError(Exception):
 627     """Same File exception.
 628
 629     This exception will be thrown by FileDownloader objects if they detect
 630     multiple files would have to be downloaded to the same file on disk.
 631     """
 632     pass
 633
 634
 635 class PostProcessingError(Exception):
 636     """Post Processing exception.
 637
 638     This exception may be raised by PostProcessor's .run() method to
 639     indicate an error in the postprocessing task.
 640     """
 641     def __init__(self, msg):
 642         self.msg = msg
 643
 644 class MaxDownloadsReached(Exception):
 645     """ --max-downloads limit has been reached. """
 646     pass
 647
 648
 649 class UnavailableVideoError(Exception):
 650     """Unavailable Format exception.
 651
 652     This exception will be thrown when a video is requested
 653     in a format that is not available for that video.
 654     """
 655     pass
 656
 657
 658 class ContentTooShortError(Exception):
 659     """Content Too Short exception.
 660
 661     This exception may be raised by FileDownloader objects when a file they
 662     download is too small for what the server announced first, indicating
 663     the connection was probably interrupted.
 664     """
 665     # Both in bytes
 666     downloaded = None
 667     expected = None
 668
 669     def __init__(self, downloaded, expected):
 670         self.downloaded = downloaded
 671         self.expected = expected
 672
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded data."""
        try:
            # Negative window bits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Otherwise retry as a regular zlib stream
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the given status code."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        # This addinfourl takes no code argument: set the attribute by hand
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and process the internal
        Youtubedl-* pseudo headers. Returns the modified request."""
        for h,v in std_headers.items():
            # Drop an entry stored under the same key before adding ours
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # Compression was disabled for this request
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            # Request-specific User-Agent override
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return resp with a gzip/deflate Content-encoding undone; other
        responses are returned unchanged."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the first error
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic is processed exactly like HTTP traffic
    https_request = http_request
    https_response = http_response
 753
 754
 755 def unified_strdate(date_str):
 756     """Return a string with the date in the format YYYYMMDD"""
 757     upload_date = None
 758     #Replace commas
 759     date_str = date_str.replace(',', ' ')
 760     # %z (UTC offset) is only supported in python>=3.2
 761     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 762     format_expressions = [
 763         '%d %B %Y',
 764         '%B %d %Y',
 765         '%b %d %Y',
 766         '%Y-%m-%d',
 767         '%d/%m/%Y',
 768         '%Y/%m/%d %H:%M:%S',
 769         '%Y-%m-%d %H:%M:%S',
 770         '%d.%m.%Y %H:%M',
 771         '%Y-%m-%dT%H:%M:%SZ',
 772         '%Y-%m-%dT%H:%M:%S.%fZ',
 773         '%Y-%m-%dT%H:%M:%S.%f0Z',
 774         '%Y-%m-%dT%H:%M:%S',
 775         '%Y-%m-%dT%H:%M',
 776     ]
 777     for expression in format_expressions:
 778         try:
 779             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 780         except ValueError:
 781             pass
 782     if upload_date is None:
 783         timetuple = email.utils.parsedate_tz(date_str)
 784         if timetuple:
 785             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 786     return upload_date
 787
 788 def determine_ext(url, default_ext=u'unknown_video'):
 789     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 790     if re.match(r'^[A-Za-z0-9]+$', guess):
 791         return guess
 792     else:
 793         return default_ext
 794
 795 def subtitles_filename(filename, sub_lang, sub_format):
 796     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 797
 798 def date_from_str(date_str):
 799     """
 800     Return a datetime object from a string in the format YYYYMMDD or
 801     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 802     today = datetime.date.today()
 803     if date_str == 'now'or date_str == 'today':
 804         return today
 805     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 806     if match is not None:
 807         sign = match.group('sign')
 808         time = int(match.group('time'))
 809         if sign == '-':
 810             time = -time
 811         unit = match.group('unit')
 812         #A bad aproximation?
 813         if unit == 'month':
 814             unit = 'day'
 815             time *= 30
 816         elif unit == 'year':
 817             unit = 'day'
 818             time *= 365
 819         unit += 's'
 820         delta = datetime.timedelta(**{unit: time})
 821         return today + delta
 822     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 823
 824 def hyphenate_date(date_str):
 825     """
 826     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 827     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 828     if match is not None:
 829         return '-'.join(match.groups())
 830     else:
 831         return date_str
 832
 833 class DateRange(object):
 834     """Represents a time interval between two dates"""
 835     def __init__(self, start=None, end=None):
 836         """start and end must be strings in the format accepted by date"""
 837         if start is not None:
 838             self.start = date_from_str(start)
 839         else:
 840             self.start = datetime.datetime.min.date()
 841         if end is not None:
 842             self.end = date_from_str(end)
 843         else:
 844             self.end = datetime.datetime.max.date()
 845         if self.start > self.end:
 846             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 847     @classmethod
 848     def day(cls, day):
 849         """Returns a range that only contains the given day"""
 850         return cls(day,day)
 851     def __contains__(self, date):
 852         """Check if the date is in the range"""
 853         if not isinstance(date, datetime.date):
 854             date = date_from_str(date)
 855         return self.start <= date <= self.end
 856     def __str__(self):
 857         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 858
 859
 860 def platform_name():
 861     """ Returns the platform name as a compat_str """
 862     res = platform.platform()
 863     if isinstance(res, bytes):
 864         res = res.decode(preferredencoding())
 865
 866     assert isinstance(res, compat_str)
 867     return res
 868
 869
 870 def write_string(s, out=None):
 871     if out is None:
 872         out = sys.stderr
 873     assert type(s) == compat_str
 874
 875     if ('b' in getattr(out, 'mode', '') or
 876             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 877         s = s.encode(preferredencoding(), 'ignore')
 878     try:
 879         out.write(s)
 880     except UnicodeEncodeError:
 881         # In Windows shells, this can fail even when the codec is just charmap!?
 882         # See https://wiki.python.org/moin/PrintFails#Issue
 883         if sys.platform == 'win32' and hasattr(out, 'encoding'):
 884             s = s.encode(out.encoding, 'ignore').decode(out.encoding)
 885             out.write(s)
 886         else:
 887             raise
 888
 889     out.flush()
 890
 891
 892 def bytes_to_intlist(bs):
 893     if not bs:
 894         return []
 895     if isinstance(bs[0], int):  # Python 3
 896         return list(bs)
 897     else:
 898         return [ord(c) for c in bs]
 899
 900
 901 def intlist_to_bytes(xs):
 902     if not xs:
 903         return b''
 904     if isinstance(chr(0), bytes):  # Python 2
 905         return ''.join([chr(x) for x in xs])
 906     else:
 907         return bytes(xs)
 908
 909
 910 def get_cachedir(params={}):
 911     cache_root = os.environ.get('XDG_CACHE_HOME',
 912                                 os.path.expanduser('~/.cache'))
 913     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
 914
 915
# Cross-platform file locking
# Exactly one pair of _lock_file/_unlock_file helpers is defined below,
# selected at import time from sys.platform.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes structure passed (by pointer) as the final "Overlapped"
    # argument of LockFileEx/UnlockFileEx.
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    # Declare argument/return types so ctypes marshals the calls correctly.
    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the byte count passed to both calls; together with
    # the zero offset set below they are meant to cover the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f through LockFileEx.

        exclusive selects the dwFlags value.  Raises OSError if the call
        reports failure.
        """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same
        # structure back to UnlockFileEx.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock and 0x0 otherwise
        # (presumably LOCKFILE_EXCLUSIVE_LOCK - confirm against the Win32 docs)
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on f.

        Requires that _lock_file was called on f first (it reads the
        attribute set there).  Raises OSError if UnlockFileEx reports
        failure.
        """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f with fcntl.lockf: LOCK_EX when exclusive is
        true, LOCK_SH otherwise."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the fcntl.lockf lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
 979
 980
 981 class locked_file(object):
 982     def __init__(self, filename, mode, encoding=None):
 983         assert mode in ['r', 'a', 'w']
 984         self.f = io.open(filename, mode, encoding=encoding)
 985         self.mode = mode
 986
 987     def __enter__(self):
 988         exclusive = self.mode != 'r'
 989         try:
 990             _lock_file(self.f, exclusive)
 991         except IOError:
 992             self.f.close()
 993             raise
 994         return self
 995
 996     def __exit__(self, etype, value, traceback):
 997         try:
 998             _unlock_file(self.f)
 999         finally:
1000             self.f.close()
1001
1002     def __iter__(self):
1003         return iter(self.f)
1004
1005     def write(self, *args):
1006         return self.f.write(*args)
1007
1008     def read(self, *args):
1009         return self.f.read(*args)
1010
1011
def shell_quote(args):
    """Return the arguments as one shell-escaped command line string.

    args is an iterable of text or byte strings; byte strings are decoded
    with the filesystem encoding (utf-8 if that is unknown) before being
    quoted.  The quoted arguments are joined with single spaces.
    """
    # pipes.quote is undocumented and the pipes module is gone in
    # Python 3.13; shlex.quote is the supported spelling where available.
    try:
        from shlex import quote as quote_arg
    except ImportError:  # Python 2
        from pipes import quote as quote_arg

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote_arg(a))
    return u' '.join(quoted_args)
1023
1024
def takewhile_inclusive(pred, seq):
    """ Yield elements of seq up to and including the first one for which
        pred is false (itertools.takewhile would leave that one out) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1032
1033
def smuggle_url(url, data):
    """ Attach JSON-serializable data to a URL for internal use.

    The data is JSON-encoded, URL-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return u'#'.join((url, fragment))
1040
1041
def unsmuggle_url(smug_url, default=None):
    """ Split a URL carrying smuggled data into (url, data).

    If smug_url has no '#__youtubedl_smuggle' marker it is returned
    unchanged together with default. """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    payload = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1049
1050
def format_bytes(bytes):
    """Format a byte count as a string with a binary (1024-based) suffix.

    bytes may be a number, a str holding a number, or None (which yields
    'N/A').  The result has two decimals, e.g. '1.50KiB'.  Values too
    small or too large for the suffix table use its first ('B') or last
    ('YiB') entry.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp: a negative exponent (value < 1/1024) would index the table
        # from its end, and one past the table would raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1063
1064
def str_to_int(int_str):
    """Parse an integer written with ',' or '.' as digit-group separators."""
    return int(re.sub(r'[,\.]', u'', int_str))
1068
1069
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the second field printed by `stty size` is used.  Any
    failure while asking stty yields None.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # malformed COLUMNS; fall back to asking the terminal

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort (no stty, no tty, unexpected output), but unlike a
        # bare except this lets KeyboardInterrupt/SystemExit through.
        pass
    return None
1084
1085
def month_by_name(name):
    """ Map a full English month name (exact spelling and case) to its
    number 1-12, independently of the locale; None for anything else """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1096
1097
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    An '&' is left alone when it starts one of the predefined entities
    (amp, lt, gt, apos, quot) or a numeric character reference with at
    least one digit (decimal '&#...;' or hexadecimal '&#x...;').
    """
    # '+' rather than '{,4}': references beyond four digits (such as
    # &#x1F600;) are valid and must not be escaped, while a digit-less
    # '&#;' is not a reference at all.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        u'&amp;',
        xml_str)
1104
1105
def setproctitle(title):
    """Try to set the process name via prctl() from libc.so.6.

    title must be a compat_str.  Does nothing if libc.so.6 cannot be
    loaded or exposes no prctl symbol.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Size the buffer by the encoded length (+1 for the trailing NUL):
    # non-ASCII titles take more bytes than they have characters.
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # presumably 15 is PR_SET_NAME - confirm against <linux/prctl.h>
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1119
1120
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it lacks it."""
    return s[len(start):] if s.startswith(start) else s
1125
1126
def url_basename(url):
    """Return the last component of the URL's path, with leading and
    trailing slashes of the path ignored."""
    trimmed = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed.rpartition(u'/')[2]
1130
1131
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose get_method() always reports HEAD."""

    def get_method(self):
        """Return the HTTP method name for this request."""
        return 'HEAD'
1135
1136
def int_or_none(v, scale=1):
    """Return int(v) floor-divided by scale, or None when v is None."""
    if v is None:
        return v
    return int(v) // scale
1139
1140
def parse_duration(s):
    """Convert '[[H:]M:]S' (decimal digits only) into a number of seconds.

    Returns None when s is None or does not have that form.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)$', s)
    if not m:
        return None
    hours, mins, secs = m.group('hours', 'mins', 'secs')
    total = int(secs)
    if mins:
        total += 60 * int(mins)
    # the pattern only allows an hours field in front of a minutes field
    if mins and hours:
        total += 60 * 60 * int(hours)
    return total
1155
1156
def prepend_extension(filename, ext):
    """Insert ext in front of filename's existing extension
    ('a.mp4', 'tmp' gives 'a.tmp.mp4'; 'a', 'tmp' gives 'a.tmp')."""
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1160
1161
def check_executable(exe, args=[]):
    """ Try to run exe with args (pick arguments that give short output,
    like -version), discarding its output.
    Returns exe if it could be started, False if starting it raised OSError. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1170
1171
class PagedList(object):
    """List-like view over results that are fetched one page at a time.

    pagefunc(pagenum) must return an iterable with the entries of page
    pagenum (counted from 0); pagesize is the number of entries on a full
    page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list,
        fetching only the pages that are needed; end=None means all
        remaining entries."""
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_limit = page_first + size
            if start >= page_limit:
                continue

            page = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            lo = 0
            if page_first <= start < page_limit:
                lo = start % size

            # Offset just past the last wanted entry, if end is on this page
            hi = None
            if end is not None and page_first <= end <= page_limit:
                hi = ((end - 1) % size) + 1

            # [0:None] selects everything, so slicing is always safe
            page = page[lo:hi]
            collected.extend(page)

            # A page that is not "full" is the last one, and a full page
            # ending exactly at end leaves nothing of interest on the next
            # one; either way there is no need to query again.
            if len(page) + lo < size or end == page_limit:
                break
        return collected
1217
1218
def uppercase_escape(s):
    """Replace each literal '\\UXXXXXXXX' sequence (backslash, 'U', eight hex
    digits) in s by the character with that code point."""
    def _decode(match):
        return compat_chr(int(match.group(1), 16))

    return re.sub(r'\\U([0-9a-fA-F]{8})', _decode, s)