_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import ctypes
   5 import datetime
   6 import email.utils
   7 import errno
   8 import gzip
   9 import itertools
  10 import io
  11 import json
  12 import locale
  13 import math
  14 import os
  15 import pipes
  16 import platform
  17 import re
  18 import ssl
  19 import socket
  20 import subprocess
  21 import sys
  22 import traceback
  23 import zlib
  24
  25 try:
  26     import urllib.request as compat_urllib_request
  27 except ImportError: # Python 2
  28     import urllib2 as compat_urllib_request
  29
  30 try:
  31     import urllib.error as compat_urllib_error
  32 except ImportError: # Python 2
  33     import urllib2 as compat_urllib_error
  34
  35 try:
  36     import urllib.parse as compat_urllib_parse
  37 except ImportError: # Python 2
  38     import urllib as compat_urllib_parse
  39
  40 try:
  41     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  42 except ImportError: # Python 2
  43     from urlparse import urlparse as compat_urllib_parse_urlparse
  44
  45 try:
  46     import urllib.parse as compat_urlparse
  47 except ImportError: # Python 2
  48     import urlparse as compat_urlparse
  49
  50 try:
  51     import http.cookiejar as compat_cookiejar
  52 except ImportError: # Python 2
  53     import cookielib as compat_cookiejar
  54
  55 try:
  56     import html.entities as compat_html_entities
  57 except ImportError: # Python 2
  58     import htmlentitydefs as compat_html_entities
  59
  60 try:
  61     import html.parser as compat_html_parser
  62 except ImportError: # Python 2
  63     import HTMLParser as compat_html_parser
  64
  65 try:
  66     import http.client as compat_http_client
  67 except ImportError: # Python 2
  68     import httplib as compat_http_client
  69
  70 try:
  71     from urllib.error import HTTPError as compat_HTTPError
  72 except ImportError:  # Python 2
  73     from urllib2 import HTTPError as compat_HTTPError
  74
  75 try:
  76     from urllib.request import urlretrieve as compat_urlretrieve
  77 except ImportError:  # Python 2
  78     from urllib import urlretrieve as compat_urlretrieve
  79
  80
# compat_subprocess_get_DEVNULL is a zero-argument callable returning an
# object usable as a "discard output" target for subprocess calls.
# When subprocess.DEVNULL can be imported it is returned as-is; otherwise
# every call opens os.path.devnull for writing and returns that new file
# object (which the caller then owns).
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  86
  87 try:
  88     from urllib.parse import parse_qs as compat_parse_qs
  89 except ImportError: # Python 2
  90     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  91     # Python 2's version is apparently totally broken
  92     def _unquote(string, encoding='utf-8', errors='replace'):
  93         if string == '':
  94             return string
  95         res = string.split('%')
  96         if len(res) == 1:
  97             return string
  98         if encoding is None:
  99             encoding = 'utf-8'
 100         if errors is None:
 101             errors = 'replace'
 102         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
 103         pct_sequence = b''
 104         string = res[0]
 105         for item in res[1:]:
 106             try:
 107                 if not item:
 108                     raise ValueError
 109                 pct_sequence += item[:2].decode('hex')
 110                 rest = item[2:]
 111                 if not rest:
 112                     # This segment was just a single percent-encoded character.
 113                     # May be part of a sequence of code units, so delay decoding.
 114                     # (Stored in pct_sequence).
 115                     continue
 116             except ValueError:
 117                 rest = '%' + item
 118             # Encountered non-percent-encoded characters. Flush the current
 119             # pct_sequence.
 120             string += pct_sequence.decode(encoding, errors) + rest
 121             pct_sequence = b''
 122         if pct_sequence:
 123             # Flush the final pct_sequence
 124             string += pct_sequence.decode(encoding, errors)
 125         return string
 126
 127     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 128                 encoding='utf-8', errors='replace'):
 129         qs, _coerce_result = qs, unicode
 130         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 131         r = []
 132         for name_value in pairs:
 133             if not name_value and not strict_parsing:
 134                 continue
 135             nv = name_value.split('=', 1)
 136             if len(nv) != 2:
 137                 if strict_parsing:
 138                     raise ValueError("bad query field: %r" % (name_value,))
 139                 # Handle case of a control-name with no equal sign
 140                 if keep_blank_values:
 141                     nv.append('')
 142                 else:
 143                     continue
 144             if len(nv[1]) or keep_blank_values:
 145                 name = nv[0].replace('+', ' ')
 146                 name = _unquote(name, encoding=encoding, errors=errors)
 147                 name = _coerce_result(name)
 148                 value = nv[1].replace('+', ' ')
 149                 value = _unquote(value, encoding=encoding, errors=errors)
 150                 value = _coerce_result(value)
 151                 r.append((name, value))
 152         return r
 153
 154     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 155                 encoding='utf-8', errors='replace'):
 156         parsed_result = {}
 157         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 158                         encoding=encoding, errors=errors)
 159         for name, value in pairs:
 160             if name in parsed_result:
 161                 parsed_result[name].append(value)
 162             else:
 163                 parsed_result[name] = [value]
 164         return parsed_result
 165
# compat_str: the text string type. The name `unicode` only exists on
# Python 2; when looking it up raises NameError, `str` is used instead.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: turns a code point into a one-character text string. The name
# `unichr` only exists on Python 2; otherwise `chr` is used.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 175
 176 def compat_ord(c):
 177     if type(c) is int: return c
 178     else: return ord(c)
 179
# The type of compiled regular expression objects, obtained by compiling an
# empty pattern (it is not clearly defined otherwise).
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers, keyed by header name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 190
 191 def preferredencoding():
 192     """Get preferred encoding.
 193
 194     Returns the best encoding scheme for the system, based on
 195     locale.getpreferredencoding() and some further tweaks.
 196     """
 197     try:
 198         pref = locale.getpreferredencoding()
 199         u'TEST'.encode(pref)
 200     except:
 201         pref = 'UTF-8'
 202
 203     return pref
 204
 205 if sys.version_info < (3,0):
 206     def compat_print(s):
 207         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 208 else:
 209     def compat_print(s):
 210         assert type(s) == type(u'')
 211         print(s)
 212
 213 # In Python 2.x, json.dump expects a bytestream.
 214 # In Python 3.x, it writes to a character stream
 215 if sys.version_info < (3,0):
 216     def write_json_file(obj, fn):
 217         with open(fn, 'wb') as f:
 218             json.dump(obj, f)
 219 else:
 220     def write_json_file(obj, fn):
 221         with open(fn, 'w', encoding='utf-8') as f:
 222             json.dump(obj, f)
 223
 224 if sys.version_info >= (2,7):
 225     def find_xpath_attr(node, xpath, key, val):
 226         """ Find the xpath xpath[@key=val] """
 227         assert re.match(r'^[a-zA-Z]+$', key)
 228         assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
 229         expr = xpath + u"[@%s='%s']" % (key, val)
 230         return node.find(expr)
 231 else:
 232     def find_xpath_attr(node, xpath, key, val):
 233         for f in node.findall(xpath):
 234             if f.attrib.get(key) == val:
 235                 return f
 236         return None
 237
 238 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 239 # the namespace parameter
 240 def xpath_with_ns(path, ns_map):
 241     components = [c.split(':') for c in path.split('/')]
 242     replaced = []
 243     for c in components:
 244         if len(c) == 1:
 245             replaced.append(c[0])
 246         else:
 247             ns, tag = c
 248             replaced.append('{%s}%s' % (ns_map[ns], tag))
 249     return '/'.join(replaced)
 250
 251 def htmlentity_transform(matchobj):
 252     """Transforms an HTML entity to a character.
 253
 254     This function receives a match object and is intended to be used with
 255     the re.sub() function.
 256     """
 257     entity = matchobj.group(1)
 258
 259     # Known non-numeric HTML entity
 260     if entity in compat_html_entities.name2codepoint:
 261         return compat_chr(compat_html_entities.name2codepoint[entity])
 262
 263     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 264     if mobj is not None:
 265         numstr = mobj.group(1)
 266         if numstr.startswith(u'x'):
 267             base = 16
 268             numstr = u'0%s' % numstr
 269         else:
 270             base = 10
 271         return compat_chr(int(numstr, base))
 272
 273     # Unknown entity in name, return its literal representation
 274     return (u'&%s;' % entity)
 275
 276 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 277 class BaseHTMLParser(compat_html_parser.HTMLParser):
 278     def __init(self):
 279         compat_html_parser.HTMLParser.__init__(self)
 280         self.html = None
 281
 282     def loads(self, html):
 283         self.html = html
 284         self.feed(html)
 285         self.close()
 286
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    While parsing, self.result grows to [tag, start_pos, end_pos], where the
    positions are self.getpos() values; get_result() uses them to slice the
    element content out of self.html.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None
        # True while the parser is inside the matched element
        self.started = False
        # Per tag name: count of start tags minus end tags seen since the match
        self.depth = {}
        # True between the matched start tag and the next parser event
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Handle a parse error reported with message.

        Raises HTMLParseError once more than 10 errors were seen or when
        already inside the matched element; otherwise drops the input up to
        and including the current line and resumes parsing.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        """Start the capture on a matching tag and track tag nesting."""
        attrs = dict(attrs)
        if self.started:
            # A nested tag is the first event after the matched start tag
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Update nesting counts; finish when the matched tag is balanced."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event also records the pending start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None unless both positions were recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Line numbers from getpos() are used as 1-based here
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line: the end offset must be shifted by
            # the characters already cut off at the start
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, an end tag spelled "</scr'+'ipt>" is
# skipped over instead of being handed to HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 352
 353 def get_element_by_id(id, html):
 354     """Return the content of the tag with the specified ID in the passed HTML document"""
 355     return get_element_by_attribute("id", id, html)
 356
 357 def get_element_by_attribute(attribute, value, html):
 358     """Return the content of the tag with the specified attribute in the passed HTML document"""
 359     parser = AttrParser(attribute, value)
 360     try:
 361         parser.loads(html)
 362     except compat_html_parser.HTMLParseError:
 363         pass
 364     return parser.get_result()
 365
 366 class MetaParser(BaseHTMLParser):
 367     """
 368     Modified HTMLParser that isolates a meta tag with the specified name
 369     attribute.
 370     """
 371     def __init__(self, name):
 372         BaseHTMLParser.__init__(self)
 373         self.name = name
 374         self.content = None
 375         self.result = None
 376
 377     def handle_starttag(self, tag, attrs):
 378         if tag != 'meta':
 379             return
 380         attrs = dict(attrs)
 381         if attrs.get('name') == self.name:
 382             self.result = attrs.get('content')
 383
 384     def get_result(self):
 385         return self.result
 386
 387 def get_meta_content(name, html):
 388     """
 389     Return the content attribute from the meta tag with the given name attribute.
 390     """
 391     parser = MetaParser(name)
 392     try:
 393         parser.loads(html)
 394     except compat_html_parser.HTMLParseError:
 395         pass
 396     return parser.get_result()
 397
 398
 399 def clean_html(html):
 400     """Clean an HTML snippet into a readable string"""
 401     # Newline vs <br />
 402     html = html.replace('\n', ' ')
 403     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 404     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 405     # Strip html tags
 406     html = re.sub('<.*?>', '', html)
 407     # Replace html entities
 408     html = unescapeHTML(html)
 409     return html.strip()
 410
 411
 412 def sanitize_open(filename, open_mode):
 413     """Try to open the given filename, and slightly tweak it if this fails.
 414
 415     Attempts to open the given filename. If this fails, it tries to change
 416     the filename slightly, step by step, until it's either able to open it
 417     or it fails and raises a final exception, like the standard open()
 418     function.
 419
 420     It returns the tuple (stream, definitive_file_name).
 421     """
 422     try:
 423         if filename == u'-':
 424             if sys.platform == 'win32':
 425                 import msvcrt
 426                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 427             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 428         stream = open(encodeFilename(filename), open_mode)
 429         return (stream, filename)
 430     except (IOError, OSError) as err:
 431         if err.errno in (errno.EACCES,):
 432             raise
 433
 434         # In case of error, try to remove win32 forbidden chars
 435         alt_filename = os.path.join(
 436                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 437                         for path_part in os.path.split(filename)
 438                        )
 439         if alt_filename == filename:
 440             raise
 441         else:
 442             # An exception here should be caught in the caller
 443             stream = open(encodeFilename(filename), open_mode)
 444             return (stream, alt_filename)
 445
 446
 447 def timeconvert(timestr):
 448     """Convert RFC 2822 defined time string into system timestamp"""
 449     timestamp = None
 450     timetuple = email.utils.parsedate_tz(timestr)
 451     if timetuple is not None:
 452         timestamp = email.utils.mktime_tz(timetuple)
 453     return timestamp
 454
 455 def sanitize_filename(s, restricted=False, is_id=False):
 456     """Sanitizes a string so it could be used as part of a filename.
 457     If restricted is set, use a stricter subset of allowed characters.
 458     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 459     """
 460     def replace_insane(char):
 461         if char == '?' or ord(char) < 32 or ord(char) == 127:
 462             return ''
 463         elif char == '"':
 464             return '' if restricted else '\''
 465         elif char == ':':
 466             return '_-' if restricted else ' -'
 467         elif char in '\\/|*<>':
 468             return '_'
 469         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 470             return '_'
 471         if restricted and ord(char) > 127:
 472             return '_'
 473         return char
 474
 475     result = u''.join(map(replace_insane, s))
 476     if not is_id:
 477         while '__' in result:
 478             result = result.replace('__', '_')
 479         result = result.strip('_')
 480         # Common case of "Foreign band name - English song title"
 481         if restricted and result.startswith('-_'):
 482             result = result[2:]
 483         if not result:
 484             result = '_'
 485     return result
 486
 487 def orderedSet(iterable):
 488     """ Remove all duplicates from the input iterable """
 489     res = []
 490     for el in iterable:
 491         if el not in res:
 492             res.append(el)
 493     return res
 494
 495 def unescapeHTML(s):
 496     """
 497     @param s a string
 498     """
 499     assert type(s) == type(u'')
 500
 501     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 502     return result
 503
 504
 505 def encodeFilename(s, for_subprocess=False):
 506     """
 507     @param s The name of the file
 508     """
 509
 510     assert type(s) == compat_str
 511
 512     # Python 3 has a Unicode API
 513     if sys.version_info >= (3, 0):
 514         return s
 515
 516     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 517         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 518         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 519         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 520         if not for_subprocess:
 521             return s
 522         else:
 523             # For subprocess calls, encode with locale encoding
 524             # Refer to http://stackoverflow.com/a/9951851/35070
 525             encoding = preferredencoding()
 526     else:
 527         encoding = sys.getfilesystemencoding()
 528     if encoding is None:
 529         encoding = 'utf-8'
 530     return s.encode(encoding, 'ignore')
 531
 532
 533 def decodeOption(optval):
 534     if optval is None:
 535         return optval
 536     if isinstance(optval, bytes):
 537         optval = optval.decode(preferredencoding())
 538
 539     assert isinstance(optval, compat_str)
 540     return optval
 541
 542 def formatSeconds(secs):
 543     if secs > 3600:
 544         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 545     elif secs > 60:
 546         return '%d:%02d' % (secs // 60, secs % 60)
 547     else:
 548         return '%d' % secs
 549
 550
 551 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 552     if sys.version_info < (3, 2):
 553         import httplib
 554
 555         class HTTPSConnectionV3(httplib.HTTPSConnection):
 556             def __init__(self, *args, **kwargs):
 557                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 558
 559             def connect(self):
 560                 sock = socket.create_connection((self.host, self.port), self.timeout)
 561                 if getattr(self, '_tunnel_host', False):
 562                     self.sock = sock
 563                     self._tunnel()
 564                 try:
 565                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 566                 except ssl.SSLError:
 567                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 568
 569         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 570             def https_open(self, req):
 571                 return self.do_open(HTTPSConnectionV3, req)
 572         return HTTPSHandlerV3(**kwargs)
 573     else:
 574         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 575         context.verify_mode = (ssl.CERT_NONE
 576                                if opts_no_check_certificate
 577                                else ssl.CERT_REQUIRED)
 578         context.set_default_verify_paths()
 579         try:
 580             context.load_default_certs()
 581         except AttributeError:
 582             pass  # Python < 3.4
 583         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 584
 585 class ExtractorError(Exception):
 586     """Error during info extraction."""
 587     def __init__(self, msg, tb=None, expected=False, cause=None):
 588         """ tb, if given, is the original traceback (so that it can be printed out).
 589         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 590         """
 591
 592         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 593             expected = True
 594         if not expected:
 595             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 596         super(ExtractorError, self).__init__(msg)
 597
 598         self.traceback = tb
 599         self.exc_info = sys.exc_info()  # preserve original exception
 600         self.cause = cause
 601
 602     def format_traceback(self):
 603         if self.traceback is None:
 604             return None
 605         return u''.join(traceback.format_tb(self.traceback))
 606
 607
 608 class RegexNotFoundError(ExtractorError):
 609     """Error when a regex didn't match"""
 610     pass
 611
 612
 613 class DownloadError(Exception):
 614     """Download Error exception.
 615
 616     This exception may be thrown by FileDownloader objects if they are not
 617     configured to continue on errors. They will contain the appropriate
 618     error message.
 619     """
 620     def __init__(self, msg, exc_info=None):
 621         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 622         super(DownloadError, self).__init__(msg)
 623         self.exc_info = exc_info
 624
 625
 626 class SameFileError(Exception):
 627     """Same File exception.
 628
 629     This exception will be thrown by FileDownloader objects if they detect
 630     multiple files would have to be downloaded to the same file on disk.
 631     """
 632     pass
 633
 634
 635 class PostProcessingError(Exception):
 636     """Post Processing exception.
 637
 638     This exception may be raised by PostProcessor's .run() method to
 639     indicate an error in the postprocessing task.
 640     """
 641     def __init__(self, msg):
 642         self.msg = msg
 643
 644 class MaxDownloadsReached(Exception):
 645     """ --max-downloads limit has been reached. """
 646     pass
 647
 648
 649 class UnavailableVideoError(Exception):
 650     """Unavailable Format exception.
 651
 652     This exception will be thrown when a video is requested
 653     in a format that is not available for that video.
 654     """
 655     pass
 656
 657
 658 class ContentTooShortError(Exception):
 659     """Content Too Short exception.
 660
 661     This exception may be raised by FileDownloader objects when a file they
 662     download is too small for what the server announced first, indicating
 663     the connection was probably interrupted.
 664     """
 665     # Both in bytes
 666     downloaded = None
 667     expected = None
 668
 669     def __init__(self, downloaded, expected):
 670         self.downloaded = downloaded
 671         self.expected = expected
 672
 673 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 674     """Handler for HTTP requests and responses.
 675
 676     This class, when installed with an OpenerDirector, automatically adds
 677     the standard headers to every HTTP request and handles gzipped and
 678     deflated responses from web servers. If compression is to be avoided in
 679     a particular request, the original request in the program code only has
 680     to include the HTTP header "Youtubedl-No-Compression", which will be
 681     removed before making the real request.
 682
 683     Part of this code was copied from:
 684
 685     http://techknack.net/python-urllib2-handlers/
 686
 687     Andrew Rowls, the author of that code, agreed to release it to the
 688     public domain.
 689     """
 690
 691     @staticmethod
 692     def deflate(data):
 693         try:
 694             return zlib.decompress(data, -zlib.MAX_WBITS)
 695         except zlib.error:
 696             return zlib.decompress(data)
 697
 698     @staticmethod
 699     def addinfourl_wrapper(stream, headers, url, code):
 700         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 701             return compat_urllib_request.addinfourl(stream, headers, url, code)
 702         ret = compat_urllib_request.addinfourl(stream, headers, url)
 703         ret.code = code
 704         return ret
 705
 706     def http_request(self, req):
 707         for h,v in std_headers.items():
 708             if h in req.headers:
 709                 del req.headers[h]
 710             req.add_header(h, v)
 711         if 'Youtubedl-no-compression' in req.headers:
 712             if 'Accept-encoding' in req.headers:
 713                 del req.headers['Accept-encoding']
 714             del req.headers['Youtubedl-no-compression']
 715         if 'Youtubedl-user-agent' in req.headers:
 716             if 'User-agent' in req.headers:
 717                 del req.headers['User-agent']
 718             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 719             del req.headers['Youtubedl-user-agent']
 720         return req
 721
 722     def http_response(self, req, resp):
 723         old_resp = resp
 724         # gzip
 725         if resp.headers.get('Content-encoding', '') == 'gzip':
 726             content = resp.read()
 727             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 728             try:
 729                 uncompressed = io.BytesIO(gz.read())
 730             except IOError as original_ioerror:
 731                 # There may be junk add the end of the file
 732                 # See http://stackoverflow.com/q/4928560/35070 for details
 733                 for i in range(1, 1024):
 734                     try:
 735                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 736                         uncompressed = io.BytesIO(gz.read())
 737                     except IOError:
 738                         continue
 739                     break
 740                 else:
 741                     raise original_ioerror
 742             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 743             resp.msg = old_resp.msg
 744         # deflate
 745         if resp.headers.get('Content-encoding', '') == 'deflate':
 746             gz = io.BytesIO(self.deflate(resp.read()))
 747             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 748             resp.msg = old_resp.msg
 749         return resp
 750
 751     https_request = http_request
 752     https_response = http_response
 753
 754 def unified_strdate(date_str):
 755     """Return a string with the date in the format YYYYMMDD"""
 756     upload_date = None
 757     #Replace commas
 758     date_str = date_str.replace(',',' ')
 759     # %z (UTC offset) is only supported in python>=3.2
 760     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
 761     format_expressions = [
 762         '%d %B %Y',
 763         '%B %d %Y',
 764         '%b %d %Y',
 765         '%Y-%m-%d',
 766         '%d/%m/%Y',
 767         '%Y/%m/%d %H:%M:%S',
 768         '%Y-%m-%d %H:%M:%S',
 769         '%d.%m.%Y %H:%M',
 770         '%Y-%m-%dT%H:%M:%SZ',
 771         '%Y-%m-%dT%H:%M:%S.%fZ',
 772         '%Y-%m-%dT%H:%M:%S.%f0Z',
 773         '%Y-%m-%dT%H:%M:%S',
 774     ]
 775     for expression in format_expressions:
 776         try:
 777             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 778         except:
 779             pass
 780     if upload_date is None:
 781         timetuple = email.utils.parsedate_tz(date_str)
 782         if timetuple:
 783             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 784     return upload_date
 785
 786 def determine_ext(url, default_ext=u'unknown_video'):
 787     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 788     if re.match(r'^[A-Za-z0-9]+$', guess):
 789         return guess
 790     else:
 791         return default_ext
 792
 793 def subtitles_filename(filename, sub_lang, sub_format):
 794     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 795
 796 def date_from_str(date_str):
 797     """
 798     Return a datetime object from a string in the format YYYYMMDD or
 799     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 800     today = datetime.date.today()
 801     if date_str == 'now'or date_str == 'today':
 802         return today
 803     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 804     if match is not None:
 805         sign = match.group('sign')
 806         time = int(match.group('time'))
 807         if sign == '-':
 808             time = -time
 809         unit = match.group('unit')
 810         #A bad aproximation?
 811         if unit == 'month':
 812             unit = 'day'
 813             time *= 30
 814         elif unit == 'year':
 815             unit = 'day'
 816             time *= 365
 817         unit += 's'
 818         delta = datetime.timedelta(**{unit: time})
 819         return today + delta
 820     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 821
 822 def hyphenate_date(date_str):
 823     """
 824     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 825     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 826     if match is not None:
 827         return '-'.join(match.groups())
 828     else:
 829         return date_str
 830
 831 class DateRange(object):
 832     """Represents a time interval between two dates"""
 833     def __init__(self, start=None, end=None):
 834         """start and end must be strings in the format accepted by date"""
 835         if start is not None:
 836             self.start = date_from_str(start)
 837         else:
 838             self.start = datetime.datetime.min.date()
 839         if end is not None:
 840             self.end = date_from_str(end)
 841         else:
 842             self.end = datetime.datetime.max.date()
 843         if self.start > self.end:
 844             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 845     @classmethod
 846     def day(cls, day):
 847         """Returns a range that only contains the given day"""
 848         return cls(day,day)
 849     def __contains__(self, date):
 850         """Check if the date is in the range"""
 851         if not isinstance(date, datetime.date):
 852             date = date_from_str(date)
 853         return self.start <= date <= self.end
 854     def __str__(self):
 855         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 856
 857
 858 def platform_name():
 859     """ Returns the platform name as a compat_str """
 860     res = platform.platform()
 861     if isinstance(res, bytes):
 862         res = res.decode(preferredencoding())
 863
 864     assert isinstance(res, compat_str)
 865     return res
 866
 867
 868 def write_string(s, out=None):
 869     if out is None:
 870         out = sys.stderr
 871     assert type(s) == compat_str
 872
 873     if ('b' in getattr(out, 'mode', '') or
 874             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 875         s = s.encode(preferredencoding(), 'ignore')
 876     try:
 877         out.write(s)
 878     except UnicodeEncodeError:
 879         # In Windows shells, this can fail even when the codec is just charmap!?
 880         # See https://wiki.python.org/moin/PrintFails#Issue
 881         if sys.platform == 'win32' and hasattr(out, 'encoding'):
 882             s = s.encode(out.encoding, 'ignore').decode(out.encoding)
 883             out.write(s)
 884         else:
 885             raise
 886
 887     out.flush()
 888
 889
 890 def bytes_to_intlist(bs):
 891     if not bs:
 892         return []
 893     if isinstance(bs[0], int):  # Python 3
 894         return list(bs)
 895     else:
 896         return [ord(c) for c in bs]
 897
 898
 899 def intlist_to_bytes(xs):
 900     if not xs:
 901         return b''
 902     if isinstance(chr(0), bytes):  # Python 2
 903         return ''.join([chr(x) for x in xs])
 904     else:
 905         return bytes(xs)
 906
 907
 908 def get_cachedir(params={}):
 909     cache_root = os.environ.get('XDG_CACHE_HOME',
 910                                 os.path.expanduser('~/.cache'))
 911     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
 912
 913
# Cross-platform file locking
#
# Both branches define the same two private helpers:
#   _lock_file(f, exclusive)  -- lock the open file object f
#   _unlock_file(f)           -- release a lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ctypes layout of the structure passed by pointer as the last
        argument of LockFileEx/UnlockFileEx."""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    # Declare argument and return types so ctypes converts the values
    # correctly when calling into kernel32.
    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count handed to (Un)LockFileEx; starting
    # at offset 0 this is presumably meant to cover the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f via LockFileEx; raise OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Remember the structure on the file object: _unlock_file passes the
        # very same pointer to UnlockFileEx.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock and 0x0 otherwise
        # (NOTE(review): 0x2 should be LOCKFILE_EXCLUSIVE_LOCK -- confirm
        # against the Win32 documentation).
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raise OSError on failure."""
        # Only files previously passed to _lock_file carry this attribute
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f with fcntl.lockf (LOCK_EX if exclusive,
        LOCK_SH otherwise)."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the fcntl.lockf lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
 977
 978
 979 class locked_file(object):
 980     def __init__(self, filename, mode, encoding=None):
 981         assert mode in ['r', 'a', 'w']
 982         self.f = io.open(filename, mode, encoding=encoding)
 983         self.mode = mode
 984
 985     def __enter__(self):
 986         exclusive = self.mode != 'r'
 987         try:
 988             _lock_file(self.f, exclusive)
 989         except IOError:
 990             self.f.close()
 991             raise
 992         return self
 993
 994     def __exit__(self, etype, value, traceback):
 995         try:
 996             _unlock_file(self.f)
 997         finally:
 998             self.f.close()
 999
1000     def __iter__(self):
1001         return iter(self.f)
1002
1003     def write(self, *args):
1004         return self.f.write(*args)
1005
1006     def read(self, *args):
1007         return self.f.read(*args)
1008
1009
def shell_quote(args):
    """ Return the argument list as a single shell-escaped command line.

    args is an iterable of text or byte strings; byte strings are decoded
    with the filesystem encoding (UTF-8 if that is unknown). Each argument
    is quoted for a POSIX shell and the results are joined with spaces. """
    # pipes.quote was never documented and the pipes module is gone in
    # Python 3.13; shlex.quote is the supported spelling where it exists.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1021
1022
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ended the
        run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1030
1031
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON, URL-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'. """
    payload = json.dumps(data)
    encoded = compat_urllib_parse.urlencode({u'__youtubedl_smuggle': payload})
    return u'#'.join((url, encoded))
1038
1039
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    URLs that carry no smuggled data are returned unchanged together with
    default. """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    fields = compat_parse_qs(fragment)
    return url, json.loads(fields[u'__youtubedl_smuggle'][0])
1047
1048
def format_bytes(bytes):
    """ Format a byte count as a human-readable string such as u'1.50KiB'.

    bytes may be a number, a numeric string or None (rendered as u'N/A').
    Binary prefixes (powers of 1024) are used, from B up to YiB; larger
    values are still expressed in YiB and positive values below one byte
    in B. Negative values raise ValueError (from math.log). """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    # Keep the index inside the suffix table: fractions give a negative
    # exponent (which would silently select u'YiB') and values of 1024**9
    # or more would run off the end of the list.
    exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1061
1062
def str_to_int(int_str):
    """ Parse an integer written with ',' or '.' as digit-group separators,
    e.g. u'1,234.567' -> 1234567 """
    return int(re.sub(r'[,\.]', u'', int_str))
1066
1067
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the output of `stty size` is consulted. Detection is
    best-effort: any failure of the stty call yields None. """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # Garbage in COLUMNS should not crash the caller; ask stty.
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        # `stty size` prints "<rows> <columns>"
        return int(out.split()[1])
    except Exception:
        # Deliberately broad (stty missing, no tty, odd output), but unlike
        # a bare except this lets KeyboardInterrupt/SystemExit through.
        pass
    return None
1082
1083
def month_by_name(name):
    """ Return the number (1-12) of a month given its full English name,
    independently of the locale; None if the name is not recognized """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1094
1095
def fix_xml_ampersands(xml_str):
    """Replace every '&' by '&amp;' in XML, except where it already starts
    one of the predefined entities or a numeric character reference"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub(u'&amp;', xml_str)
1102
1103
def setproctitle(title):
    """ Set the process name shown by tools like ps/top, where supported.

    title must be a compat_str. The call is best-effort: it silently does
    nothing when libc.so.6 cannot be loaded or offers no prctl function. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer by the encoded length: a non-ASCII title needs more
    # bytes than it has characters, and assigning an oversized value to a
    # ctypes buffer raises ValueError.
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 is presumably PR_SET_NAME (see prctl(2)) -- confirm
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1117
1118
def remove_start(s, start):
    """ Return s without the prefix start; s is returned unchanged when it
    does not begin with start """
    return s[len(start):] if s.startswith(start) else s
1123
1124
def url_basename(url):
    """ Return the last segment of the path component of url, ignoring
    leading and trailing slashes (u'' when the path is empty) """
    parsed = compat_urlparse.urlparse(url)
    trimmed = parsed.path.strip(u'/')
    return trimmed.rpartition(u'/')[2]
1128
1129
class HEADRequest(compat_urllib_request.Request):
    """ Request whose get_method() always reports the HTTP verb HEAD """

    def get_method(self):
        return 'HEAD'
1133
1134
def int_or_none(v, scale=1):
    """ Convert v to an int and floor-divide it by scale; None is passed
    through unchanged """
    if v is None:
        return None
    return int(v) // scale
1137
1138
def parse_duration(s):
    """ Parse a duration written as [[hours:]mins:]secs into a number of
    seconds; returns None for None or for strings in any other format """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)$', s)
    if not m:
        return None
    hours, mins, secs = m.group('hours', 'mins', 'secs')
    total = int(secs)
    if mins:
        total += 60 * int(mins)
        # The pattern only allows an hours field in front of a minutes field
        if hours:
            total += 60 * 60 * int(hours)
    return total
1153
1154
def prepend_extension(filename, ext):
    """ Insert ext in front of the real extension of filename,
    e.g. (u'video.mp4', u'temp') -> u'video.temp.mp4' """
    parts = os.path.splitext(filename)
    stem, real_ext = parts
    return u'{0}.{1}{2}'.format(stem, ext, real_ext)
1158
1159
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a sequence of arguments for a short output (like -version).

    The binary is actually run (with its output discarded); False is
    returned if it cannot be started. """
    # None instead of a mutable [] default; list() also accepts tuples
    cmd = [exe] + list(args or [])
    try:
        subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1168
1169
class PagedList(object):
    """ List-like view over results that are fetched one page at a time.

    pagefunc(pagenum) must return an iterable with the entries of the given
    zero-based page; every page except the last one is expected to hold
    exactly pagesize entries. """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices start <= i < end as a list
        (through the last entry when end is None), fetching only the pages
        that are needed """
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_limit = page_first + size
            if start >= page_limit:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            if page_first <= start < page_limit:
                lo = start % size
            else:
                lo = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and page_first <= end <= page_limit:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if the current page is not "full",
            # i.e. holds fewer than pagesize entries, it has to be the last
            # one, so there is no need to query any further pages.
            if len(entries) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_limit:
                break
        return collected