_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import calendar
   5 import codecs
   6 import contextlib
   7 import ctypes
   8 import datetime
   9 import email.utils
  10 import errno
  11 import getpass
  12 import gzip
  13 import itertools
  14 import io
  15 import json
  16 import locale
  17 import math
  18 import os
  19 import pipes
  20 import platform
  21 import re
  22 import ssl
  23 import socket
  24 import struct
  25 import subprocess
  26 import sys
  27 import tempfile
  28 import traceback
  29 import xml.etree.ElementTree
  30 import zlib
  31
  32 try:
  33     import urllib.request as compat_urllib_request
  34 except ImportError: # Python 2
  35     import urllib2 as compat_urllib_request
  36
  37 try:
  38     import urllib.error as compat_urllib_error
  39 except ImportError: # Python 2
  40     import urllib2 as compat_urllib_error
  41
  42 try:
  43     import urllib.parse as compat_urllib_parse
  44 except ImportError: # Python 2
  45     import urllib as compat_urllib_parse
  46
  47 try:
  48     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  49 except ImportError: # Python 2
  50     from urlparse import urlparse as compat_urllib_parse_urlparse
  51
  52 try:
  53     import urllib.parse as compat_urlparse
  54 except ImportError: # Python 2
  55     import urlparse as compat_urlparse
  56
  57 try:
  58     import http.cookiejar as compat_cookiejar
  59 except ImportError: # Python 2
  60     import cookielib as compat_cookiejar
  61
  62 try:
  63     import html.entities as compat_html_entities
  64 except ImportError: # Python 2
  65     import htmlentitydefs as compat_html_entities
  66
  67 try:
  68     import html.parser as compat_html_parser
  69 except ImportError: # Python 2
  70     import HTMLParser as compat_html_parser
  71
  72 try:
  73     import http.client as compat_http_client
  74 except ImportError: # Python 2
  75     import httplib as compat_http_client
  76
  77 try:
  78     from urllib.error import HTTPError as compat_HTTPError
  79 except ImportError:  # Python 2
  80     from urllib2 import HTTPError as compat_HTTPError
  81
  82 try:
  83     from urllib.request import urlretrieve as compat_urlretrieve
  84 except ImportError:  # Python 2
  85     from urllib import urlretrieve as compat_urlretrieve
  86
  87
  88 try:
  89     from subprocess import DEVNULL
  90     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  91 except ImportError:
  92     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  93
  94 try:
  95     from urllib.parse import unquote as compat_urllib_parse_unquote
  96 except ImportError:
  97     def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
  98         if string == '':
  99             return string
 100         res = string.split('%')
 101         if len(res) == 1:
 102             return string
 103         if encoding is None:
 104             encoding = 'utf-8'
 105         if errors is None:
 106             errors = 'replace'
 107         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
 108         pct_sequence = b''
 109         string = res[0]
 110         for item in res[1:]:
 111             try:
 112                 if not item:
 113                     raise ValueError
 114                 pct_sequence += item[:2].decode('hex')
 115                 rest = item[2:]
 116                 if not rest:
 117                     # This segment was just a single percent-encoded character.
 118                     # May be part of a sequence of code units, so delay decoding.
 119                     # (Stored in pct_sequence).
 120                     continue
 121             except ValueError:
 122                 rest = '%' + item
 123             # Encountered non-percent-encoded characters. Flush the current
 124             # pct_sequence.
 125             string += pct_sequence.decode(encoding, errors) + rest
 126             pct_sequence = b''
 127         if pct_sequence:
 128             # Flush the final pct_sequence
 129             string += pct_sequence.decode(encoding, errors)
 130         return string
 131
 132
 133 try:
 134     from urllib.parse import parse_qs as compat_parse_qs
 135 except ImportError: # Python 2
 136     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
 137     # Python 2's version is apparently totally broken
 138
 139     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 140                 encoding='utf-8', errors='replace'):
 141         qs, _coerce_result = qs, unicode
 142         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 143         r = []
 144         for name_value in pairs:
 145             if not name_value and not strict_parsing:
 146                 continue
 147             nv = name_value.split('=', 1)
 148             if len(nv) != 2:
 149                 if strict_parsing:
 150                     raise ValueError("bad query field: %r" % (name_value,))
 151                 # Handle case of a control-name with no equal sign
 152                 if keep_blank_values:
 153                     nv.append('')
 154                 else:
 155                     continue
 156             if len(nv[1]) or keep_blank_values:
 157                 name = nv[0].replace('+', ' ')
 158                 name = compat_urllib_parse_unquote(
 159                     name, encoding=encoding, errors=errors)
 160                 name = _coerce_result(name)
 161                 value = nv[1].replace('+', ' ')
 162                 value = compat_urllib_parse_unquote(
 163                     value, encoding=encoding, errors=errors)
 164                 value = _coerce_result(value)
 165                 r.append((name, value))
 166         return r
 167
 168     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 169                 encoding='utf-8', errors='replace'):
 170         parsed_result = {}
 171         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 172                         encoding=encoding, errors=errors)
 173         for name, value in pairs:
 174             if name in parsed_result:
 175                 parsed_result[name].append(value)
 176             else:
 177                 parsed_result[name] = [value]
 178         return parsed_result
 179
 180 try:
 181     compat_str = unicode # Python 2
 182 except NameError:
 183     compat_str = str
 184
 185 try:
 186     compat_chr = unichr # Python 2
 187 except NameError:
 188     compat_chr = chr
 189
 190 try:
 191     from xml.etree.ElementTree import ParseError as compat_xml_parse_error
 192 except ImportError:  # Python 2.6
 193     from xml.parsers.expat import ExpatError as compat_xml_parse_error
 194
 195 try:
 196     from shlex import quote as shlex_quote
 197 except ImportError:  # Python < 3.3
 198     def shlex_quote(s):
 199         return "'" + s.replace("'", "'\"'\"'") + "'"
 200
 201
 202 def compat_ord(c):
 203     if type(c) is int: return c
 204     else: return ord(c)
 205
 206
 207 if sys.version_info >= (3, 0):
 208     compat_getenv = os.getenv
 209     compat_expanduser = os.path.expanduser
 210 else:
 211     # Environment variables should be decoded with filesystem encoding.
 212     # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918)
 213
 214     def compat_getenv(key, default=None):
 215         env = os.getenv(key, default)
 216         if env:
 217             env = env.decode(get_filesystem_encoding())
 218         return env
 219
 220     # HACK: The default implementations of os.path.expanduser from cpython do not decode
 221     # environment variables with filesystem encoding. We will work around this by
 222     # providing adjusted implementations.
 223     # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib
 224     # for different platforms with correct environment variables decoding.
 225
 226     if os.name == 'posix':
 227         def compat_expanduser(path):
 228             """Expand ~ and ~user constructions.  If user or $HOME is unknown,
 229             do nothing."""
 230             if not path.startswith('~'):
 231                 return path
 232             i = path.find('/', 1)
 233             if i < 0:
 234                 i = len(path)
 235             if i == 1:
 236                 if 'HOME' not in os.environ:
 237                     import pwd
 238                     userhome = pwd.getpwuid(os.getuid()).pw_dir
 239                 else:
 240                     userhome = compat_getenv('HOME')
 241             else:
 242                 import pwd
 243                 try:
 244                     pwent = pwd.getpwnam(path[1:i])
 245                 except KeyError:
 246                     return path
 247                 userhome = pwent.pw_dir
 248             userhome = userhome.rstrip('/')
 249             return (userhome + path[i:]) or '/'
 250     elif os.name == 'nt' or os.name == 'ce':
 251         def compat_expanduser(path):
 252             """Expand ~ and ~user constructs.
 253
 254             If user or $HOME is unknown, do nothing."""
 255             if path[:1] != '~':
 256                 return path
 257             i, n = 1, len(path)
 258             while i < n and path[i] not in '/\\':
 259                 i = i + 1
 260
 261             if 'HOME' in os.environ:
 262                 userhome = compat_getenv('HOME')
 263             elif 'USERPROFILE' in os.environ:
 264                 userhome = compat_getenv('USERPROFILE')
 265             elif not 'HOMEPATH' in os.environ:
 266                 return path
 267             else:
 268                 try:
 269                     drive = compat_getenv('HOMEDRIVE')
 270                 except KeyError:
 271                     drive = ''
 272                 userhome = os.path.join(drive, compat_getenv('HOMEPATH'))
 273
 274             if i != 1: #~user
 275                 userhome = os.path.join(os.path.dirname(userhome), path[1:i])
 276
 277             return userhome + path[i:]
 278     else:
 279         compat_expanduser = os.path.expanduser
 280
 281
 282 # This is not clearly defined otherwise
 283 compiled_regex_type = type(re.compile(''))
 284
 285 std_headers = {
 286     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
 287     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
 288     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 289     'Accept-Encoding': 'gzip, deflate',
 290     'Accept-Language': 'en-us,en;q=0.5',
 291 }
 292
 293 def preferredencoding():
 294     """Get preferred encoding.
 295
 296     Returns the best encoding scheme for the system, based on
 297     locale.getpreferredencoding() and some further tweaks.
 298     """
 299     try:
 300         pref = locale.getpreferredencoding()
 301         u'TEST'.encode(pref)
 302     except:
 303         pref = 'UTF-8'
 304
 305     return pref
 306
 307 if sys.version_info < (3,0):
 308     def compat_print(s):
 309         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 310 else:
 311     def compat_print(s):
 312         assert type(s) == type(u'')
 313         print(s)
 314
 315
 316 def write_json_file(obj, fn):
 317     """ Encode obj as JSON and write it to fn, atomically """
 318
 319     args = {
 320         'suffix': '.tmp',
 321         'prefix': os.path.basename(fn) + '.',
 322         'dir': os.path.dirname(fn),
 323         'delete': False,
 324     }
 325
 326     # In Python 2.x, json.dump expects a bytestream.
 327     # In Python 3.x, it writes to a character stream
 328     if sys.version_info < (3, 0):
 329         args['mode'] = 'wb'
 330     else:
 331         args.update({
 332             'mode': 'w',
 333             'encoding': 'utf-8',
 334         })
 335
 336     tf = tempfile.NamedTemporaryFile(**args)
 337
 338     try:
 339         with tf:
 340             json.dump(obj, tf)
 341         os.rename(tf.name, fn)
 342     except:
 343         try:
 344             os.remove(tf.name)
 345         except OSError:
 346             pass
 347         raise
 348
 349
 350 if sys.version_info >= (2, 7):
 351     def find_xpath_attr(node, xpath, key, val):
 352         """ Find the xpath xpath[@key=val] """
 353         assert re.match(r'^[a-zA-Z-]+$', key)
 354         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 355         expr = xpath + u"[@%s='%s']" % (key, val)
 356         return node.find(expr)
 357 else:
 358     def find_xpath_attr(node, xpath, key, val):
 359         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 360         # .//node does not match if a node is a direct child of . !
 361         if isinstance(xpath, unicode):
 362             xpath = xpath.encode('ascii')
 363
 364         for f in node.findall(xpath):
 365             if f.attrib.get(key) == val:
 366                 return f
 367         return None
 368
 369 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 370 # the namespace parameter
 371 def xpath_with_ns(path, ns_map):
 372     components = [c.split(':') for c in path.split('/')]
 373     replaced = []
 374     for c in components:
 375         if len(c) == 1:
 376             replaced.append(c[0])
 377         else:
 378             ns, tag = c
 379             replaced.append('{%s}%s' % (ns_map[ns], tag))
 380     return '/'.join(replaced)
 381
 382
 383 def xpath_text(node, xpath, name=None, fatal=False):
 384     if sys.version_info < (2, 7):  # Crazy 2.6
 385         xpath = xpath.encode('ascii')
 386
 387     n = node.find(xpath)
 388     if n is None:
 389         if fatal:
 390             name = xpath if name is None else name
 391             raise ExtractorError('Could not find XML element %s' % name)
 392         else:
 393             return None
 394     return n.text
 395
 396
 397 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 398 class BaseHTMLParser(compat_html_parser.HTMLParser):
 399     def __init(self):
 400         compat_html_parser.HTMLParser.__init__(self)
 401         self.html = None
 402
 403     def loads(self, html):
 404         self.html = html
 405         self.feed(html)
 406         self.close()
 407
 408 class AttrParser(BaseHTMLParser):
 409     """Modified HTMLParser that isolates a tag with the specified attribute"""
 410     def __init__(self, attribute, value):
 411         self.attribute = attribute
 412         self.value = value
 413         self.result = None
 414         self.started = False
 415         self.depth = {}
 416         self.watch_startpos = False
 417         self.error_count = 0
 418         BaseHTMLParser.__init__(self)
 419
 420     def error(self, message):
 421         if self.error_count > 10 or self.started:
 422             raise compat_html_parser.HTMLParseError(message, self.getpos())
 423         self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 424         self.error_count += 1
 425         self.goahead(1)
 426
 427     def handle_starttag(self, tag, attrs):
 428         attrs = dict(attrs)
 429         if self.started:
 430             self.find_startpos(None)
 431         if self.attribute in attrs and attrs[self.attribute] == self.value:
 432             self.result = [tag]
 433             self.started = True
 434             self.watch_startpos = True
 435         if self.started:
 436             if not tag in self.depth: self.depth[tag] = 0
 437             self.depth[tag] += 1
 438
 439     def handle_endtag(self, tag):
 440         if self.started:
 441             if tag in self.depth: self.depth[tag] -= 1
 442             if self.depth[self.result[0]] == 0:
 443                 self.started = False
 444                 self.result.append(self.getpos())
 445
 446     def find_startpos(self, x):
 447         """Needed to put the start position of the result (self.result[1])
 448         after the opening tag with the requested id"""
 449         if self.watch_startpos:
 450             self.watch_startpos = False
 451             self.result.append(self.getpos())
 452     handle_entityref = handle_charref = handle_data = handle_comment = \
 453     handle_decl = handle_pi = unknown_decl = find_startpos
 454
 455     def get_result(self):
 456         if self.result is None:
 457             return None
 458         if len(self.result) != 3:
 459             return None
 460         lines = self.html.split('\n')
 461         lines = lines[self.result[1][0]-1:self.result[2][0]]
 462         lines[0] = lines[0][self.result[1][1]:]
 463         if len(lines) == 1:
 464             lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 465         lines[-1] = lines[-1][:self.result[2][1]]
 466         return '\n'.join(lines).strip()
 467 # Hack for https://github.com/rg3/youtube-dl/issues/662
 468 if sys.version_info < (2, 7, 3):
 469     AttrParser.parse_endtag = (lambda self, i:
 470         i + len("</scr'+'ipt>")
 471         if self.rawdata[i:].startswith("</scr'+'ipt>")
 472         else compat_html_parser.HTMLParser.parse_endtag(self, i))
 473
 474 def get_element_by_id(id, html):
 475     """Return the content of the tag with the specified ID in the passed HTML document"""
 476     return get_element_by_attribute("id", id, html)
 477
 478 def get_element_by_attribute(attribute, value, html):
 479     """Return the content of the tag with the specified attribute in the passed HTML document"""
 480     parser = AttrParser(attribute, value)
 481     try:
 482         parser.loads(html)
 483     except compat_html_parser.HTMLParseError:
 484         pass
 485     return parser.get_result()
 486
 487 class MetaParser(BaseHTMLParser):
 488     """
 489     Modified HTMLParser that isolates a meta tag with the specified name
 490     attribute.
 491     """
 492     def __init__(self, name):
 493         BaseHTMLParser.__init__(self)
 494         self.name = name
 495         self.content = None
 496         self.result = None
 497
 498     def handle_starttag(self, tag, attrs):
 499         if tag != 'meta':
 500             return
 501         attrs = dict(attrs)
 502         if attrs.get('name') == self.name:
 503             self.result = attrs.get('content')
 504
 505     def get_result(self):
 506         return self.result
 507
 508 def get_meta_content(name, html):
 509     """
 510     Return the content attribute from the meta tag with the given name attribute.
 511     """
 512     parser = MetaParser(name)
 513     try:
 514         parser.loads(html)
 515     except compat_html_parser.HTMLParseError:
 516         pass
 517     return parser.get_result()
 518
 519
 520 def clean_html(html):
 521     """Clean an HTML snippet into a readable string"""
 522     # Newline vs <br />
 523     html = html.replace('\n', ' ')
 524     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 525     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 526     # Strip html tags
 527     html = re.sub('<.*?>', '', html)
 528     # Replace html entities
 529     html = unescapeHTML(html)
 530     return html.strip()
 531
 532
 533 def sanitize_open(filename, open_mode):
 534     """Try to open the given filename, and slightly tweak it if this fails.
 535
 536     Attempts to open the given filename. If this fails, it tries to change
 537     the filename slightly, step by step, until it's either able to open it
 538     or it fails and raises a final exception, like the standard open()
 539     function.
 540
 541     It returns the tuple (stream, definitive_file_name).
 542     """
 543     try:
 544         if filename == u'-':
 545             if sys.platform == 'win32':
 546                 import msvcrt
 547                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 548             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 549         stream = open(encodeFilename(filename), open_mode)
 550         return (stream, filename)
 551     except (IOError, OSError) as err:
 552         if err.errno in (errno.EACCES,):
 553             raise
 554
 555         # In case of error, try to remove win32 forbidden chars
 556         alt_filename = os.path.join(
 557                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 558                         for path_part in os.path.split(filename)
 559                        )
 560         if alt_filename == filename:
 561             raise
 562         else:
 563             # An exception here should be caught in the caller
 564             stream = open(encodeFilename(filename), open_mode)
 565             return (stream, alt_filename)
 566
 567
 568 def timeconvert(timestr):
 569     """Convert RFC 2822 defined time string into system timestamp"""
 570     timestamp = None
 571     timetuple = email.utils.parsedate_tz(timestr)
 572     if timetuple is not None:
 573         timestamp = email.utils.mktime_tz(timetuple)
 574     return timestamp
 575
 576 def sanitize_filename(s, restricted=False, is_id=False):
 577     """Sanitizes a string so it could be used as part of a filename.
 578     If restricted is set, use a stricter subset of allowed characters.
 579     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 580     """
 581     def replace_insane(char):
 582         if char == '?' or ord(char) < 32 or ord(char) == 127:
 583             return ''
 584         elif char == '"':
 585             return '' if restricted else '\''
 586         elif char == ':':
 587             return '_-' if restricted else ' -'
 588         elif char in '\\/|*<>':
 589             return '_'
 590         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 591             return '_'
 592         if restricted and ord(char) > 127:
 593             return '_'
 594         return char
 595
 596     result = u''.join(map(replace_insane, s))
 597     if not is_id:
 598         while '__' in result:
 599             result = result.replace('__', '_')
 600         result = result.strip('_')
 601         # Common case of "Foreign band name - English song title"
 602         if restricted and result.startswith('-_'):
 603             result = result[2:]
 604         if not result:
 605             result = '_'
 606     return result
 607
 608 def orderedSet(iterable):
 609     """ Remove all duplicates from the input iterable """
 610     res = []
 611     for el in iterable:
 612         if el not in res:
 613             res.append(el)
 614     return res
 615
 616
 617 def _htmlentity_transform(entity):
 618     """Transforms an HTML entity to a character."""
 619     # Known non-numeric HTML entity
 620     if entity in compat_html_entities.name2codepoint:
 621         return compat_chr(compat_html_entities.name2codepoint[entity])
 622
 623     mobj = re.match(r'#(x?[0-9]+)', entity)
 624     if mobj is not None:
 625         numstr = mobj.group(1)
 626         if numstr.startswith(u'x'):
 627             base = 16
 628             numstr = u'0%s' % numstr
 629         else:
 630             base = 10
 631         return compat_chr(int(numstr, base))
 632
 633     # Unknown entity in name, return its literal representation
 634     return (u'&%s;' % entity)
 635
 636
 637 def unescapeHTML(s):
 638     if s is None:
 639         return None
 640     assert type(s) == compat_str
 641
 642     return re.sub(
 643         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 644
 645
 646 def encodeFilename(s, for_subprocess=False):
 647     """
 648     @param s The name of the file
 649     """
 650
 651     assert type(s) == compat_str
 652
 653     # Python 3 has a Unicode API
 654     if sys.version_info >= (3, 0):
 655         return s
 656
 657     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 658         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 659         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 660         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 661         if not for_subprocess:
 662             return s
 663         else:
 664             # For subprocess calls, encode with locale encoding
 665             # Refer to http://stackoverflow.com/a/9951851/35070
 666             encoding = preferredencoding()
 667     else:
 668         encoding = sys.getfilesystemencoding()
 669     if encoding is None:
 670         encoding = 'utf-8'
 671     return s.encode(encoding, 'ignore')
 672
 673
 674 def encodeArgument(s):
 675     if not isinstance(s, compat_str):
 676         # Legacy code that uses byte strings
 677         # Uncomment the following line after fixing all post processors
 678         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 679         s = s.decode('ascii')
 680     return encodeFilename(s, True)
 681
 682
 683 def decodeOption(optval):
 684     if optval is None:
 685         return optval
 686     if isinstance(optval, bytes):
 687         optval = optval.decode(preferredencoding())
 688
 689     assert isinstance(optval, compat_str)
 690     return optval
 691
 692 def formatSeconds(secs):
 693     if secs > 3600:
 694         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 695     elif secs > 60:
 696         return '%d:%02d' % (secs // 60, secs % 60)
 697     else:
 698         return '%d' % secs
 699
 700
 701 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 702     if sys.version_info < (3, 2):
 703         import httplib
 704
 705         class HTTPSConnectionV3(httplib.HTTPSConnection):
 706             def __init__(self, *args, **kwargs):
 707                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 708
 709             def connect(self):
 710                 sock = socket.create_connection((self.host, self.port), self.timeout)
 711                 if getattr(self, '_tunnel_host', False):
 712                     self.sock = sock
 713                     self._tunnel()
 714                 try:
 715                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 716                 except ssl.SSLError:
 717                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 718
 719         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 720             def https_open(self, req):
 721                 return self.do_open(HTTPSConnectionV3, req)
 722         return HTTPSHandlerV3(**kwargs)
 723     elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
 724         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 725         context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
 726         if opts_no_check_certificate:
 727             context.verify_mode = ssl.CERT_NONE
 728         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 729     else:  # Python < 3.4
 730         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 731         context.verify_mode = (ssl.CERT_NONE
 732                                if opts_no_check_certificate
 733                                else ssl.CERT_REQUIRED)
 734         context.set_default_verify_paths()
 735         try:
 736             context.load_default_certs()
 737         except AttributeError:
 738             pass  # Python < 3.4
 739         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 740
 741 class ExtractorError(Exception):
 742     """Error during info extraction."""
 743     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 744         """ tb, if given, is the original traceback (so that it can be printed out).
 745         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 746         """
 747
 748         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 749             expected = True
 750         if video_id is not None:
 751             msg = video_id + ': ' + msg
 752         if cause:
 753             msg += u' (caused by %r)' % cause
 754         if not expected:
 755             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 756         super(ExtractorError, self).__init__(msg)
 757
 758         self.traceback = tb
 759         self.exc_info = sys.exc_info()  # preserve original exception
 760         self.cause = cause
 761         self.video_id = video_id
 762
 763     def format_traceback(self):
 764         if self.traceback is None:
 765             return None
 766         return u''.join(traceback.format_tb(self.traceback))
 767
 768
 769 class RegexNotFoundError(ExtractorError):
 770     """Error when a regex didn't match"""
 771     pass
 772
 773
 774 class DownloadError(Exception):
 775     """Download Error exception.
 776
 777     This exception may be thrown by FileDownloader objects if they are not
 778     configured to continue on errors. They will contain the appropriate
 779     error message.
 780     """
 781     def __init__(self, msg, exc_info=None):
 782         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 783         super(DownloadError, self).__init__(msg)
 784         self.exc_info = exc_info
 785
 786
 787 class SameFileError(Exception):
 788     """Same File exception.
 789
 790     This exception will be thrown by FileDownloader objects if they detect
 791     multiple files would have to be downloaded to the same file on disk.
 792     """
 793     pass
 794
 795
 796 class PostProcessingError(Exception):
 797     """Post Processing exception.
 798
 799     This exception may be raised by PostProcessor's .run() method to
 800     indicate an error in the postprocessing task.
 801     """
 802     def __init__(self, msg):
 803         self.msg = msg
 804
 805 class MaxDownloadsReached(Exception):
 806     """ --max-downloads limit has been reached. """
 807     pass
 808
 809
 810 class UnavailableVideoError(Exception):
 811     """Unavailable Format exception.
 812
 813     This exception will be thrown when a video is requested
 814     in a format that is not available for that video.
 815     """
 816     pass
 817
 818
 819 class ContentTooShortError(Exception):
 820     """Content Too Short exception.
 821
 822     This exception may be raised by FileDownloader objects when a file they
 823     download is too small for what the server announced first, indicating
 824     the connection was probably interrupted.
 825     """
 826     # Both in bytes
 827     downloaded = None
 828     expected = None
 829
 830     def __init__(self, downloaded, expected):
 831         self.downloaded = downloaded
 832         self.expected = expected
 833
 834 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 835     """Handler for HTTP requests and responses.
 836
 837     This class, when installed with an OpenerDirector, automatically adds
 838     the standard headers to every HTTP request and handles gzipped and
 839     deflated responses from web servers. If compression is to be avoided in
 840     a particular request, the original request in the program code only has
 841     to include the HTTP header "Youtubedl-No-Compression", which will be
 842     removed before making the real request.
 843
 844     Part of this code was copied from:
 845
 846     http://techknack.net/python-urllib2-handlers/
 847
 848     Andrew Rowls, the author of that code, agreed to release it to the
 849     public domain.
 850     """
 851
 852     @staticmethod
 853     def deflate(data):
 854         try:
 855             return zlib.decompress(data, -zlib.MAX_WBITS)
 856         except zlib.error:
 857             return zlib.decompress(data)
 858
 859     @staticmethod
 860     def addinfourl_wrapper(stream, headers, url, code):
 861         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 862             return compat_urllib_request.addinfourl(stream, headers, url, code)
 863         ret = compat_urllib_request.addinfourl(stream, headers, url)
 864         ret.code = code
 865         return ret
 866
 867     def http_request(self, req):
 868         for h, v in std_headers.items():
 869             if h not in req.headers:
 870                 req.add_header(h, v)
 871         if 'Youtubedl-no-compression' in req.headers:
 872             if 'Accept-encoding' in req.headers:
 873                 del req.headers['Accept-encoding']
 874             del req.headers['Youtubedl-no-compression']
 875         if 'Youtubedl-user-agent' in req.headers:
 876             if 'User-agent' in req.headers:
 877                 del req.headers['User-agent']
 878             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 879             del req.headers['Youtubedl-user-agent']
 880
 881         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 882             # Python 2.6 is brain-dead when it comes to fragments
 883             req._Request__original = req._Request__original.partition('#')[0]
 884             req._Request__r_type = req._Request__r_type.partition('#')[0]
 885
 886         return req
 887
 888     def http_response(self, req, resp):
 889         old_resp = resp
 890         # gzip
 891         if resp.headers.get('Content-encoding', '') == 'gzip':
 892             content = resp.read()
 893             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 894             try:
 895                 uncompressed = io.BytesIO(gz.read())
 896             except IOError as original_ioerror:
 897                 # There may be junk add the end of the file
 898                 # See http://stackoverflow.com/q/4928560/35070 for details
 899                 for i in range(1, 1024):
 900                     try:
 901                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 902                         uncompressed = io.BytesIO(gz.read())
 903                     except IOError:
 904                         continue
 905                     break
 906                 else:
 907                     raise original_ioerror
 908             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 909             resp.msg = old_resp.msg
 910         # deflate
 911         if resp.headers.get('Content-encoding', '') == 'deflate':
 912             gz = io.BytesIO(self.deflate(resp.read()))
 913             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 914             resp.msg = old_resp.msg
 915         return resp
 916
 917     https_request = http_request
 918     https_response = http_response
 919
 920
 921 def parse_iso8601(date_str, delimiter='T'):
 922     """ Return a UNIX timestamp from the given date """
 923
 924     if date_str is None:
 925         return None
 926
 927     m = re.search(
 928         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
 929         date_str)
 930     if not m:
 931         timezone = datetime.timedelta()
 932     else:
 933         date_str = date_str[:-len(m.group(0))]
 934         if not m.group('sign'):
 935             timezone = datetime.timedelta()
 936         else:
 937             sign = 1 if m.group('sign') == '+' else -1
 938             timezone = datetime.timedelta(
 939                 hours=sign * int(m.group('hours')),
 940                 minutes=sign * int(m.group('minutes')))
 941     date_format =  '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 942     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 943     return calendar.timegm(dt.timetuple())
 944
 945
 946 def unified_strdate(date_str):
 947     """Return a string with the date in the format YYYYMMDD"""
 948
 949     if date_str is None:
 950         return None
 951
 952     upload_date = None
 953     #Replace commas
 954     date_str = date_str.replace(',', ' ')
 955     # %z (UTC offset) is only supported in python>=3.2
 956     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 957     format_expressions = [
 958         '%d %B %Y',
 959         '%d %b %Y',
 960         '%B %d %Y',
 961         '%b %d %Y',
 962         '%b %dst %Y %I:%M%p',
 963         '%b %dnd %Y %I:%M%p',
 964         '%b %dth %Y %I:%M%p',
 965         '%Y-%m-%d',
 966         '%Y/%m/%d',
 967         '%d.%m.%Y',
 968         '%d/%m/%Y',
 969         '%d/%m/%y',
 970         '%Y/%m/%d %H:%M:%S',
 971         '%d/%m/%Y %H:%M:%S',
 972         '%Y-%m-%d %H:%M:%S',
 973         '%Y-%m-%d %H:%M:%S.%f',
 974         '%d.%m.%Y %H:%M',
 975         '%d.%m.%Y %H.%M',
 976         '%Y-%m-%dT%H:%M:%SZ',
 977         '%Y-%m-%dT%H:%M:%S.%fZ',
 978         '%Y-%m-%dT%H:%M:%S.%f0Z',
 979         '%Y-%m-%dT%H:%M:%S',
 980         '%Y-%m-%dT%H:%M:%S.%f',
 981         '%Y-%m-%dT%H:%M',
 982     ]
 983     for expression in format_expressions:
 984         try:
 985             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 986         except ValueError:
 987             pass
 988     if upload_date is None:
 989         timetuple = email.utils.parsedate_tz(date_str)
 990         if timetuple:
 991             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 992     return upload_date
 993
 994 def determine_ext(url, default_ext=u'unknown_video'):
 995     if url is None:
 996         return default_ext
 997     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 998     if re.match(r'^[A-Za-z0-9]+$', guess):
 999         return guess
1000     else:
1001         return default_ext
1002
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join((stem, sub_lang, sub_format))
1005
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if the string matches neither form."""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no month/year units, so approximate them in days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
1031
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1040
class DateRange(object):
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound defaults to the earliest/latest
        representable date. Raises ValueError if start is after end."""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date object or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1066
1067
def platform_name():
    """ Returns the platform name as a compat_str

    Byte strings reported by platform.platform() are decoded with the
    preferred encoding first. """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1076
1077
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the WriteConsoleW function of kernel32, which is
    only used when out is file descriptor 1 or 2 and the corresponding
    standard handle refers to a console. Raises OSError when WriteConsoleW
    reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument passed to GetStdHandle
    # (-11 / -12 are STD_OUTPUT_HANDLE / STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for missing/invalid handles, for handles whose file type
        # (ignoring the REMOTE bit) is not a character device, and for
        # handles on which GetConsoleMode fails.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters, stopping before any non-BMP one
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; a length of 2
        # is passed for it (presumably its two UTF-16 code units)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever portion the console accepted
            s = s[written.value:]
    return True
1148
1149
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no explicit encoding is given, the console API is
    tried first. Otherwise s is encoded (ignoring unencodable characters)
    for binary streams and for streams exposing a .buffer, and written
    as-is to anything else.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    console_candidate = (
        sys.platform == 'win32' and encoding is None
        and hasattr(out, 'fileno'))
    if console_candidate and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        stream_encoding = (
            encoding or getattr(out, 'encoding', None) or preferredencoding())
        out.buffer.write(s.encode(stream_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1170
1171
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Python 2: indexing a byte string yields 1-character strings
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
1179
1180
def intlist_to_bytes(xs):
    """Return a byte string built from the list of byte values xs."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)  # Python 3
    # Python 2: chr() already produces byte strings
    return ''.join(map(chr, xs))
1188
1189
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f) for an
# open file object f: via LockFileEx/UnlockFileEx on Windows, via
# fcntl.flock everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx/UnlockFileEx as their last argument
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx; going by
    # the names, this is meant to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Lock f starting at offset 0; raises OSError if LockFileEx fails.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same
        # OVERLAPPED pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 requests an exclusive lock (LOCKFILE_EXCLUSIVE_LOCK in
        # the Win32 API); 0x0 is used for the non-exclusive case
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Release the lock taken by _lock_file; raises OSError on failure.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # LOCK_EX for exclusive locks, LOCK_SH otherwise
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
1253
1254
class locked_file(object):
    """Context manager around a text file that is locked while in use.

    The file is opened on construction; entering the context takes a
    shared lock for mode 'r' and an exclusive lock for 'a' and 'w', and
    leaving it unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        needs_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, needs_exclusive)
        except IOError:
            # Do not leak the handle when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1284
1285
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1289
1290
def shell_quote(args):
    """Return args as one space-separated, shell-quoted unicode string."""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return u' '.join([pipes.quote(as_text(a)) for a in args])
1300
1301
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ends the
        run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1309
1310
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-serialized and appended to url as a
    '#__youtubedl_smuggle=...' fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return url + u'#' + compat_urllib_parse.urlencode(payload)
1317
1318
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged together with
    default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    smuggled_json = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(smuggled_json)
1326
1327
def format_bytes(bytes):
    """Format a byte count with a binary suffix, e.g. 1536 -> '1.50KiB'.

    bytes may be a number or a numeric string; None yields 'N/A'. Values
    below one byte are shown in B and values beyond the largest suffix are
    shown in YiB.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: a negative exponent (tiny fractions)
        # would index from the end, a too large one would raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1340
1341
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable takes precedence; otherwise the
    output of `stty size` is consulted on a best-effort basis.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        # stty prints "<rows> <columns>"
        return int(out.split()[1])
    except Exception:
        # Best effort only (no stty, no tty, unexpected output), but do not
        # swallow KeyboardInterrupt/SystemExit like a bare except would
        pass
    return None
1356
1357
def month_by_name(name):
    """ Return the number (1-12) of a month by its (locale-independent)
    English name, or None if name is not a full English month name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1368
1369
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' by '&amp;' in XML

    Ampersands that already start one of the five predefined entities or a
    short numeric character reference are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1376
1377
def setproctitle(title):
    """Set the process name via prctl from libc.so.6, if available.

    Does nothing when libc.so.6 cannot be loaded or has no prctl symbol.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes sizes it len + 1, so the string
    # handed to prctl is NUL-terminated; a buffer of exactly len bytes has
    # no terminator and lets prctl read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME in the Linux prctl API
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1391
1392
def remove_start(s, start):
    """Return s without the prefix start, or s unchanged if it is absent."""
    if not s.startswith(start):
        return s
    return s[len(start):]
1397
1398
def remove_end(s, end):
    """Return s without the suffix end, or s unchanged if it is absent."""
    # An empty suffix must be skipped explicitly: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1403
1404
def url_basename(url):
    """Return the last segment of the path component of url."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed_path.rpartition(u'/')[2]
1408
1409
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1413
1414
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default for None and ''.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1422
1423
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1426
1427
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped before conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
1434
1435
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default for None and ''.

    The empty string is treated as a missing value, the same way
    int_or_none treats it.
    """
    if v is None or v == '':
        return default
    return float(v) * invscale / scale
1438
1439
def parse_duration(s):
    """Return the duration described by s in seconds, or None.

    Understands forms such as '1:02:03', '3h11m53s' and
    '3 hours 11 minutes 53 seconds'. The result is an int unless s has
    fractional seconds, in which case it is a float. None is returned for
    None and for strings that do not look like a duration.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    hours, mins, secs, fraction = m.group('hours', 'mins', 'secs', 'ms')
    total = int(secs)
    if mins:
        total += int(mins) * 60
        # The pattern only allows hours in front of minutes
        if hours:
            total += int(hours) * 60 * 60
    if fraction:
        total += float(fraction)
    return total
1458
1459
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename."""
    stem, current_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, current_ext)
1463
1464
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary cannot be started. """
    try:
        # The process is run to completion; its output is discarded
        probe = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        probe.communicate()
    except OSError:
        return False
    return exe
1473
1474
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1479
1480
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily until a short page is seen.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum; pagesize is the number of entries on a full
    page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list."""
        pagesize = self._pagesize
        collected = []
        for pagenum in itertools.count(start // pagesize):
            firstid = pagenum * pagesize
            nextfirstid = firstid + pagesize
            if start >= nextfirstid:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = start % pagesize if firstid <= start < nextfirstid else 0

            # Offset just past the last wanted entry, if end is on this page
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                entries = entries[startv:endv]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + startv < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
1522
1523
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known up front.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list."""
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
            only_more = None
        else:
            end_page = end // self._pagesize + 1
            # Number of entries that still have to be collected
            only_more = end - start
        # Entries to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) >= only_more:
                    # This page completes the slice
                    res.extend(page[:only_more])
                    break
                only_more -= len(page)
            res.extend(page)
        return res
1551
1552
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape in s by its character."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1559
1560
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() needs a UTF-8 byte string for unicode input
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
1566
1567
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    parts = compat_urllib_parse_urlparse(url)
    escaped = dict(
        (field, escape_rfc3986(getattr(parts, field)))
        for field in ('path', 'params', 'query', 'fragment'))
    return parts._replace(**escaped).geturl()
1577
# struct_pack / struct_unpack: versions of struct.pack / struct.unpack that
# accept a unicode format string on every supported Python version.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1594
1595
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file; batch_fd is closed afterwards.

    Each line is decoded (if needed) and stripped; a leading byte order
    mark is removed. Empty lines and lines starting with '#', ';' or ']'
    are skipped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # The BOM shows up as U+FEFF when the file was decoded as UTF-8 and
        # as the three characters \xef\xbb\xbf when its raw bytes were
        # mapped one-to-one to code points.
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1610
1611
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1614
1615
# etree_iter(node): iterate over the elements of a tree
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1620
1621
def parse_xml(s):
    """Parse the unicode string s as XML and return the root element.

    Doctype declarations are ignored.
    """
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    # Only pass the custom parser on versions where XML() is given one
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = etree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
1637
1638
# compat_getpass: getpass.getpass, except on Python 2 under Windows where a
# unicode prompt is encoded with the preferred encoding first
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1646
1647
# Maps US rating labels to a numeric age limit; parse_age_limit falls back
# to this table for strings that are not plain ages.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1655
1656
def parse_age_limit(s):
    """Return the age limit described by s as an int, or None.

    s is either a one or two digit age, optionally followed by '+', or a
    key of US_RATINGS.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
1662
1663
def strip_jsonp(code):
    """Return the argument of a JSONP call such as 'callback(...);'.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_call = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return jsonp_call.sub(r'\1', code)
1666
1667
def js_to_json(code):
    """Convert a JavaScript object literal to JSON text.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings, and a trailing comma before ']' is removed.
    """
    # Rewrites applied inside single-quoted strings
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        # JSON keywords and double-quoted strings pass through untouched
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: requote[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), res)
1691
1692
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its index in quality_ids,
    or to -1 if the id is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1701
1702
# %-style template using the mapping keys title, id and ext; presumably the
# default output filename template (going by its name) - confirm at the
# call sites.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1704
# subprocess_check_output: subprocess.check_output where available, with a
# minimal re-implementation for Pythons that lack it
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Popen objects have no .args attribute and CalledProcessError
            # takes no output argument on the Pythons that need this
            # fallback, so recover the command from the call arguments and
            # attach the output afterwards.
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output
1716
1717
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that the result, including the
    trailing '...', is length characters long. None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES