_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import calendar
   5 import codecs
   6 import contextlib
   7 import ctypes
   8 import datetime
   9 import email.utils
  10 import errno
  11 import getpass
  12 import gzip
  13 import itertools
  14 import io
  15 import json
  16 import locale
  17 import math
  18 import os
  19 import pipes
  20 import platform
  21 import re
  22 import ssl
  23 import socket
  24 import struct
  25 import subprocess
  26 import sys
  27 import tempfile
  28 import traceback
  29 import xml.etree.ElementTree
  30 import zlib
  31
  32 try:
  33     import urllib.request as compat_urllib_request
  34 except ImportError: # Python 2
  35     import urllib2 as compat_urllib_request
  36
  37 try:
  38     import urllib.error as compat_urllib_error
  39 except ImportError: # Python 2
  40     import urllib2 as compat_urllib_error
  41
  42 try:
  43     import urllib.parse as compat_urllib_parse
  44 except ImportError: # Python 2
  45     import urllib as compat_urllib_parse
  46
  47 try:
  48     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  49 except ImportError: # Python 2
  50     from urlparse import urlparse as compat_urllib_parse_urlparse
  51
  52 try:
  53     import urllib.parse as compat_urlparse
  54 except ImportError: # Python 2
  55     import urlparse as compat_urlparse
  56
  57 try:
  58     import http.cookiejar as compat_cookiejar
  59 except ImportError: # Python 2
  60     import cookielib as compat_cookiejar
  61
  62 try:
  63     import html.entities as compat_html_entities
  64 except ImportError: # Python 2
  65     import htmlentitydefs as compat_html_entities
  66
  67 try:
  68     import html.parser as compat_html_parser
  69 except ImportError: # Python 2
  70     import HTMLParser as compat_html_parser
  71
  72 try:
  73     import http.client as compat_http_client
  74 except ImportError: # Python 2
  75     import httplib as compat_http_client
  76
  77 try:
  78     from urllib.error import HTTPError as compat_HTTPError
  79 except ImportError:  # Python 2
  80     from urllib2 import HTTPError as compat_HTTPError
  81
  82 try:
  83     from urllib.request import urlretrieve as compat_urlretrieve
  84 except ImportError:  # Python 2
  85     from urllib import urlretrieve as compat_urlretrieve
  86
  87
  88 try:
  89     from subprocess import DEVNULL
  90     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  91 except ImportError:
  92     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  93
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in `string` with the characters they encode.

        Fallback used when urllib.parse.unquote cannot be imported.
        Runs of adjacent escapes are gathered as raw bytes and decoded in one
        go with `encoding`/`errors`, so a character spread over several
        escaped bytes is decoded as a unit. Passing None for `encoding` or
        `errors` selects 'utf-8' / 'replace'. A '%' that is not followed by
        a decodable escape is kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        # No '%' at all: nothing to decode.
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        # Everything before the first '%' is copied through unchanged.
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    # '%' directly followed by another '%' or the end of input
                    raise ValueError
                # NOTE(review): on Python 2 the 'hex' codec may report bad
                # digits with TypeError rather than ValueError - confirm that
                # the handler below really catches malformed escapes.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: put the '%' back and treat as plain text.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
 131
 132
 133 try:
 134     from urllib.parse import parse_qs as compat_parse_qs
 135 except ImportError: # Python 2
 136     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
 137     # Python 2's version is apparently totally broken
 138
 139     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 140                 encoding='utf-8', errors='replace'):
 141         qs, _coerce_result = qs, unicode
 142         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 143         r = []
 144         for name_value in pairs:
 145             if not name_value and not strict_parsing:
 146                 continue
 147             nv = name_value.split('=', 1)
 148             if len(nv) != 2:
 149                 if strict_parsing:
 150                     raise ValueError("bad query field: %r" % (name_value,))
 151                 # Handle case of a control-name with no equal sign
 152                 if keep_blank_values:
 153                     nv.append('')
 154                 else:
 155                     continue
 156             if len(nv[1]) or keep_blank_values:
 157                 name = nv[0].replace('+', ' ')
 158                 name = compat_urllib_parse_unquote(
 159                     name, encoding=encoding, errors=errors)
 160                 name = _coerce_result(name)
 161                 value = nv[1].replace('+', ' ')
 162                 value = compat_urllib_parse_unquote(
 163                     value, encoding=encoding, errors=errors)
 164                 value = _coerce_result(value)
 165                 r.append((name, value))
 166         return r
 167
 168     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 169                 encoding='utf-8', errors='replace'):
 170         parsed_result = {}
 171         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 172                         encoding=encoding, errors=errors)
 173         for name, value in pairs:
 174             if name in parsed_result:
 175                 parsed_result[name].append(value)
 176             else:
 177                 parsed_result[name] = [value]
 178         return parsed_result
 179
 180 try:
 181     compat_str = unicode # Python 2
 182 except NameError:
 183     compat_str = str
 184
 185 try:
 186     compat_chr = unichr # Python 2
 187 except NameError:
 188     compat_chr = chr
 189
 190 try:
 191     from xml.etree.ElementTree import ParseError as compat_xml_parse_error
 192 except ImportError:  # Python 2.6
 193     from xml.parsers.expat import ExpatError as compat_xml_parse_error
 194
 195 try:
 196     from shlex import quote as shlex_quote
 197 except ImportError:  # Python < 3.3
 198     def shlex_quote(s):
 199         return "'" + s.replace("'", "'\"'\"'") + "'"
 200
 201
 202 def compat_ord(c):
 203     if type(c) is int: return c
 204     else: return ord(c)
 205
# Type of a compiled regular expression object, for isinstance() checks.
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request() adds each of
# these to outgoing requests.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # gzip and deflate bodies are decoded by YoutubeDLHandler.http_response()
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 216
 217 def preferredencoding():
 218     """Get preferred encoding.
 219
 220     Returns the best encoding scheme for the system, based on
 221     locale.getpreferredencoding() and some further tweaks.
 222     """
 223     try:
 224         pref = locale.getpreferredencoding()
 225         u'TEST'.encode(pref)
 226     except:
 227         pref = 'UTF-8'
 228
 229     return pref
 230
 231 if sys.version_info < (3,0):
 232     def compat_print(s):
 233         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 234 else:
 235     def compat_print(s):
 236         assert type(s) == type(u'')
 237         print(s)
 238
 239
 240 def write_json_file(obj, fn):
 241     """ Encode obj as JSON and write it to fn, atomically """
 242
 243     args = {
 244         'suffix': '.tmp',
 245         'prefix': os.path.basename(fn) + '.',
 246         'dir': os.path.dirname(fn),
 247         'delete': False,
 248     }
 249
 250     # In Python 2.x, json.dump expects a bytestream.
 251     # In Python 3.x, it writes to a character stream
 252     if sys.version_info < (3, 0):
 253         args['mode'] = 'wb'
 254     else:
 255         args.update({
 256             'mode': 'w',
 257             'encoding': 'utf-8',
 258         })
 259
 260     tf = tempfile.NamedTemporaryFile(**args)
 261
 262     try:
 263         with tf:
 264             json.dump(obj, tf)
 265         os.rename(tf.name, fn)
 266     except:
 267         try:
 268             os.remove(tf.name)
 269         except OSError:
 270             pass
 271         raise
 272
 273
 274 if sys.version_info >= (2, 7):
 275     def find_xpath_attr(node, xpath, key, val):
 276         """ Find the xpath xpath[@key=val] """
 277         assert re.match(r'^[a-zA-Z-]+$', key)
 278         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 279         expr = xpath + u"[@%s='%s']" % (key, val)
 280         return node.find(expr)
 281 else:
 282     def find_xpath_attr(node, xpath, key, val):
 283         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 284         # .//node does not match if a node is a direct child of . !
 285         if isinstance(xpath, unicode):
 286             xpath = xpath.encode('ascii')
 287
 288         for f in node.findall(xpath):
 289             if f.attrib.get(key) == val:
 290                 return f
 291         return None
 292
 293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 294 # the namespace parameter
 295 def xpath_with_ns(path, ns_map):
 296     components = [c.split(':') for c in path.split('/')]
 297     replaced = []
 298     for c in components:
 299         if len(c) == 1:
 300             replaced.append(c[0])
 301         else:
 302             ns, tag = c
 303             replaced.append('{%s}%s' % (ns_map[ns], tag))
 304     return '/'.join(replaced)
 305
 306
 307 def xpath_text(node, xpath, name=None, fatal=False):
 308     if sys.version_info < (2, 7):  # Crazy 2.6
 309         xpath = xpath.encode('ascii')
 310
 311     n = node.find(xpath)
 312     if n is None:
 313         if fatal:
 314             name = xpath if name is None else name
 315             raise ExtractorError('Could not find XML element %s' % name)
 316         else:
 317             return None
 318     return n.text
 319
 320
 321 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 322 class BaseHTMLParser(compat_html_parser.HTMLParser):
 323     def __init(self):
 324         compat_html_parser.HTMLParser.__init__(self)
 325         self.html = None
 326
 327     def loads(self, html):
 328         self.html = html
 329         self.feed(html)
 330         self.close()
 331
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_pos, end_pos] once the element has been found
        # and closed; positions are the (line, column) pairs from getpos().
        self.result = None
        # True while the parser is inside the matched element.
        self.started = False
        # Per tag name: how many such tags are currently open inside the match.
        self.depth = {}
        # True right after the match, until the content start was recorded.
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by skipping ahead, within limits.

        Gives up (raising HTMLParseError) after more than 10 errors or when
        the error occurs inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested tag is the first thing after the matched opening tag:
            # the content starts here.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The match ends when every tag of the matched element's own
            # name has been closed again.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other kind of parse event after the match also marks the start
    # of the content, so they all share the same handler.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped markup between the recorded start and end
        positions, or None if no complete match was recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Line numbers in the recorded positions are 1-based.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line: the end column has to be
            # shifted by what was just cut off the front.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, the literal text "</scr'+'ipt>" is
# stepped over as a whole instead of being handed to the regular end-tag
# parser; everything else is delegated to HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 397
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document

    Shorthand for get_element_by_attribute("id", id, html); returns whatever
    that call returns.
    """
    return get_element_by_attribute("id", id, html)
 401
 402 def get_element_by_attribute(attribute, value, html):
 403     """Return the content of the tag with the specified attribute in the passed HTML document"""
 404     parser = AttrParser(attribute, value)
 405     try:
 406         parser.loads(html)
 407     except compat_html_parser.HTMLParseError:
 408         pass
 409     return parser.get_result()
 410
 411 class MetaParser(BaseHTMLParser):
 412     """
 413     Modified HTMLParser that isolates a meta tag with the specified name
 414     attribute.
 415     """
 416     def __init__(self, name):
 417         BaseHTMLParser.__init__(self)
 418         self.name = name
 419         self.content = None
 420         self.result = None
 421
 422     def handle_starttag(self, tag, attrs):
 423         if tag != 'meta':
 424             return
 425         attrs = dict(attrs)
 426         if attrs.get('name') == self.name:
 427             self.result = attrs.get('content')
 428
 429     def get_result(self):
 430         return self.result
 431
 432 def get_meta_content(name, html):
 433     """
 434     Return the content attribute from the meta tag with the given name attribute.
 435     """
 436     parser = MetaParser(name)
 437     try:
 438         parser.loads(html)
 439     except compat_html_parser.HTMLParseError:
 440         pass
 441     return parser.get_result()
 442
 443
 444 def clean_html(html):
 445     """Clean an HTML snippet into a readable string"""
 446     # Newline vs <br />
 447     html = html.replace('\n', ' ')
 448     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 449     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 450     # Strip html tags
 451     html = re.sub('<.*?>', '', html)
 452     # Replace html entities
 453     html = unescapeHTML(html)
 454     return html.strip()
 455
 456
 457 def sanitize_open(filename, open_mode):
 458     """Try to open the given filename, and slightly tweak it if this fails.
 459
 460     Attempts to open the given filename. If this fails, it tries to change
 461     the filename slightly, step by step, until it's either able to open it
 462     or it fails and raises a final exception, like the standard open()
 463     function.
 464
 465     It returns the tuple (stream, definitive_file_name).
 466     """
 467     try:
 468         if filename == u'-':
 469             if sys.platform == 'win32':
 470                 import msvcrt
 471                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 472             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 473         stream = open(encodeFilename(filename), open_mode)
 474         return (stream, filename)
 475     except (IOError, OSError) as err:
 476         if err.errno in (errno.EACCES,):
 477             raise
 478
 479         # In case of error, try to remove win32 forbidden chars
 480         alt_filename = os.path.join(
 481                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 482                         for path_part in os.path.split(filename)
 483                        )
 484         if alt_filename == filename:
 485             raise
 486         else:
 487             # An exception here should be caught in the caller
 488             stream = open(encodeFilename(filename), open_mode)
 489             return (stream, alt_filename)
 490
 491
 492 def timeconvert(timestr):
 493     """Convert RFC 2822 defined time string into system timestamp"""
 494     timestamp = None
 495     timetuple = email.utils.parsedate_tz(timestr)
 496     if timetuple is not None:
 497         timestamp = email.utils.mktime_tz(timetuple)
 498     return timestamp
 499
 500 def sanitize_filename(s, restricted=False, is_id=False):
 501     """Sanitizes a string so it could be used as part of a filename.
 502     If restricted is set, use a stricter subset of allowed characters.
 503     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 504     """
 505     def replace_insane(char):
 506         if char == '?' or ord(char) < 32 or ord(char) == 127:
 507             return ''
 508         elif char == '"':
 509             return '' if restricted else '\''
 510         elif char == ':':
 511             return '_-' if restricted else ' -'
 512         elif char in '\\/|*<>':
 513             return '_'
 514         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 515             return '_'
 516         if restricted and ord(char) > 127:
 517             return '_'
 518         return char
 519
 520     result = u''.join(map(replace_insane, s))
 521     if not is_id:
 522         while '__' in result:
 523             result = result.replace('__', '_')
 524         result = result.strip('_')
 525         # Common case of "Foreign band name - English song title"
 526         if restricted and result.startswith('-_'):
 527             result = result[2:]
 528         if not result:
 529             result = '_'
 530     return result
 531
 532 def orderedSet(iterable):
 533     """ Remove all duplicates from the input iterable """
 534     res = []
 535     for el in iterable:
 536         if el not in res:
 537             res.append(el)
 538     return res
 539
 540
 541 def _htmlentity_transform(entity):
 542     """Transforms an HTML entity to a character."""
 543     # Known non-numeric HTML entity
 544     if entity in compat_html_entities.name2codepoint:
 545         return compat_chr(compat_html_entities.name2codepoint[entity])
 546
 547     mobj = re.match(r'#(x?[0-9]+)', entity)
 548     if mobj is not None:
 549         numstr = mobj.group(1)
 550         if numstr.startswith(u'x'):
 551             base = 16
 552             numstr = u'0%s' % numstr
 553         else:
 554             base = 10
 555         return compat_chr(int(numstr, base))
 556
 557     # Unknown entity in name, return its literal representation
 558     return (u'&%s;' % entity)
 559
 560
 561 def unescapeHTML(s):
 562     if s is None:
 563         return None
 564     assert type(s) == compat_str
 565
 566     return re.sub(
 567         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 568
 569
 570 def encodeFilename(s, for_subprocess=False):
 571     """
 572     @param s The name of the file
 573     """
 574
 575     assert type(s) == compat_str
 576
 577     # Python 3 has a Unicode API
 578     if sys.version_info >= (3, 0):
 579         return s
 580
 581     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 582         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 583         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 584         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 585         if not for_subprocess:
 586             return s
 587         else:
 588             # For subprocess calls, encode with locale encoding
 589             # Refer to http://stackoverflow.com/a/9951851/35070
 590             encoding = preferredencoding()
 591     else:
 592         encoding = sys.getfilesystemencoding()
 593     if encoding is None:
 594         encoding = 'utf-8'
 595     return s.encode(encoding, 'ignore')
 596
 597
 598 def encodeArgument(s):
 599     if not isinstance(s, compat_str):
 600         # Legacy code that uses byte strings
 601         # Uncomment the following line after fixing all post processors
 602         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 603         s = s.decode('ascii')
 604     return encodeFilename(s, True)
 605
 606
 607 def decodeOption(optval):
 608     if optval is None:
 609         return optval
 610     if isinstance(optval, bytes):
 611         optval = optval.decode(preferredencoding())
 612
 613     assert isinstance(optval, compat_str)
 614     return optval
 615
 616 def formatSeconds(secs):
 617     if secs > 3600:
 618         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 619     elif secs > 60:
 620         return '%d:%02d' % (secs // 60, secs % 60)
 621     else:
 622         return '%d' % secs
 623
 624
 625 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 626     if sys.version_info < (3, 2):
 627         import httplib
 628
 629         class HTTPSConnectionV3(httplib.HTTPSConnection):
 630             def __init__(self, *args, **kwargs):
 631                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 632
 633             def connect(self):
 634                 sock = socket.create_connection((self.host, self.port), self.timeout)
 635                 if getattr(self, '_tunnel_host', False):
 636                     self.sock = sock
 637                     self._tunnel()
 638                 try:
 639                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 640                 except ssl.SSLError:
 641                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 642
 643         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 644             def https_open(self, req):
 645                 return self.do_open(HTTPSConnectionV3, req)
 646         return HTTPSHandlerV3(**kwargs)
 647     elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
 648         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 649         context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
 650         if opts_no_check_certificate:
 651             context.verify_mode = ssl.CERT_NONE
 652         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 653     else:  # Python < 3.4
 654         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 655         context.verify_mode = (ssl.CERT_NONE
 656                                if opts_no_check_certificate
 657                                else ssl.CERT_REQUIRED)
 658         context.set_default_verify_paths()
 659         try:
 660             context.load_default_certs()
 661         except AttributeError:
 662             pass  # Python < 3.4
 663         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 664
 665 class ExtractorError(Exception):
 666     """Error during info extraction."""
 667     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 668         """ tb, if given, is the original traceback (so that it can be printed out).
 669         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 670         """
 671
 672         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 673             expected = True
 674         if video_id is not None:
 675             msg = video_id + ': ' + msg
 676         if cause:
 677             msg += u' (caused by %r)' % cause
 678         if not expected:
 679             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 680         super(ExtractorError, self).__init__(msg)
 681
 682         self.traceback = tb
 683         self.exc_info = sys.exc_info()  # preserve original exception
 684         self.cause = cause
 685         self.video_id = video_id
 686
 687     def format_traceback(self):
 688         if self.traceback is None:
 689             return None
 690         return u''.join(traceback.format_tb(self.traceback))
 691
 692
 693 class RegexNotFoundError(ExtractorError):
 694     """Error when a regex didn't match"""
 695     pass
 696
 697
 698 class DownloadError(Exception):
 699     """Download Error exception.
 700
 701     This exception may be thrown by FileDownloader objects if they are not
 702     configured to continue on errors. They will contain the appropriate
 703     error message.
 704     """
 705     def __init__(self, msg, exc_info=None):
 706         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 707         super(DownloadError, self).__init__(msg)
 708         self.exc_info = exc_info
 709
 710
 711 class SameFileError(Exception):
 712     """Same File exception.
 713
 714     This exception will be thrown by FileDownloader objects if they detect
 715     multiple files would have to be downloaded to the same file on disk.
 716     """
 717     pass
 718
 719
 720 class PostProcessingError(Exception):
 721     """Post Processing exception.
 722
 723     This exception may be raised by PostProcessor's .run() method to
 724     indicate an error in the postprocessing task.
 725     """
 726     def __init__(self, msg):
 727         self.msg = msg
 728
 729 class MaxDownloadsReached(Exception):
 730     """ --max-downloads limit has been reached. """
 731     pass
 732
 733
 734 class UnavailableVideoError(Exception):
 735     """Unavailable Format exception.
 736
 737     This exception will be thrown when a video is requested
 738     in a format that is not available for that video.
 739     """
 740     pass
 741
 742
 743 class ContentTooShortError(Exception):
 744     """Content Too Short exception.
 745
 746     This exception may be raised by FileDownloader objects when a file they
 747     download is too small for what the server announced first, indicating
 748     the connection was probably interrupted.
 749     """
 750     # Both in bytes
 751     downloaded = None
 752     expected = None
 753
 754     def __init__(self, downloaded, expected):
 755         self.downloaded = downloaded
 756         self.expected = expected
 757
 758 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 759     """Handler for HTTP requests and responses.
 760
 761     This class, when installed with an OpenerDirector, automatically adds
 762     the standard headers to every HTTP request and handles gzipped and
 763     deflated responses from web servers. If compression is to be avoided in
 764     a particular request, the original request in the program code only has
 765     to include the HTTP header "Youtubedl-No-Compression", which will be
 766     removed before making the real request.
 767
 768     Part of this code was copied from:
 769
 770     http://techknack.net/python-urllib2-handlers/
 771
 772     Andrew Rowls, the author of that code, agreed to release it to the
 773     public domain.
 774     """
 775
 776     @staticmethod
 777     def deflate(data):
 778         try:
 779             return zlib.decompress(data, -zlib.MAX_WBITS)
 780         except zlib.error:
 781             return zlib.decompress(data)
 782
 783     @staticmethod
 784     def addinfourl_wrapper(stream, headers, url, code):
 785         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 786             return compat_urllib_request.addinfourl(stream, headers, url, code)
 787         ret = compat_urllib_request.addinfourl(stream, headers, url)
 788         ret.code = code
 789         return ret
 790
 791     def http_request(self, req):
 792         for h, v in std_headers.items():
 793             if h not in req.headers:
 794                 req.add_header(h, v)
 795         if 'Youtubedl-no-compression' in req.headers:
 796             if 'Accept-encoding' in req.headers:
 797                 del req.headers['Accept-encoding']
 798             del req.headers['Youtubedl-no-compression']
 799         if 'Youtubedl-user-agent' in req.headers:
 800             if 'User-agent' in req.headers:
 801                 del req.headers['User-agent']
 802             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 803             del req.headers['Youtubedl-user-agent']
 804
 805         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 806             # Python 2.6 is brain-dead when it comes to fragments
 807             req._Request__original = req._Request__original.partition('#')[0]
 808             req._Request__r_type = req._Request__r_type.partition('#')[0]
 809
 810         return req
 811
 812     def http_response(self, req, resp):
 813         old_resp = resp
 814         # gzip
 815         if resp.headers.get('Content-encoding', '') == 'gzip':
 816             content = resp.read()
 817             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 818             try:
 819                 uncompressed = io.BytesIO(gz.read())
 820             except IOError as original_ioerror:
 821                 # There may be junk add the end of the file
 822                 # See http://stackoverflow.com/q/4928560/35070 for details
 823                 for i in range(1, 1024):
 824                     try:
 825                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 826                         uncompressed = io.BytesIO(gz.read())
 827                     except IOError:
 828                         continue
 829                     break
 830                 else:
 831                     raise original_ioerror
 832             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 833             resp.msg = old_resp.msg
 834         # deflate
 835         if resp.headers.get('Content-encoding', '') == 'deflate':
 836             gz = io.BytesIO(self.deflate(resp.read()))
 837             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 838             resp.msg = old_resp.msg
 839         return resp
 840
    # HTTPS requests and responses go through exactly the same hooks as
    # their plain HTTP counterparts.
    https_request = http_request
    https_response = http_response
 843
 844
 845 def parse_iso8601(date_str, delimiter='T'):
 846     """ Return a UNIX timestamp from the given date """
 847
 848     if date_str is None:
 849         return None
 850
 851     m = re.search(
 852         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
 853         date_str)
 854     if not m:
 855         timezone = datetime.timedelta()
 856     else:
 857         date_str = date_str[:-len(m.group(0))]
 858         if not m.group('sign'):
 859             timezone = datetime.timedelta()
 860         else:
 861             sign = 1 if m.group('sign') == '+' else -1
 862             timezone = datetime.timedelta(
 863                 hours=sign * int(m.group('hours')),
 864                 minutes=sign * int(m.group('minutes')))
 865     date_format =  '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 866     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 867     return calendar.timegm(dt.timetuple())
 868
 869
 870 def unified_strdate(date_str):
 871     """Return a string with the date in the format YYYYMMDD"""
 872
 873     if date_str is None:
 874         return None
 875
 876     upload_date = None
 877     #Replace commas
 878     date_str = date_str.replace(',', ' ')
 879     # %z (UTC offset) is only supported in python>=3.2
 880     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 881     format_expressions = [
 882         '%d %B %Y',
 883         '%d %b %Y',
 884         '%B %d %Y',
 885         '%b %d %Y',
 886         '%b %dst %Y %I:%M%p',
 887         '%b %dnd %Y %I:%M%p',
 888         '%b %dth %Y %I:%M%p',
 889         '%Y-%m-%d',
 890         '%Y/%m/%d',
 891         '%d.%m.%Y',
 892         '%d/%m/%Y',
 893         '%d/%m/%y',
 894         '%Y/%m/%d %H:%M:%S',
 895         '%d/%m/%Y %H:%M:%S',
 896         '%Y-%m-%d %H:%M:%S',
 897         '%d.%m.%Y %H:%M',
 898         '%d.%m.%Y %H.%M',
 899         '%Y-%m-%dT%H:%M:%SZ',
 900         '%Y-%m-%dT%H:%M:%S.%fZ',
 901         '%Y-%m-%dT%H:%M:%S.%f0Z',
 902         '%Y-%m-%dT%H:%M:%S',
 903         '%Y-%m-%dT%H:%M:%S.%f',
 904         '%Y-%m-%dT%H:%M',
 905     ]
 906     for expression in format_expressions:
 907         try:
 908             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 909         except ValueError:
 910             pass
 911     if upload_date is None:
 912         timetuple = email.utils.parsedate_tz(date_str)
 913         if timetuple:
 914             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 915     return upload_date
 916
 917 def determine_ext(url, default_ext=u'unknown_video'):
 918     if url is None:
 919         return default_ext
 920     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 921     if re.match(r'^[A-Za-z0-9]+$', guess):
 922         return guess
 923     else:
 924         return default_ext
 925
 926 def subtitles_filename(filename, sub_lang, sub_format):
 927     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 928
 929 def date_from_str(date_str):
 930     """
 931     Return a datetime object from a string in the format YYYYMMDD or
 932     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 933     today = datetime.date.today()
 934     if date_str == 'now'or date_str == 'today':
 935         return today
 936     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 937     if match is not None:
 938         sign = match.group('sign')
 939         time = int(match.group('time'))
 940         if sign == '-':
 941             time = -time
 942         unit = match.group('unit')
 943         #A bad aproximation?
 944         if unit == 'month':
 945             unit = 'day'
 946             time *= 30
 947         elif unit == 'year':
 948             unit = 'day'
 949             time *= 365
 950         unit += 's'
 951         delta = datetime.timedelta(**{unit: time})
 952         return today + delta
 953     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 954
 955 def hyphenate_date(date_str):
 956     """
 957     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 958     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 959     if match is not None:
 960         return '-'.join(match.groups())
 961     else:
 962         return date_str
 963
 964 class DateRange(object):
 965     """Represents a time interval between two dates"""
 966     def __init__(self, start=None, end=None):
 967         """start and end must be strings in the format accepted by date"""
 968         if start is not None:
 969             self.start = date_from_str(start)
 970         else:
 971             self.start = datetime.datetime.min.date()
 972         if end is not None:
 973             self.end = date_from_str(end)
 974         else:
 975             self.end = datetime.datetime.max.date()
 976         if self.start > self.end:
 977             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 978     @classmethod
 979     def day(cls, day):
 980         """Returns a range that only contains the given day"""
 981         return cls(day,day)
 982     def __contains__(self, date):
 983         """Check if the date is in the range"""
 984         if not isinstance(date, datetime.date):
 985             date = date_from_str(date)
 986         return self.start <= date <= self.end
 987     def __str__(self):
 988         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 989
 990
 991 def platform_name():
 992     """ Returns the platform name as a compat_str """
 993     res = platform.platform()
 994     if isinstance(res, bytes):
 995         res = res.decode(preferredencoding())
 996
 997     assert isinstance(res, compat_str)
 998     return res
 999
1000
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the target stream.  False is returned
    when out has no fileno(), when its descriptor is neither 1 nor 2, or
    when the corresponding standard handle is not a console.  Raises
    OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (presumably STD_OUTPUT_HANDLE / STD_ERROR_HANDLE - confirm against
    # the Win32 documentation)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle counts as a console only if it is valid, its file type
        # (ignoring the REMOTE flag) is CHAR and GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at once and stop right before the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; a length of
        # 2 is passed for it (presumably its two UTF-16 code units)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever the console actually accepted
            s = s[written.value:]
    return True
1071
1072
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no explicit encoding is requested, the console is
    tried first via _windows_write_string.  Otherwise s is encoded (with
    errors ignored) for binary streams, for any stream on Python 2 and
    for streams that expose a .buffer, or written as text as a last resort.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if try_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1093
1094
def bytes_to_intlist(bs):
    """Convert a byte string into the list of its byte values."""
    if not bs:
        return []
    first = bs[0]
    if not isinstance(first, int):
        # Python 2: indexing yields 1-character strings
        return [ord(ch) for ch in bs]
    return list(bs)  # Python 3
1102
1103
def intlist_to_bytes(xs):
    """Convert a list of byte values into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already produces byte strings
    return ''.join(chr(value) for value in xs)
1111
1112
# Cross-platform file locking
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) - lock the open file f (exclusive or shared)
#   _unlock_file(f)          - release a lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout of the structure handed to LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count that is locked, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f via LockFileEx; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can reuse it
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is presumably LOCKFILE_EXCLUSIVE_LOCK - confirm against the
        # Win32 documentation
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock f via UnlockFileEx; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock() lock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock() lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1176
1177
class locked_file(object):
    """Context manager around a file that is locked while in use.

    The file is opened on construction.  Entering the context locks it
    (shared for mode 'r', exclusive for 'a' and 'w'); leaving the context
    unlocks and closes it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            # Anything but plain reading needs an exclusive lock
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the handle if the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1207
1208
def shell_quote(args):
    """Return args joined into a single shell-escaped command line.

    args is an iterable of text or byte strings; byte strings are decoded
    with the filesystem encoding (utf-8 if that is unknown) first.
    """
    # shlex.quote is the documented API (Python >= 3.3); pipes.quote is its
    # undocumented predecessor and the pipes module is gone in Python 3.13
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    quoted_args = []
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1220
1221
def takewhile_inclusive(pred, seq):
    """ Variant of itertools.takewhile that also yields the element that
        ends the run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        return
1229
1230
def smuggle_url(url, data):
    """ Attach JSON-serializable data to a URL for internal use.

    The data is stored URL-encoded in the fragment, under the
    __youtubedl_smuggle key. """

    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1237
1238
def unsmuggle_url(smug_url, default=None):
    """ Split a smuggled URL into (url, data).

    URLs without smuggled data are returned as (smug_url, default). """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    serialized = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(serialized)
1246
1247
def format_bytes(bytes):
    """Format a byte count for humans, e.g. 1536 -> '1.50KiB'.

    bytes may be a number, a numeric string or None ('N/A' is returned
    for None).  Values beyond the largest known unit are expressed in
    that unit (YiB) instead of failing.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Stay inside the suffix table: huge values would raise IndexError,
        # values below 1 would wrap around to the largest suffix
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1260
1261
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable takes precedence; otherwise the
    output of `stty size` is consulted.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (no stty, no tty, unexpected output, ...), but
        # do not swallow KeyboardInterrupt or SystemExit
        pass
    return None
1276
1277
def month_by_name(name):
    """ Map an English month name to its number (1-12), independently of
    the current locale; unknown names give None """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name not in ENGLISH_NAMES:
        return None
    return ENGLISH_NAMES.index(name) + 1
1288
1289
def fix_xml_ampersands(xml_str):
    """Escape every '&' that does not already start an XML entity as '&amp;'"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub(u'&amp;', xml_str)
1296
1297
def setproctitle(title):
    """Try to set the name of the current process to title.

    title must be a compat_str.  This is best effort: nothing happens if
    libc.so.6 cannot be loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes makes it one item larger than
    # the title, so the string handed to prctl is NUL-terminated
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is presumably PR_SET_NAME - confirm against prctl(2)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1311
1312
def remove_start(s, start):
    """Return s without the prefix start (unchanged if it is not a prefix)."""
    return s[len(start):] if s.startswith(start) else s
1317
1318
def remove_end(s, end):
    """Return s without the suffix end (unchanged if it is not a suffix)."""
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1323
1324
def url_basename(url):
    """Return the last component of the path of url."""
    url_path = compat_urlparse.urlparse(url).path
    components = url_path.strip(u'/').split(u'/')
    return components[-1]
1328
1329
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1333
1334
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default if v is None or ''.

    If get_attr is given, that attribute of v is converted instead.  The
    result is multiplied by invscale and floor-divided by scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1342
1343
def str_or_none(v, default=None):
    """Convert v to a compat_str, or return default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1346
1347
def str_to_int(int_str):
    """ Like int_or_none, but commas, dots and plus signs in the string
    are ignored """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
1354
1355
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale / scale, or return default
    if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1358
1359
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min 5s' or '12.5' into seconds.

    Returns None if s is None or not recognized.  The result is an int,
    unless a fractional part is present, in which case it is a float.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    hours, mins, fraction = m.group('hours'), m.group('mins'), m.group('ms')
    total = int(m.group('secs'))
    if mins:
        total += int(mins) * 60
        # Hours can only occur together with minutes
        if hours:
            total += int(hours) * 60 * 60
    if fraction:
        total += float(fraction)
    return total
1378
1379
def prepend_extension(filename, ext):
    """Insert ext in front of the real extension: a.mp4 -> a.<ext>.mp4"""
    stem, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, real_ext)
1383
1384
def check_executable(exe, args=[]):
    """ Return exe if the given binary can be started (i.e. it is found in
    PATH), False otherwise.
    args is a list of arguments that keeps the output short (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1393
1394
class PagedList(object):
    """Base class of paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1399
1400
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily through pagefunc(pagenum).

    Every page is expected to hold pagesize entries; a shorter page is
    taken to be the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list."""
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            page_first = pagenum * self._pagesize
            next_page_first = pagenum * self._pagesize + self._pagesize
            if start >= next_page_first:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            if page_first <= start < next_page_first:
                lower = start % self._pagesize
            else:
                lower = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and page_first <= end <= next_page_first:
                upper = ((end - 1) % self._pagesize) + 1
            else:
                upper = None

            if lower != 0 or upper is not None:
                entries = entries[lower:upper]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + lower < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_page_first:
                break
        return collected
1442
1443
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list."""
        collected = []
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the front of the first page
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1471
1472
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape sequence in s by the character it
    denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1479
1480
def escape_rfc3986(s):
    """Percent-escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() needs a byte string
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    keep_unescaped = "%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, keep_unescaped)
1486
1487
def escape_url(url):
    """Escape a whole URL as suggested by RFC 3986

    Path, params, query and fragment are escaped individually; scheme and
    network location are left untouched."""
    parts = compat_urllib_parse_urlparse(url)
    escaped = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1497
# struct_pack / struct_unpack accept text format strings on every supported
# Python version.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        ascii_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(ascii_spec, *args)

    def struct_unpack(spec, *args):
        ascii_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(ascii_spec, *args)
else:
    # The stock functions cope with text format strings
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1514
1515
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and close the file.

    batch_fd is an iterable of lines (text or UTF-8 encoded bytes) that
    offers close().  Lines are stripped; empty lines and lines starting
    with '#', ';' or ']' are skipped.  Returns the list of remaining URLs.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A byte order mark shows up as U+FEFF in correctly decoded text,
        # and as the three characters \xef\xbb\xbf if the UTF-8 bytes were
        # decoded one by one (e.g. as latin-1)
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1530
1531
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1534
1535
# etree_iter(node) iterates over the elements below (and, where
# Element.iter exists, including) node.
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1540
1541
def parse_xml(s):
    """Parse the XML document in the text s and return its root element.

    Doctype declarations are ignored.  On Python 2, byte-string element
    texts are decoded as UTF-8.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    etree = xml.etree.ElementTree
    kwargs = {}
    custom_parser = etree.XMLParser(target=TreeBuilder())
    if sys.version_info >= (2, 7):
        kwargs['parser'] = custom_parser
    root = etree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return root
1557
1558
# getpass.getpass is used as is, except on Python 2 under Windows, where
# a text prompt is encoded first.
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1566
1567
# Age limit associated with each US rating
US_RATINGS = {
    'G': 0, 'PG': 10, 'PG-13': 13,
    'R': 16, 'NC': 18,
}


def parse_age_limit(s):
    """Turn an age limit such as '18' or '16+', or a US rating, into a
    number.  None and unrecognized values give 0."""
    if s is None:
        return 0
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, 0)
1582
1583
def strip_jsonp(code):
    """Remove a JSONP wrapper: 'callback(<payload>);' becomes '<payload>'.

    Text that is not wrapped this way is returned unchanged."""
    wrapper = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return wrapper.sub(r'\1', code)
1586
1587
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers become double-quoted
    strings (true, false and null are kept), and a comma directly before
    a closing bracket is dropped.
    """
    # How escapes inside single-quoted strings carry over to double quotes
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda esc: requote[esc.group(0)], token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), converted)
1611
1612
def qualities(quality_ids):
    """ Build a function that maps a quality id to its position in
    quality_ids, or to -1 for unknown ids """
    def q(qid):
        return quality_ids.index(qid) if qid in quality_ids else -1
    return q
1621
1622
# Output filename template, built from the title, id and ext fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1624
# subprocess.check_output, with a minimal replacement for Python versions
# that lack it.
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return its standard output.

        Raises subprocess.CalledProcessError (with the output attached as
        .output) if the command exits with a non-zero status.
        """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Popen objects of the old Python versions this fallback is
            # meant for have no .args attribute, and their
            # CalledProcessError takes no output argument
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output
1636
1637
def limit_length(s, length):
    """ Shorten s to at most length characters, ending in an ellipsis if
    anything had to be cut off; None is passed through """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES