_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import calendar
   5 import codecs
   6 import contextlib
   7 import ctypes
   8 import datetime
   9 import email.utils
  10 import errno
  11 import getpass
  12 import gzip
  13 import itertools
  14 import io
  15 import json
  16 import locale
  17 import math
  18 import os
  19 import pipes
  20 import platform
  21 import re
  22 import ssl
  23 import socket
  24 import struct
  25 import subprocess
  26 import sys
  27 import tempfile
  28 import traceback
  29 import xml.etree.ElementTree
  30 import zlib
  31
  32 try:
  33     import urllib.request as compat_urllib_request
  34 except ImportError: # Python 2
  35     import urllib2 as compat_urllib_request
  36
  37 try:
  38     import urllib.error as compat_urllib_error
  39 except ImportError: # Python 2
  40     import urllib2 as compat_urllib_error
  41
  42 try:
  43     import urllib.parse as compat_urllib_parse
  44 except ImportError: # Python 2
  45     import urllib as compat_urllib_parse
  46
  47 try:
  48     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  49 except ImportError: # Python 2
  50     from urlparse import urlparse as compat_urllib_parse_urlparse
  51
  52 try:
  53     import urllib.parse as compat_urlparse
  54 except ImportError: # Python 2
  55     import urlparse as compat_urlparse
  56
  57 try:
  58     import http.cookiejar as compat_cookiejar
  59 except ImportError: # Python 2
  60     import cookielib as compat_cookiejar
  61
  62 try:
  63     import html.entities as compat_html_entities
  64 except ImportError: # Python 2
  65     import htmlentitydefs as compat_html_entities
  66
  67 try:
  68     import html.parser as compat_html_parser
  69 except ImportError: # Python 2
  70     import HTMLParser as compat_html_parser
  71
  72 try:
  73     import http.client as compat_http_client
  74 except ImportError: # Python 2
  75     import httplib as compat_http_client
  76
  77 try:
  78     from urllib.error import HTTPError as compat_HTTPError
  79 except ImportError:  # Python 2
  80     from urllib2 import HTTPError as compat_HTTPError
  81
  82 try:
  83     from urllib.request import urlretrieve as compat_urlretrieve
  84 except ImportError:  # Python 2
  85     from urllib import urlretrieve as compat_urlretrieve
  86
  87
  88 try:
  89     from subprocess import DEVNULL
  90     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  91 except ImportError:
  92     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  93
  94 try:
  95     from urllib.parse import unquote as compat_urllib_parse_unquote
  96 except ImportError:
  97     def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
  98         if string == '':
  99             return string
 100         res = string.split('%')
 101         if len(res) == 1:
 102             return string
 103         if encoding is None:
 104             encoding = 'utf-8'
 105         if errors is None:
 106             errors = 'replace'
 107         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
 108         pct_sequence = b''
 109         string = res[0]
 110         for item in res[1:]:
 111             try:
 112                 if not item:
 113                     raise ValueError
 114                 pct_sequence += item[:2].decode('hex')
 115                 rest = item[2:]
 116                 if not rest:
 117                     # This segment was just a single percent-encoded character.
 118                     # May be part of a sequence of code units, so delay decoding.
 119                     # (Stored in pct_sequence).
 120                     continue
 121             except ValueError:
 122                 rest = '%' + item
 123             # Encountered non-percent-encoded characters. Flush the current
 124             # pct_sequence.
 125             string += pct_sequence.decode(encoding, errors) + rest
 126             pct_sequence = b''
 127         if pct_sequence:
 128             # Flush the final pct_sequence
 129             string += pct_sequence.decode(encoding, errors)
 130         return string
 131
 132
 133 try:
 134     from urllib.parse import parse_qs as compat_parse_qs
 135 except ImportError: # Python 2
 136     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
 137     # Python 2's version is apparently totally broken
 138
 139     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 140                 encoding='utf-8', errors='replace'):
 141         qs, _coerce_result = qs, unicode
 142         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 143         r = []
 144         for name_value in pairs:
 145             if not name_value and not strict_parsing:
 146                 continue
 147             nv = name_value.split('=', 1)
 148             if len(nv) != 2:
 149                 if strict_parsing:
 150                     raise ValueError("bad query field: %r" % (name_value,))
 151                 # Handle case of a control-name with no equal sign
 152                 if keep_blank_values:
 153                     nv.append('')
 154                 else:
 155                     continue
 156             if len(nv[1]) or keep_blank_values:
 157                 name = nv[0].replace('+', ' ')
 158                 name = compat_urllib_parse_unquote(
 159                     name, encoding=encoding, errors=errors)
 160                 name = _coerce_result(name)
 161                 value = nv[1].replace('+', ' ')
 162                 value = compat_urllib_parse_unquote(
 163                     value, encoding=encoding, errors=errors)
 164                 value = _coerce_result(value)
 165                 r.append((name, value))
 166         return r
 167
 168     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 169                 encoding='utf-8', errors='replace'):
 170         parsed_result = {}
 171         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 172                         encoding=encoding, errors=errors)
 173         for name, value in pairs:
 174             if name in parsed_result:
 175                 parsed_result[name].append(value)
 176             else:
 177                 parsed_result[name] = [value]
 178         return parsed_result
 179
# Text string type: the `unicode` builtin where it exists (Python 2),
# otherwise `str`.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: the `unichr` builtin where it
# exists (Python 2), otherwise `chr`.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 189
 190 try:
 191     from xml.etree.ElementTree import ParseError as compat_xml_parse_error
 192 except ImportError:  # Python 2.6
 193     from xml.parsers.expat import ExpatError as compat_xml_parse_error
 194
 195 try:
 196     from shlex import quote as shlex_quote
 197 except ImportError:  # Python < 3.3
 198     def shlex_quote(s):
 199         return "'" + s.replace("'", "'\"'\"'") + "'"
 200
 201
 202 def compat_ord(c):
 203     if type(c) is int: return c
 204     else: return ord(c)
 205
# Type of compiled regular expression objects, obtained from an actual
# compiled pattern because it is not clearly defined otherwise.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers.  YoutubeDLHandler.http_request adds each of
# these to an outgoing request that does not already carry that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 216
 217 def preferredencoding():
 218     """Get preferred encoding.
 219
 220     Returns the best encoding scheme for the system, based on
 221     locale.getpreferredencoding() and some further tweaks.
 222     """
 223     try:
 224         pref = locale.getpreferredencoding()
 225         u'TEST'.encode(pref)
 226     except:
 227         pref = 'UTF-8'
 228
 229     return pref
 230
 231 if sys.version_info < (3,0):
 232     def compat_print(s):
 233         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 234 else:
 235     def compat_print(s):
 236         assert type(s) == type(u'')
 237         print(s)
 238
 239
 240 def write_json_file(obj, fn):
 241     """ Encode obj as JSON and write it to fn, atomically """
 242
 243     args = {
 244         'suffix': '.tmp',
 245         'prefix': os.path.basename(fn) + '.',
 246         'dir': os.path.dirname(fn),
 247         'delete': False,
 248     }
 249
 250     # In Python 2.x, json.dump expects a bytestream.
 251     # In Python 3.x, it writes to a character stream
 252     if sys.version_info < (3, 0):
 253         args['mode'] = 'wb'
 254     else:
 255         args.update({
 256             'mode': 'w',
 257             'encoding': 'utf-8',
 258         })
 259
 260     tf = tempfile.NamedTemporaryFile(**args)
 261
 262     try:
 263         with tf:
 264             json.dump(obj, tf)
 265         os.rename(tf.name, fn)
 266     except:
 267         try:
 268             os.remove(tf.name)
 269         except OSError:
 270             pass
 271         raise
 272
 273
 274 if sys.version_info >= (2, 7):
 275     def find_xpath_attr(node, xpath, key, val):
 276         """ Find the xpath xpath[@key=val] """
 277         assert re.match(r'^[a-zA-Z-]+$', key)
 278         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 279         expr = xpath + u"[@%s='%s']" % (key, val)
 280         return node.find(expr)
 281 else:
 282     def find_xpath_attr(node, xpath, key, val):
 283         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 284         # .//node does not match if a node is a direct child of . !
 285         if isinstance(xpath, unicode):
 286             xpath = xpath.encode('ascii')
 287
 288         for f in node.findall(xpath):
 289             if f.attrib.get(key) == val:
 290                 return f
 291         return None
 292
 293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 294 # the namespace parameter
 295 def xpath_with_ns(path, ns_map):
 296     components = [c.split(':') for c in path.split('/')]
 297     replaced = []
 298     for c in components:
 299         if len(c) == 1:
 300             replaced.append(c[0])
 301         else:
 302             ns, tag = c
 303             replaced.append('{%s}%s' % (ns_map[ns], tag))
 304     return '/'.join(replaced)
 305
 306
# Replace the module-level `locatestarttagend` pattern of the HTML parser
# module with the regular expression below (marked upstream as a backported
# bugfix).  NOTE(review): presumably only parser versions that actually read
# this module attribute are affected - confirm against the targeted Pythons.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 308 class BaseHTMLParser(compat_html_parser.HTMLParser):
 309     def __init(self):
 310         compat_html_parser.HTMLParser.__init__(self)
 311         self.html = None
 312
 313     def loads(self, html):
 314         self.html = html
 315         self.feed(html)
 316         self.close()
 317
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    While parsing, self.result grows into [tag, start_pos, end_pos]: the name
    of a tag whose `attribute` equals `value`, the getpos() value recorded at
    the first parser event after that opening tag, and the getpos() value
    recorded once the nesting count of that tag name drops back to zero.
    get_result() slices the stored document between those two positions.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None
        # True between the matching opening tag and its closing tag.
        self.started = False
        # Per tag name: number of currently open tags seen since the match.
        self.depth = {}
        # True until the position right after the matching opening tag has
        # been recorded.
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Skip past a parse error, or raise HTMLParseError.

        Raises once more than 10 errors have been skipped or when the target
        element is currently being captured; otherwise parsing resumes with
        the remainder of the document.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested tag is the first event after the opening tag: record
            # the start position now if that has not happened yet.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete once every tag with the same name as
            # the matched one has been closed again.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event records the start position in the same way.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions.

        Returns None when no matching tag was found or when either position
        is missing (self.result does not have exactly three items).
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are (line, column) pairs; the line number is used here as
        # a 1-based index into the document lines.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line: the end column has to be
            # made relative to the already shortened line.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On Python older than 2.7.3, AttrParser.parse_endtag is overridden so that
# the literal text "</scr'+'ipt>" is stepped over as a whole (the index just
# past it is returned); every other position is delegated to the stock
# HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 383
 384 def get_element_by_id(id, html):
 385     """Return the content of the tag with the specified ID in the passed HTML document"""
 386     return get_element_by_attribute("id", id, html)
 387
 388 def get_element_by_attribute(attribute, value, html):
 389     """Return the content of the tag with the specified attribute in the passed HTML document"""
 390     parser = AttrParser(attribute, value)
 391     try:
 392         parser.loads(html)
 393     except compat_html_parser.HTMLParseError:
 394         pass
 395     return parser.get_result()
 396
 397 class MetaParser(BaseHTMLParser):
 398     """
 399     Modified HTMLParser that isolates a meta tag with the specified name
 400     attribute.
 401     """
 402     def __init__(self, name):
 403         BaseHTMLParser.__init__(self)
 404         self.name = name
 405         self.content = None
 406         self.result = None
 407
 408     def handle_starttag(self, tag, attrs):
 409         if tag != 'meta':
 410             return
 411         attrs = dict(attrs)
 412         if attrs.get('name') == self.name:
 413             self.result = attrs.get('content')
 414
 415     def get_result(self):
 416         return self.result
 417
 418 def get_meta_content(name, html):
 419     """
 420     Return the content attribute from the meta tag with the given name attribute.
 421     """
 422     parser = MetaParser(name)
 423     try:
 424         parser.loads(html)
 425     except compat_html_parser.HTMLParseError:
 426         pass
 427     return parser.get_result()
 428
 429
 430 def clean_html(html):
 431     """Clean an HTML snippet into a readable string"""
 432     # Newline vs <br />
 433     html = html.replace('\n', ' ')
 434     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 435     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 436     # Strip html tags
 437     html = re.sub('<.*?>', '', html)
 438     # Replace html entities
 439     html = unescapeHTML(html)
 440     return html.strip()
 441
 442
 443 def sanitize_open(filename, open_mode):
 444     """Try to open the given filename, and slightly tweak it if this fails.
 445
 446     Attempts to open the given filename. If this fails, it tries to change
 447     the filename slightly, step by step, until it's either able to open it
 448     or it fails and raises a final exception, like the standard open()
 449     function.
 450
 451     It returns the tuple (stream, definitive_file_name).
 452     """
 453     try:
 454         if filename == u'-':
 455             if sys.platform == 'win32':
 456                 import msvcrt
 457                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 458             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 459         stream = open(encodeFilename(filename), open_mode)
 460         return (stream, filename)
 461     except (IOError, OSError) as err:
 462         if err.errno in (errno.EACCES,):
 463             raise
 464
 465         # In case of error, try to remove win32 forbidden chars
 466         alt_filename = os.path.join(
 467                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 468                         for path_part in os.path.split(filename)
 469                        )
 470         if alt_filename == filename:
 471             raise
 472         else:
 473             # An exception here should be caught in the caller
 474             stream = open(encodeFilename(filename), open_mode)
 475             return (stream, alt_filename)
 476
 477
 478 def timeconvert(timestr):
 479     """Convert RFC 2822 defined time string into system timestamp"""
 480     timestamp = None
 481     timetuple = email.utils.parsedate_tz(timestr)
 482     if timetuple is not None:
 483         timestamp = email.utils.mktime_tz(timetuple)
 484     return timestamp
 485
 486 def sanitize_filename(s, restricted=False, is_id=False):
 487     """Sanitizes a string so it could be used as part of a filename.
 488     If restricted is set, use a stricter subset of allowed characters.
 489     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 490     """
 491     def replace_insane(char):
 492         if char == '?' or ord(char) < 32 or ord(char) == 127:
 493             return ''
 494         elif char == '"':
 495             return '' if restricted else '\''
 496         elif char == ':':
 497             return '_-' if restricted else ' -'
 498         elif char in '\\/|*<>':
 499             return '_'
 500         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 501             return '_'
 502         if restricted and ord(char) > 127:
 503             return '_'
 504         return char
 505
 506     result = u''.join(map(replace_insane, s))
 507     if not is_id:
 508         while '__' in result:
 509             result = result.replace('__', '_')
 510         result = result.strip('_')
 511         # Common case of "Foreign band name - English song title"
 512         if restricted and result.startswith('-_'):
 513             result = result[2:]
 514         if not result:
 515             result = '_'
 516     return result
 517
 518 def orderedSet(iterable):
 519     """ Remove all duplicates from the input iterable """
 520     res = []
 521     for el in iterable:
 522         if el not in res:
 523             res.append(el)
 524     return res
 525
 526
 527 def _htmlentity_transform(entity):
 528     """Transforms an HTML entity to a character."""
 529     # Known non-numeric HTML entity
 530     if entity in compat_html_entities.name2codepoint:
 531         return compat_chr(compat_html_entities.name2codepoint[entity])
 532
 533     mobj = re.match(r'#(x?[0-9]+)', entity)
 534     if mobj is not None:
 535         numstr = mobj.group(1)
 536         if numstr.startswith(u'x'):
 537             base = 16
 538             numstr = u'0%s' % numstr
 539         else:
 540             base = 10
 541         return compat_chr(int(numstr, base))
 542
 543     # Unknown entity in name, return its literal representation
 544     return (u'&%s;' % entity)
 545
 546
 547 def unescapeHTML(s):
 548     if s is None:
 549         return None
 550     assert type(s) == compat_str
 551
 552     return re.sub(
 553         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 554
 555
 556 def encodeFilename(s, for_subprocess=False):
 557     """
 558     @param s The name of the file
 559     """
 560
 561     assert type(s) == compat_str
 562
 563     # Python 3 has a Unicode API
 564     if sys.version_info >= (3, 0):
 565         return s
 566
 567     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 568         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 569         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 570         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 571         if not for_subprocess:
 572             return s
 573         else:
 574             # For subprocess calls, encode with locale encoding
 575             # Refer to http://stackoverflow.com/a/9951851/35070
 576             encoding = preferredencoding()
 577     else:
 578         encoding = sys.getfilesystemencoding()
 579     if encoding is None:
 580         encoding = 'utf-8'
 581     return s.encode(encoding, 'ignore')
 582
 583
 584 def encodeArgument(s):
 585     if not isinstance(s, compat_str):
 586         # Legacy code that uses byte strings
 587         # Uncomment the following line after fixing all post processors
 588         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 589         s = s.decode('ascii')
 590     return encodeFilename(s, True)
 591
 592
 593 def decodeOption(optval):
 594     if optval is None:
 595         return optval
 596     if isinstance(optval, bytes):
 597         optval = optval.decode(preferredencoding())
 598
 599     assert isinstance(optval, compat_str)
 600     return optval
 601
 602 def formatSeconds(secs):
 603     if secs > 3600:
 604         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 605     elif secs > 60:
 606         return '%d:%02d' % (secs // 60, secs % 60)
 607     else:
 608         return '%d' % secs
 609
 610
 611 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 612     if sys.version_info < (3, 2):
 613         import httplib
 614
 615         class HTTPSConnectionV3(httplib.HTTPSConnection):
 616             def __init__(self, *args, **kwargs):
 617                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 618
 619             def connect(self):
 620                 sock = socket.create_connection((self.host, self.port), self.timeout)
 621                 if getattr(self, '_tunnel_host', False):
 622                     self.sock = sock
 623                     self._tunnel()
 624                 try:
 625                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 626                 except ssl.SSLError:
 627                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 628
 629         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 630             def https_open(self, req):
 631                 return self.do_open(HTTPSConnectionV3, req)
 632         return HTTPSHandlerV3(**kwargs)
 633     elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
 634         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 635         context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
 636         if opts_no_check_certificate:
 637             context.verify_mode = ssl.CERT_NONE
 638         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 639     else:  # Python < 3.4
 640         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 641         context.verify_mode = (ssl.CERT_NONE
 642                                if opts_no_check_certificate
 643                                else ssl.CERT_REQUIRED)
 644         context.set_default_verify_paths()
 645         try:
 646             context.load_default_certs()
 647         except AttributeError:
 648             pass  # Python < 3.4
 649         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 650
 651 class ExtractorError(Exception):
 652     """Error during info extraction."""
 653     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 654         """ tb, if given, is the original traceback (so that it can be printed out).
 655         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 656         """
 657
 658         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 659             expected = True
 660         if video_id is not None:
 661             msg = video_id + ': ' + msg
 662         if not expected:
 663             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 664         super(ExtractorError, self).__init__(msg)
 665
 666         self.traceback = tb
 667         self.exc_info = sys.exc_info()  # preserve original exception
 668         self.cause = cause
 669         self.video_id = video_id
 670
 671     def format_traceback(self):
 672         if self.traceback is None:
 673             return None
 674         return u''.join(traceback.format_tb(self.traceback))
 675
 676
 677 class RegexNotFoundError(ExtractorError):
 678     """Error when a regex didn't match"""
 679     pass
 680
 681
 682 class DownloadError(Exception):
 683     """Download Error exception.
 684
 685     This exception may be thrown by FileDownloader objects if they are not
 686     configured to continue on errors. They will contain the appropriate
 687     error message.
 688     """
 689     def __init__(self, msg, exc_info=None):
 690         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 691         super(DownloadError, self).__init__(msg)
 692         self.exc_info = exc_info
 693
 694
 695 class SameFileError(Exception):
 696     """Same File exception.
 697
 698     This exception will be thrown by FileDownloader objects if they detect
 699     multiple files would have to be downloaded to the same file on disk.
 700     """
 701     pass
 702
 703
 704 class PostProcessingError(Exception):
 705     """Post Processing exception.
 706
 707     This exception may be raised by PostProcessor's .run() method to
 708     indicate an error in the postprocessing task.
 709     """
 710     def __init__(self, msg):
 711         self.msg = msg
 712
 713 class MaxDownloadsReached(Exception):
 714     """ --max-downloads limit has been reached. """
 715     pass
 716
 717
 718 class UnavailableVideoError(Exception):
 719     """Unavailable Format exception.
 720
 721     This exception will be thrown when a video is requested
 722     in a format that is not available for that video.
 723     """
 724     pass
 725
 726
 727 class ContentTooShortError(Exception):
 728     """Content Too Short exception.
 729
 730     This exception may be raised by FileDownloader objects when a file they
 731     download is too small for what the server announced first, indicating
 732     the connection was probably interrupted.
 733     """
 734     # Both in bytes
 735     downloaded = None
 736     expected = None
 737
 738     def __init__(self, downloaded, expected):
 739         self.downloaded = downloaded
 740         self.expected = expected
 741
 742 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 743     """Handler for HTTP requests and responses.
 744
 745     This class, when installed with an OpenerDirector, automatically adds
 746     the standard headers to every HTTP request and handles gzipped and
 747     deflated responses from web servers. If compression is to be avoided in
 748     a particular request, the original request in the program code only has
 749     to include the HTTP header "Youtubedl-No-Compression", which will be
 750     removed before making the real request.
 751
 752     Part of this code was copied from:
 753
 754     http://techknack.net/python-urllib2-handlers/
 755
 756     Andrew Rowls, the author of that code, agreed to release it to the
 757     public domain.
 758     """
 759
 760     @staticmethod
 761     def deflate(data):
 762         try:
 763             return zlib.decompress(data, -zlib.MAX_WBITS)
 764         except zlib.error:
 765             return zlib.decompress(data)
 766
 767     @staticmethod
 768     def addinfourl_wrapper(stream, headers, url, code):
 769         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 770             return compat_urllib_request.addinfourl(stream, headers, url, code)
 771         ret = compat_urllib_request.addinfourl(stream, headers, url)
 772         ret.code = code
 773         return ret
 774
 775     def http_request(self, req):
 776         for h, v in std_headers.items():
 777             if h not in req.headers:
 778                 req.add_header(h, v)
 779         if 'Youtubedl-no-compression' in req.headers:
 780             if 'Accept-encoding' in req.headers:
 781                 del req.headers['Accept-encoding']
 782             del req.headers['Youtubedl-no-compression']
 783         if 'Youtubedl-user-agent' in req.headers:
 784             if 'User-agent' in req.headers:
 785                 del req.headers['User-agent']
 786             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 787             del req.headers['Youtubedl-user-agent']
 788         return req
 789
 790     def http_response(self, req, resp):
 791         old_resp = resp
 792         # gzip
 793         if resp.headers.get('Content-encoding', '') == 'gzip':
 794             content = resp.read()
 795             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 796             try:
 797                 uncompressed = io.BytesIO(gz.read())
 798             except IOError as original_ioerror:
 799                 # There may be junk add the end of the file
 800                 # See http://stackoverflow.com/q/4928560/35070 for details
 801                 for i in range(1, 1024):
 802                     try:
 803                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 804                         uncompressed = io.BytesIO(gz.read())
 805                     except IOError:
 806                         continue
 807                     break
 808                 else:
 809                     raise original_ioerror
 810             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 811             resp.msg = old_resp.msg
 812         # deflate
 813         if resp.headers.get('Content-encoding', '') == 'deflate':
 814             gz = io.BytesIO(self.deflate(resp.read()))
 815             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 816             resp.msg = old_resp.msg
 817         return resp
 818
 819     https_request = http_request
 820     https_response = http_response
 821
 822
 823 def parse_iso8601(date_str, delimiter='T'):
 824     """ Return a UNIX timestamp from the given date """
 825
 826     if date_str is None:
 827         return None
 828
 829     m = re.search(
 830         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
 831         date_str)
 832     if not m:
 833         timezone = datetime.timedelta()
 834     else:
 835         date_str = date_str[:-len(m.group(0))]
 836         if not m.group('sign'):
 837             timezone = datetime.timedelta()
 838         else:
 839             sign = 1 if m.group('sign') == '+' else -1
 840             timezone = datetime.timedelta(
 841                 hours=sign * int(m.group('hours')),
 842                 minutes=sign * int(m.group('minutes')))
 843     date_format =  '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 844     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 845     return calendar.timegm(dt.timetuple())
 846
 847
 848 def unified_strdate(date_str):
 849     """Return a string with the date in the format YYYYMMDD"""
 850
 851     if date_str is None:
 852         return None
 853
 854     upload_date = None
 855     #Replace commas
 856     date_str = date_str.replace(',', ' ')
 857     # %z (UTC offset) is only supported in python>=3.2
 858     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 859     format_expressions = [
 860         '%d %B %Y',
 861         '%d %b %Y',
 862         '%B %d %Y',
 863         '%b %d %Y',
 864         '%b %dst %Y %I:%M%p',
 865         '%b %dnd %Y %I:%M%p',
 866         '%b %dth %Y %I:%M%p',
 867         '%Y-%m-%d',
 868         '%Y/%m/%d',
 869         '%d.%m.%Y',
 870         '%d/%m/%Y',
 871         '%d/%m/%y',
 872         '%Y/%m/%d %H:%M:%S',
 873         '%Y-%m-%d %H:%M:%S',
 874         '%d.%m.%Y %H:%M',
 875         '%d.%m.%Y %H.%M',
 876         '%Y-%m-%dT%H:%M:%SZ',
 877         '%Y-%m-%dT%H:%M:%S.%fZ',
 878         '%Y-%m-%dT%H:%M:%S.%f0Z',
 879         '%Y-%m-%dT%H:%M:%S',
 880         '%Y-%m-%dT%H:%M:%S.%f',
 881         '%Y-%m-%dT%H:%M',
 882     ]
 883     for expression in format_expressions:
 884         try:
 885             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 886         except ValueError:
 887             pass
 888     if upload_date is None:
 889         timetuple = email.utils.parsedate_tz(date_str)
 890         if timetuple:
 891             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 892     return upload_date
 893
 894 def determine_ext(url, default_ext=u'unknown_video'):
 895     if url is None:
 896         return default_ext
 897     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 898     if re.match(r'^[A-Za-z0-9]+$', guess):
 899         return guess
 900     else:
 901         return default_ext
 902
 903 def subtitles_filename(filename, sub_lang, sub_format):
 904     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 905
 906 def date_from_str(date_str):
 907     """
 908     Return a datetime object from a string in the format YYYYMMDD or
 909     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 910     today = datetime.date.today()
 911     if date_str == 'now'or date_str == 'today':
 912         return today
 913     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 914     if match is not None:
 915         sign = match.group('sign')
 916         time = int(match.group('time'))
 917         if sign == '-':
 918             time = -time
 919         unit = match.group('unit')
 920         #A bad aproximation?
 921         if unit == 'month':
 922             unit = 'day'
 923             time *= 30
 924         elif unit == 'year':
 925             unit = 'day'
 926             time *= 365
 927         unit += 's'
 928         delta = datetime.timedelta(**{unit: time})
 929         return today + delta
 930     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 931
 932 def hyphenate_date(date_str):
 933     """
 934     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 935     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 936     if match is not None:
 937         return '-'.join(match.groups())
 938     else:
 939         return date_str
 940
 941 class DateRange(object):
 942     """Represents a time interval between two dates"""
 943     def __init__(self, start=None, end=None):
 944         """start and end must be strings in the format accepted by date"""
 945         if start is not None:
 946             self.start = date_from_str(start)
 947         else:
 948             self.start = datetime.datetime.min.date()
 949         if end is not None:
 950             self.end = date_from_str(end)
 951         else:
 952             self.end = datetime.datetime.max.date()
 953         if self.start > self.end:
 954             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 955     @classmethod
 956     def day(cls, day):
 957         """Returns a range that only contains the given day"""
 958         return cls(day,day)
 959     def __contains__(self, date):
 960         """Check if the date is in the range"""
 961         if not isinstance(date, datetime.date):
 962             date = date_from_str(date)
 963         return self.start <= date <= self.end
 964     def __str__(self):
 965         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 966
 967
 968 def platform_name():
 969     """ Returns the platform name as a compat_str """
 970     res = platform.platform()
 971     if isinstance(res, bytes):
 972         res = res.decode(preferredencoding())
 973
 974     assert isinstance(res, compat_str)
 975     return res
 976
 977
def _windows_write_string(s, out):
    """ Write s to a Windows console via the WriteConsoleW API.

    Returns True if the string was written using special methods,
    False if it has yet to be written out (out has no fileno(), its
    descriptor is neither 1 nor 2, or the matching handle is not a console).
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 and -12 are the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE values)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Build typed prototypes for the kernel32 functions used below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of units written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, for handles whose file type (ignoring the
        # REMOTE bit) is not a character device, and for handles on which
        # GetConsoleMode fails.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, stopping each chunk right
    # before the next non-BMP character.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; request 2 units
        # for it (presumably its UTF-16 surrogate pair - confirm on Windows).
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Resume after whatever part of the chunk was actually written
            s = s[written.value:]
    return True
1048
1049
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no explicit encoding is requested, the console API is
    tried first. Otherwise the text is encoded by hand for binary streams
    (and on Python 2), written to out.buffer when the stream has one, or
    passed to out.write unchanged. Unencodable characters are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if try_console and _windows_write_string(s, out):
        return

    wants_bytes = (
        'b' in getattr(out, 'mode', '') or
        sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1070
1071
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 and 1-char strings on Python 2
    if not isinstance(bs[0], int):
        return [ord(ch) for ch in bs]
    return list(bs)
1079
1080
def intlist_to_bytes(xs):
    """Convert a list of integer byte values into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already produces single bytes
    return ''.join([chr(x) for x in xs])
1088
1089
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the running
# platform: LockFileEx / UnlockFileEx through ctypes on Windows, fcntl.flock
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the structure handed to LockFileEx / UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low / high halves of the byte count passed to both calls; together with
    # the zero offset set below they are meant to span the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f, exclusively if requested; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the same
        # structure again.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is the Win32 LOCKFILE_EXCLUSIVE_LOCK flag
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on f; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock on the open file f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1153
1154
class locked_file(object):
    """Context manager around io.open that holds a file lock while active.

    The lock is shared for mode 'r' and exclusive for 'a' and 'w'. The file
    is opened in the constructor and closed when the block is left.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            # Anything but plain reading needs the lock for itself
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the handle if the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1184
1185
def shell_quote(args):
    """Return args as one shell-escaped, space-separated command line string.

    args is an iterable of text or byte strings; byte strings are decoded
    with the filesystem encoding (UTF-8 if that is unknown) before quoting.
    """
    # pipes.quote is gone as of Python 3.13; shlex.quote is the same function
    # on every Python 3 that has it. Python 2 only offers pipes.quote.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1197
1198
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but the first element failing pred is
        yielded as well before the iteration stops """
    for item in seq:
        yield item
        if pred(item):
            continue
        return
1206
1207
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use: data is serialized
    to JSON and appended to url as a '#__youtubedl_smuggle=...' fragment. """

    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return u'#'.join((url, fragment))
1214
1215
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    URLs without a '#__youtubedl_smuggle' marker are returned untouched,
    paired with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    payload = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1223
1224
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    bytes may be a number, a numeric string or None (rendered as 'N/A').
    The unit is the largest power of 1024 not exceeding the value, limited
    to the range B .. YiB. Negative values raise ValueError (from math.log).
    """
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: tiny fractions would
        # otherwise wrap around to 'YiB' and huge values raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1237
1238
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable takes precedence (a non-numeric value
    raises ValueError); otherwise the output of `stty size` is consulted.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except (OSError, IndexError, ValueError):
        # stty is missing, printed nothing usable (e.g. no tty attached) or
        # printed something that is not a number. A bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        return None
1253
1254
def month_by_name(name):
    """ Return the 1-based number of the month with the given English name
    (independent of the locale), or None for unknown names """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1265
1266
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    Ampersands that start one of the five predefined entities or a decimal /
    hexadecimal character reference of any length are left alone.
    """
    # Character references are not limited to four digits (e.g. &#128512;),
    # and a reference without any digit is not a reference at all.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        u'&amp;',
        xml_str)
1273
1274
def setproctitle(title):
    """Best-effort attempt to set the process name through libc's prctl.

    title must be a compat_str. Silently does nothing when libc.so.6 cannot
    be loaded or does not expose prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte larger than
    # the title, so it ends in NUL. A buffer of exactly len(title_bytes)
    # carries no terminator and lets prctl read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME in Linux's <sys/prctl.h>
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1288
1289
def remove_start(s, start):
    """Return s without the prefix start; s is returned as is if it does
    not begin with start."""
    return s[len(start):] if s.startswith(start) else s
1294
1295
def remove_end(s, end):
    """Return s without the suffix end; s is returned as is if it does not
    finish with end."""
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1300
1301
def url_basename(url):
    """Return the last segment of the path component of url."""
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
1305
1306
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose get_method always reports HEAD."""
    def get_method(self):
        return "HEAD"
1310
1311
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int scaled by invscale // scale.

    If get_attr is given, the attribute of that name is read from v first.
    None and the empty string yield default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1319
1320
def str_or_none(v, default=None):
    """Convert v to a compat_str, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1323
1324
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are removed before the conversion. None stays None. """
    if int_str is None:
        return None
    return int(re.sub(r'[,\.\+]', u'', int_str))
1331
1332
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale / scale, or return default
    when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1335
1336
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3h11m53s' or '62 min 45.5 s'.

    Returns the duration in seconds (a float if a fractional part was
    given, an int otherwise), or None if s is None or not understood.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    hours, mins, secs, fraction = m.group('hours', 'mins', 'secs', 'ms')
    total = int(secs)
    if mins:
        total += int(mins) * 60
        # The pattern only allows hours in front of a minutes field
        if hours:
            total += int(hours) * 60 * 60
    if fraction:
        total += float(fraction)
    return total
1355
1356
def prepend_extension(filename, ext):
    """Insert ext in front of the real extension of filename
    ('a.mp4', 'temp' -> 'a.temp.mp4')."""
    pieces = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(pieces[0], ext, pieces[1])
1360
1361
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).

    Returns False if the binary cannot be launched. """
    # None instead of a shared mutable default; list() lets callers pass any
    # iterable of arguments, not just a list.
    cmd = [exe] + list(args or ())
    try:
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1370
1371
class PagedList(object):
    """List-like view over results that are fetched one page at a time.

    pagefunc is called with a zero-based page number and must return an
    iterable with the entries of that page; pagesize is the number of
    entries on a full page.
    """
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        With end left as None, pages are fetched until a page that is not
        full is encountered.
        """
        res = []
        # Begin with the page that contains index start
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of the first entry of this and of the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start inside this page (0 unless start lies in it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of end inside this page, None if end lies beyond it
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1417
1418
def uppercase_escape(s):
    """Replace literal '\\UXXXXXXXX' escape sequences in s by the characters
    they denote."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, consumed length) pair
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1425
# struct_pack / struct_unpack accept text format strings on every Python
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(byte_spec, *args)

    def struct_unpack(spec, *args):
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(byte_spec, *args)
else:
    # The struct module copes with text specs, use it directly
    struct_pack, struct_unpack = struct.pack, struct.unpack
1442
1443
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file.

    batch_fd is an iterable of lines (text or UTF-8 bytes) and is closed
    when done. Returns the stripped, non-empty lines that do not start with
    '#', ';' or ']'. A byte order mark in front of a line is dropped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM shows up either as U+FEFF (data decoded as UTF-8) or
        # as the three characters below (data decoded as Latin-1).
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1458
1459
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode and return ASCII bytes
    suitable as a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1462
1463
# Element.iter does not exist on Python <=2.6; fall back to a search for
# all descendants there.
etree_iter = getattr(
    xml.etree.ElementTree.Element, 'iter',
    lambda n: n.findall('.//*'))
1468
1469
def parse_xml(s):
    """Parse the text s as XML and return the root element.

    Doctype declarations are ignored. On Python 2, byte-string element
    texts are decoded from UTF-8.
    """
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    xml_parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    parse_kwargs = {}
    if sys.version_info >= (2, 7):
        parse_kwargs['parser'] = xml_parser
    root = xml.etree.ElementTree.XML(s.encode('utf-8'), **parse_kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return root
1485
1486
# getpass on Python 2 for Windows wants a byte-string prompt; everywhere
# else the stock function is used as is.
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1494
1495
# Numeric value associated with each US content rating label
# (presumably the minimum viewer age - confirm against the callers).
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1503
1504
def strip_jsonp(code):
    """Strip a JSONP wrapper ('callback(...);') and return the payload;
    text that is not wrapped this way is returned unchanged."""
    wrapper_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(wrapper_re, r'\1', code)
1507
1508
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Keys that are bare or single-quoted and single-quoted string values get
    double quotes, and a comma directly in front of a closing ']' is
    dropped. Single-quoted tokens containing a double quote trip an
    assertion.
    """
    def requote_pair(match):
        prefix, key, separator, value = match.groups()

        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            # Bare identifier
            key = '"%s"' % key

        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return prefix + key + separator + value

    converted = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', requote_pair, code)
    # JSON does not allow a trailing comma at the end of an array
    return re.sub(r',(\s*\])', r'\1', converted)
1535
1536
def qualities(quality_ids):
    """ Build a function mapping a quality id to its position in
    quality_ids (-1 for ids that are not listed) """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1545
1546
# Default output filename template; the %(name)s placeholders are filled in
# by name (presumably from a video's info dict - confirm against the users).
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1548
# subprocess.check_output, with a replacement for Pythons that lack it
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return what it wrote to stdout.

        Takes the arguments of subprocess.Popen (stdout excluded). Raises
        subprocess.CalledProcessError, with the captured output attached,
        if the command exits with a non-zero status.
        """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # This fallback only runs on old interpreters, where Popen has
            # no 'args' attribute and CalledProcessError takes no 'output'
            # argument, so recover both by hand.
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output