_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import calendar
   5 import codecs
   6 import contextlib
   7 import ctypes
   8 import datetime
   9 import email.utils
  10 import errno
  11 import getpass
  12 import gzip
  13 import itertools
  14 import io
  15 import json
  16 import locale
  17 import math
  18 import os
  19 import pipes
  20 import platform
  21 import re
  22 import ssl
  23 import socket
  24 import struct
  25 import subprocess
  26 import sys
  27 import tempfile
  28 import traceback
  29 import xml.etree.ElementTree
  30 import zlib
  31
  32 try:
  33     import urllib.request as compat_urllib_request
  34 except ImportError: # Python 2
  35     import urllib2 as compat_urllib_request
  36
  37 try:
  38     import urllib.error as compat_urllib_error
  39 except ImportError: # Python 2
  40     import urllib2 as compat_urllib_error
  41
  42 try:
  43     import urllib.parse as compat_urllib_parse
  44 except ImportError: # Python 2
  45     import urllib as compat_urllib_parse
  46
  47 try:
  48     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  49 except ImportError: # Python 2
  50     from urlparse import urlparse as compat_urllib_parse_urlparse
  51
  52 try:
  53     import urllib.parse as compat_urlparse
  54 except ImportError: # Python 2
  55     import urlparse as compat_urlparse
  56
  57 try:
  58     import http.cookiejar as compat_cookiejar
  59 except ImportError: # Python 2
  60     import cookielib as compat_cookiejar
  61
  62 try:
  63     import html.entities as compat_html_entities
  64 except ImportError: # Python 2
  65     import htmlentitydefs as compat_html_entities
  66
  67 try:
  68     import html.parser as compat_html_parser
  69 except ImportError: # Python 2
  70     import HTMLParser as compat_html_parser
  71
  72 try:
  73     import http.client as compat_http_client
  74 except ImportError: # Python 2
  75     import httplib as compat_http_client
  76
  77 try:
  78     from urllib.error import HTTPError as compat_HTTPError
  79 except ImportError:  # Python 2
  80     from urllib2 import HTTPError as compat_HTTPError
  81
  82 try:
  83     from urllib.request import urlretrieve as compat_urlretrieve
  84 except ImportError:  # Python 2
  85     from urllib import urlretrieve as compat_urlretrieve
  86
  87
  88 try:
  89     from subprocess import DEVNULL
  90     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  91 except ImportError:
  92     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  93
  94 try:
  95     from urllib.parse import unquote as compat_urllib_parse_unquote
  96 except ImportError:
  97     def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
  98         if string == '':
  99             return string
 100         res = string.split('%')
 101         if len(res) == 1:
 102             return string
 103         if encoding is None:
 104             encoding = 'utf-8'
 105         if errors is None:
 106             errors = 'replace'
 107         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
 108         pct_sequence = b''
 109         string = res[0]
 110         for item in res[1:]:
 111             try:
 112                 if not item:
 113                     raise ValueError
 114                 pct_sequence += item[:2].decode('hex')
 115                 rest = item[2:]
 116                 if not rest:
 117                     # This segment was just a single percent-encoded character.
 118                     # May be part of a sequence of code units, so delay decoding.
 119                     # (Stored in pct_sequence).
 120                     continue
 121             except ValueError:
 122                 rest = '%' + item
 123             # Encountered non-percent-encoded characters. Flush the current
 124             # pct_sequence.
 125             string += pct_sequence.decode(encoding, errors) + rest
 126             pct_sequence = b''
 127         if pct_sequence:
 128             # Flush the final pct_sequence
 129             string += pct_sequence.decode(encoding, errors)
 130         return string
 131
 132
 133 try:
 134     from urllib.parse import parse_qs as compat_parse_qs
 135 except ImportError: # Python 2
 136     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
 137     # Python 2's version is apparently totally broken
 138
 139     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 140                 encoding='utf-8', errors='replace'):
 141         qs, _coerce_result = qs, unicode
 142         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 143         r = []
 144         for name_value in pairs:
 145             if not name_value and not strict_parsing:
 146                 continue
 147             nv = name_value.split('=', 1)
 148             if len(nv) != 2:
 149                 if strict_parsing:
 150                     raise ValueError("bad query field: %r" % (name_value,))
 151                 # Handle case of a control-name with no equal sign
 152                 if keep_blank_values:
 153                     nv.append('')
 154                 else:
 155                     continue
 156             if len(nv[1]) or keep_blank_values:
 157                 name = nv[0].replace('+', ' ')
 158                 name = compat_urllib_parse_unquote(
 159                     name, encoding=encoding, errors=errors)
 160                 name = _coerce_result(name)
 161                 value = nv[1].replace('+', ' ')
 162                 value = compat_urllib_parse_unquote(
 163                     value, encoding=encoding, errors=errors)
 164                 value = _coerce_result(value)
 165                 r.append((name, value))
 166         return r
 167
 168     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 169                 encoding='utf-8', errors='replace'):
 170         parsed_result = {}
 171         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 172                         encoding=encoding, errors=errors)
 173         for name, value in pairs:
 174             if name in parsed_result:
 175                 parsed_result[name].append(value)
 176             else:
 177                 parsed_result[name] = [value]
 178         return parsed_result
 179
 180 try:
 181     compat_str = unicode # Python 2
 182 except NameError:
 183     compat_str = str
 184
 185 try:
 186     compat_chr = unichr # Python 2
 187 except NameError:
 188     compat_chr = chr
 189
 190 try:
 191     from xml.etree.ElementTree import ParseError as compat_xml_parse_error
 192 except ImportError:  # Python 2.6
 193     from xml.parsers.expat import ExpatError as compat_xml_parse_error
 194
 195 try:
 196     from shlex import quote as shlex_quote
 197 except ImportError:  # Python < 3.3
 198     def shlex_quote(s):
 199         return "'" + s.replace("'", "'\"'\"'") + "'"
 200
 201
 202 def compat_ord(c):
 203     if type(c) is int: return c
 204     else: return ord(c)
 205
# This is not clearly defined otherwise: take the type of compiled pattern
# objects from an actual compiled (empty) pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request() adds each of
# them to outgoing requests.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # gzip and deflate responses are decoded in YoutubeDLHandler.http_response()
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 216
 217 def preferredencoding():
 218     """Get preferred encoding.
 219
 220     Returns the best encoding scheme for the system, based on
 221     locale.getpreferredencoding() and some further tweaks.
 222     """
 223     try:
 224         pref = locale.getpreferredencoding()
 225         u'TEST'.encode(pref)
 226     except:
 227         pref = 'UTF-8'
 228
 229     return pref
 230
 231 if sys.version_info < (3,0):
 232     def compat_print(s):
 233         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 234 else:
 235     def compat_print(s):
 236         assert type(s) == type(u'')
 237         print(s)
 238
 239
 240 def write_json_file(obj, fn):
 241     """ Encode obj as JSON and write it to fn, atomically """
 242
 243     args = {
 244         'suffix': '.tmp',
 245         'prefix': os.path.basename(fn) + '.',
 246         'dir': os.path.dirname(fn),
 247         'delete': False,
 248     }
 249
 250     # In Python 2.x, json.dump expects a bytestream.
 251     # In Python 3.x, it writes to a character stream
 252     if sys.version_info < (3, 0):
 253         args['mode'] = 'wb'
 254     else:
 255         args.update({
 256             'mode': 'w',
 257             'encoding': 'utf-8',
 258         })
 259
 260     tf = tempfile.NamedTemporaryFile(**args)
 261
 262     try:
 263         with tf:
 264             json.dump(obj, tf)
 265         os.rename(tf.name, fn)
 266     except:
 267         try:
 268             os.remove(tf.name)
 269         except OSError:
 270             pass
 271         raise
 272
 273
 274 if sys.version_info >= (2, 7):
 275     def find_xpath_attr(node, xpath, key, val):
 276         """ Find the xpath xpath[@key=val] """
 277         assert re.match(r'^[a-zA-Z-]+$', key)
 278         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 279         expr = xpath + u"[@%s='%s']" % (key, val)
 280         return node.find(expr)
 281 else:
 282     def find_xpath_attr(node, xpath, key, val):
 283         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 284         # .//node does not match if a node is a direct child of . !
 285         if isinstance(xpath, unicode):
 286             xpath = xpath.encode('ascii')
 287
 288         for f in node.findall(xpath):
 289             if f.attrib.get(key) == val:
 290                 return f
 291         return None
 292
 293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 294 # the namespace parameter
 295 def xpath_with_ns(path, ns_map):
 296     components = [c.split(':') for c in path.split('/')]
 297     replaced = []
 298     for c in components:
 299         if len(c) == 1:
 300             replaced.append(c[0])
 301         else:
 302             ns, tag = c
 303             replaced.append('{%s}%s' % (ns_map[ns], tag))
 304     return '/'.join(replaced)
 305
 306
 307 def xpath_text(node, xpath, name=None, fatal=False):
 308     n = node.find(xpath)
 309     if n is None:
 310         if fatal:
 311             name = xpath if name is None else name
 312             raise ExtractorError('Could not find XML element %s' % name)
 313         else:
 314             return None
 315     return n.text
 316
 317
 318 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 319 class BaseHTMLParser(compat_html_parser.HTMLParser):
 320     def __init(self):
 321         compat_html_parser.HTMLParser.__init__(self)
 322         self.html = None
 323
 324     def loads(self, html):
 325         self.html = html
 326         self.feed(html)
 327         self.close()
 328
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    After loads(), get_result() returns the text between the opening tag
    whose attribute equals value and its closing tag, or None.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # None until a matching tag is seen; then grows to
        # [tag, start position, end position] (positions from getpos())
        self.result = None
        # True while inside the matching tag
        self.started = False
        # Number of currently open tags per tag name, counted from the
        # matching tag on
        self.depth = {}
        # True between the matching opening tag and the next parser event
        self.watch_startpos = False
        # Number of parse errors recovered from so far
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by dropping input and resuming.

        Raises HTMLParseError instead once more than 10 errors have been
        recovered from, or when the error occurs inside the matching tag.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        """Start recording at the matching tag and count opened tags."""
        attrs = dict(attrs)
        if self.started:
            # A nested tag right after the matching one also marks the start
            # of the content
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Count closed tags; stop recording once the matching tag's own
        counter is back to zero."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Whatever event follows the matching opening tag records the start
    # position
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None unless both were recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Slice out the lines from the start line to the end line
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        # Drop everything before the start column on the first line
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line, which has already been
            # shifted left by the start column
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, step over the literal "</scr'+'ipt>"
# instead of handing it to the regular end tag parsing.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 394
 395 def get_element_by_id(id, html):
 396     """Return the content of the tag with the specified ID in the passed HTML document"""
 397     return get_element_by_attribute("id", id, html)
 398
 399 def get_element_by_attribute(attribute, value, html):
 400     """Return the content of the tag with the specified attribute in the passed HTML document"""
 401     parser = AttrParser(attribute, value)
 402     try:
 403         parser.loads(html)
 404     except compat_html_parser.HTMLParseError:
 405         pass
 406     return parser.get_result()
 407
 408 class MetaParser(BaseHTMLParser):
 409     """
 410     Modified HTMLParser that isolates a meta tag with the specified name
 411     attribute.
 412     """
 413     def __init__(self, name):
 414         BaseHTMLParser.__init__(self)
 415         self.name = name
 416         self.content = None
 417         self.result = None
 418
 419     def handle_starttag(self, tag, attrs):
 420         if tag != 'meta':
 421             return
 422         attrs = dict(attrs)
 423         if attrs.get('name') == self.name:
 424             self.result = attrs.get('content')
 425
 426     def get_result(self):
 427         return self.result
 428
 429 def get_meta_content(name, html):
 430     """
 431     Return the content attribute from the meta tag with the given name attribute.
 432     """
 433     parser = MetaParser(name)
 434     try:
 435         parser.loads(html)
 436     except compat_html_parser.HTMLParseError:
 437         pass
 438     return parser.get_result()
 439
 440
 441 def clean_html(html):
 442     """Clean an HTML snippet into a readable string"""
 443     # Newline vs <br />
 444     html = html.replace('\n', ' ')
 445     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 446     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 447     # Strip html tags
 448     html = re.sub('<.*?>', '', html)
 449     # Replace html entities
 450     html = unescapeHTML(html)
 451     return html.strip()
 452
 453
 454 def sanitize_open(filename, open_mode):
 455     """Try to open the given filename, and slightly tweak it if this fails.
 456
 457     Attempts to open the given filename. If this fails, it tries to change
 458     the filename slightly, step by step, until it's either able to open it
 459     or it fails and raises a final exception, like the standard open()
 460     function.
 461
 462     It returns the tuple (stream, definitive_file_name).
 463     """
 464     try:
 465         if filename == u'-':
 466             if sys.platform == 'win32':
 467                 import msvcrt
 468                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 469             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 470         stream = open(encodeFilename(filename), open_mode)
 471         return (stream, filename)
 472     except (IOError, OSError) as err:
 473         if err.errno in (errno.EACCES,):
 474             raise
 475
 476         # In case of error, try to remove win32 forbidden chars
 477         alt_filename = os.path.join(
 478                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 479                         for path_part in os.path.split(filename)
 480                        )
 481         if alt_filename == filename:
 482             raise
 483         else:
 484             # An exception here should be caught in the caller
 485             stream = open(encodeFilename(filename), open_mode)
 486             return (stream, alt_filename)
 487
 488
 489 def timeconvert(timestr):
 490     """Convert RFC 2822 defined time string into system timestamp"""
 491     timestamp = None
 492     timetuple = email.utils.parsedate_tz(timestr)
 493     if timetuple is not None:
 494         timestamp = email.utils.mktime_tz(timetuple)
 495     return timestamp
 496
 497 def sanitize_filename(s, restricted=False, is_id=False):
 498     """Sanitizes a string so it could be used as part of a filename.
 499     If restricted is set, use a stricter subset of allowed characters.
 500     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 501     """
 502     def replace_insane(char):
 503         if char == '?' or ord(char) < 32 or ord(char) == 127:
 504             return ''
 505         elif char == '"':
 506             return '' if restricted else '\''
 507         elif char == ':':
 508             return '_-' if restricted else ' -'
 509         elif char in '\\/|*<>':
 510             return '_'
 511         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 512             return '_'
 513         if restricted and ord(char) > 127:
 514             return '_'
 515         return char
 516
 517     result = u''.join(map(replace_insane, s))
 518     if not is_id:
 519         while '__' in result:
 520             result = result.replace('__', '_')
 521         result = result.strip('_')
 522         # Common case of "Foreign band name - English song title"
 523         if restricted and result.startswith('-_'):
 524             result = result[2:]
 525         if not result:
 526             result = '_'
 527     return result
 528
 529 def orderedSet(iterable):
 530     """ Remove all duplicates from the input iterable """
 531     res = []
 532     for el in iterable:
 533         if el not in res:
 534             res.append(el)
 535     return res
 536
 537
 538 def _htmlentity_transform(entity):
 539     """Transforms an HTML entity to a character."""
 540     # Known non-numeric HTML entity
 541     if entity in compat_html_entities.name2codepoint:
 542         return compat_chr(compat_html_entities.name2codepoint[entity])
 543
 544     mobj = re.match(r'#(x?[0-9]+)', entity)
 545     if mobj is not None:
 546         numstr = mobj.group(1)
 547         if numstr.startswith(u'x'):
 548             base = 16
 549             numstr = u'0%s' % numstr
 550         else:
 551             base = 10
 552         return compat_chr(int(numstr, base))
 553
 554     # Unknown entity in name, return its literal representation
 555     return (u'&%s;' % entity)
 556
 557
 558 def unescapeHTML(s):
 559     if s is None:
 560         return None
 561     assert type(s) == compat_str
 562
 563     return re.sub(
 564         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 565
 566
 567 def encodeFilename(s, for_subprocess=False):
 568     """
 569     @param s The name of the file
 570     """
 571
 572     assert type(s) == compat_str
 573
 574     # Python 3 has a Unicode API
 575     if sys.version_info >= (3, 0):
 576         return s
 577
 578     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 579         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 580         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 581         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 582         if not for_subprocess:
 583             return s
 584         else:
 585             # For subprocess calls, encode with locale encoding
 586             # Refer to http://stackoverflow.com/a/9951851/35070
 587             encoding = preferredencoding()
 588     else:
 589         encoding = sys.getfilesystemencoding()
 590     if encoding is None:
 591         encoding = 'utf-8'
 592     return s.encode(encoding, 'ignore')
 593
 594
 595 def encodeArgument(s):
 596     if not isinstance(s, compat_str):
 597         # Legacy code that uses byte strings
 598         # Uncomment the following line after fixing all post processors
 599         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 600         s = s.decode('ascii')
 601     return encodeFilename(s, True)
 602
 603
 604 def decodeOption(optval):
 605     if optval is None:
 606         return optval
 607     if isinstance(optval, bytes):
 608         optval = optval.decode(preferredencoding())
 609
 610     assert isinstance(optval, compat_str)
 611     return optval
 612
 613 def formatSeconds(secs):
 614     if secs > 3600:
 615         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 616     elif secs > 60:
 617         return '%d:%02d' % (secs // 60, secs % 60)
 618     else:
 619         return '%d' % secs
 620
 621
 622 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 623     if sys.version_info < (3, 2):
 624         import httplib
 625
 626         class HTTPSConnectionV3(httplib.HTTPSConnection):
 627             def __init__(self, *args, **kwargs):
 628                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 629
 630             def connect(self):
 631                 sock = socket.create_connection((self.host, self.port), self.timeout)
 632                 if getattr(self, '_tunnel_host', False):
 633                     self.sock = sock
 634                     self._tunnel()
 635                 try:
 636                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 637                 except ssl.SSLError:
 638                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 639
 640         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 641             def https_open(self, req):
 642                 return self.do_open(HTTPSConnectionV3, req)
 643         return HTTPSHandlerV3(**kwargs)
 644     elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
 645         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 646         context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
 647         if opts_no_check_certificate:
 648             context.verify_mode = ssl.CERT_NONE
 649         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 650     else:  # Python < 3.4
 651         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 652         context.verify_mode = (ssl.CERT_NONE
 653                                if opts_no_check_certificate
 654                                else ssl.CERT_REQUIRED)
 655         context.set_default_verify_paths()
 656         try:
 657             context.load_default_certs()
 658         except AttributeError:
 659             pass  # Python < 3.4
 660         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 661
 662 class ExtractorError(Exception):
 663     """Error during info extraction."""
 664     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 665         """ tb, if given, is the original traceback (so that it can be printed out).
 666         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 667         """
 668
 669         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 670             expected = True
 671         if video_id is not None:
 672             msg = video_id + ': ' + msg
 673         if not expected:
 674             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 675         super(ExtractorError, self).__init__(msg)
 676
 677         self.traceback = tb
 678         self.exc_info = sys.exc_info()  # preserve original exception
 679         self.cause = cause
 680         self.video_id = video_id
 681
 682     def format_traceback(self):
 683         if self.traceback is None:
 684             return None
 685         return u''.join(traceback.format_tb(self.traceback))
 686
 687
 688 class RegexNotFoundError(ExtractorError):
 689     """Error when a regex didn't match"""
 690     pass
 691
 692
 693 class DownloadError(Exception):
 694     """Download Error exception.
 695
 696     This exception may be thrown by FileDownloader objects if they are not
 697     configured to continue on errors. They will contain the appropriate
 698     error message.
 699     """
 700     def __init__(self, msg, exc_info=None):
 701         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 702         super(DownloadError, self).__init__(msg)
 703         self.exc_info = exc_info
 704
 705
 706 class SameFileError(Exception):
 707     """Same File exception.
 708
 709     This exception will be thrown by FileDownloader objects if they detect
 710     multiple files would have to be downloaded to the same file on disk.
 711     """
 712     pass
 713
 714
 715 class PostProcessingError(Exception):
 716     """Post Processing exception.
 717
 718     This exception may be raised by PostProcessor's .run() method to
 719     indicate an error in the postprocessing task.
 720     """
 721     def __init__(self, msg):
 722         self.msg = msg
 723
 724 class MaxDownloadsReached(Exception):
 725     """ --max-downloads limit has been reached. """
 726     pass
 727
 728
 729 class UnavailableVideoError(Exception):
 730     """Unavailable Format exception.
 731
 732     This exception will be thrown when a video is requested
 733     in a format that is not available for that video.
 734     """
 735     pass
 736
 737
 738 class ContentTooShortError(Exception):
 739     """Content Too Short exception.
 740
 741     This exception may be raised by FileDownloader objects when a file they
 742     download is too small for what the server announced first, indicating
 743     the connection was probably interrupted.
 744     """
 745     # Both in bytes
 746     downloaded = None
 747     expected = None
 748
 749     def __init__(self, downloaded, expected):
 750         self.downloaded = downloaded
 751         self.expected = expected
 752
 753 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 754     """Handler for HTTP requests and responses.
 755
 756     This class, when installed with an OpenerDirector, automatically adds
 757     the standard headers to every HTTP request and handles gzipped and
 758     deflated responses from web servers. If compression is to be avoided in
 759     a particular request, the original request in the program code only has
 760     to include the HTTP header "Youtubedl-No-Compression", which will be
 761     removed before making the real request.
 762
 763     Part of this code was copied from:
 764
 765     http://techknack.net/python-urllib2-handlers/
 766
 767     Andrew Rowls, the author of that code, agreed to release it to the
 768     public domain.
 769     """
 770
 771     @staticmethod
 772     def deflate(data):
 773         try:
 774             return zlib.decompress(data, -zlib.MAX_WBITS)
 775         except zlib.error:
 776             return zlib.decompress(data)
 777
 778     @staticmethod
 779     def addinfourl_wrapper(stream, headers, url, code):
 780         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 781             return compat_urllib_request.addinfourl(stream, headers, url, code)
 782         ret = compat_urllib_request.addinfourl(stream, headers, url)
 783         ret.code = code
 784         return ret
 785
 786     def http_request(self, req):
 787         for h, v in std_headers.items():
 788             if h not in req.headers:
 789                 req.add_header(h, v)
 790         if 'Youtubedl-no-compression' in req.headers:
 791             if 'Accept-encoding' in req.headers:
 792                 del req.headers['Accept-encoding']
 793             del req.headers['Youtubedl-no-compression']
 794         if 'Youtubedl-user-agent' in req.headers:
 795             if 'User-agent' in req.headers:
 796                 del req.headers['User-agent']
 797             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 798             del req.headers['Youtubedl-user-agent']
 799         return req
 800
 801     def http_response(self, req, resp):
 802         old_resp = resp
 803         # gzip
 804         if resp.headers.get('Content-encoding', '') == 'gzip':
 805             content = resp.read()
 806             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 807             try:
 808                 uncompressed = io.BytesIO(gz.read())
 809             except IOError as original_ioerror:
 810                 # There may be junk add the end of the file
 811                 # See http://stackoverflow.com/q/4928560/35070 for details
 812                 for i in range(1, 1024):
 813                     try:
 814                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 815                         uncompressed = io.BytesIO(gz.read())
 816                     except IOError:
 817                         continue
 818                     break
 819                 else:
 820                     raise original_ioerror
 821             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 822             resp.msg = old_resp.msg
 823         # deflate
 824         if resp.headers.get('Content-encoding', '') == 'deflate':
 825             gz = io.BytesIO(self.deflate(resp.read()))
 826             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 827             resp.msg = old_resp.msg
 828         return resp
 829
 830     https_request = http_request
 831     https_response = http_response
 832
 833
 834 def parse_iso8601(date_str, delimiter='T'):
 835     """ Return a UNIX timestamp from the given date """
 836
 837     if date_str is None:
 838         return None
 839
 840     m = re.search(
 841         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
 842         date_str)
 843     if not m:
 844         timezone = datetime.timedelta()
 845     else:
 846         date_str = date_str[:-len(m.group(0))]
 847         if not m.group('sign'):
 848             timezone = datetime.timedelta()
 849         else:
 850             sign = 1 if m.group('sign') == '+' else -1
 851             timezone = datetime.timedelta(
 852                 hours=sign * int(m.group('hours')),
 853                 minutes=sign * int(m.group('minutes')))
 854     date_format =  '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 855     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 856     return calendar.timegm(dt.timetuple())
 857
 858
 859 def unified_strdate(date_str):
 860     """Return a string with the date in the format YYYYMMDD"""
 861
 862     if date_str is None:
 863         return None
 864
 865     upload_date = None
 866     #Replace commas
 867     date_str = date_str.replace(',', ' ')
 868     # %z (UTC offset) is only supported in python>=3.2
 869     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 870     format_expressions = [
 871         '%d %B %Y',
 872         '%d %b %Y',
 873         '%B %d %Y',
 874         '%b %d %Y',
 875         '%b %dst %Y %I:%M%p',
 876         '%b %dnd %Y %I:%M%p',
 877         '%b %dth %Y %I:%M%p',
 878         '%Y-%m-%d',
 879         '%Y/%m/%d',
 880         '%d.%m.%Y',
 881         '%d/%m/%Y',
 882         '%d/%m/%y',
 883         '%Y/%m/%d %H:%M:%S',
 884         '%Y-%m-%d %H:%M:%S',
 885         '%d.%m.%Y %H:%M',
 886         '%d.%m.%Y %H.%M',
 887         '%Y-%m-%dT%H:%M:%SZ',
 888         '%Y-%m-%dT%H:%M:%S.%fZ',
 889         '%Y-%m-%dT%H:%M:%S.%f0Z',
 890         '%Y-%m-%dT%H:%M:%S',
 891         '%Y-%m-%dT%H:%M:%S.%f',
 892         '%Y-%m-%dT%H:%M',
 893     ]
 894     for expression in format_expressions:
 895         try:
 896             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 897         except ValueError:
 898             pass
 899     if upload_date is None:
 900         timetuple = email.utils.parsedate_tz(date_str)
 901         if timetuple:
 902             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 903     return upload_date
 904
 905 def determine_ext(url, default_ext=u'unknown_video'):
 906     if url is None:
 907         return default_ext
 908     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 909     if re.match(r'^[A-Za-z0-9]+$', guess):
 910         return guess
 911     else:
 912         return default_ext
 913
 914 def subtitles_filename(filename, sub_lang, sub_format):
 915     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 916
 917 def date_from_str(date_str):
 918     """
 919     Return a datetime object from a string in the format YYYYMMDD or
 920     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 921     today = datetime.date.today()
 922     if date_str == 'now'or date_str == 'today':
 923         return today
 924     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 925     if match is not None:
 926         sign = match.group('sign')
 927         time = int(match.group('time'))
 928         if sign == '-':
 929             time = -time
 930         unit = match.group('unit')
 931         #A bad aproximation?
 932         if unit == 'month':
 933             unit = 'day'
 934             time *= 30
 935         elif unit == 'year':
 936             unit = 'day'
 937             time *= 365
 938         unit += 's'
 939         delta = datetime.timedelta(**{unit: time})
 940         return today + delta
 941     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 942
 943 def hyphenate_date(date_str):
 944     """
 945     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 946     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 947     if match is not None:
 948         return '-'.join(match.groups())
 949     else:
 950         return date_str
 951
 952 class DateRange(object):
 953     """Represents a time interval between two dates"""
 954     def __init__(self, start=None, end=None):
 955         """start and end must be strings in the format accepted by date"""
 956         if start is not None:
 957             self.start = date_from_str(start)
 958         else:
 959             self.start = datetime.datetime.min.date()
 960         if end is not None:
 961             self.end = date_from_str(end)
 962         else:
 963             self.end = datetime.datetime.max.date()
 964         if self.start > self.end:
 965             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 966     @classmethod
 967     def day(cls, day):
 968         """Returns a range that only contains the given day"""
 969         return cls(day,day)
 970     def __contains__(self, date):
 971         """Check if the date is in the range"""
 972         if not isinstance(date, datetime.date):
 973             date = date_from_str(date)
 974         return self.start <= date <= self.end
 975     def __str__(self):
 976         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 977
 978
 979 def platform_name():
 980     """ Returns the platform name as a compat_str """
 981     res = platform.platform()
 982     if isinstance(res, bytes):
 983         res = res.decode(preferredencoding())
 984
 985     assert isinstance(res, compat_str)
 986     return res
 987
 988
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the WriteConsoleW function of kernel32; it is only
    used when out is file descriptor 1 or 2 and that handle is a console.
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> id passed to GetStdHandle (-11 and -12 are the
    # Windows API's STD_OUTPUT_HANDLE and STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, non-character devices and handles for
        # which GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stop before the next
    # non-BMP character; such a character is written on its own with a
    # length of 2.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after what the console actually accepted
            s = s[written.value:]
    return True
1059
1060
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no encoding is requested, the console API is tried
    first.  Otherwise s is encoded (ignoring unencodable characters) for
    binary streams and streams with a .buffer, or written as text.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    may_use_console = (
        sys.platform == 'win32' and encoding is None
        and hasattr(stream, 'fileno'))
    if may_use_console and _windows_write_string(s, stream):
        return

    wants_bytes = (
        'b' in getattr(stream, 'mode', '') or
        sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if wants_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        # Text stream on top of a binary buffer: encode ourselves
        enc = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(enc, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1081
1082
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Indexing yields str items on Python 2 and ints on Python 3
    if not isinstance(bs[0], int):
        return [ord(ch) for ch in bs]
    return list(bs)
1090
1091
def intlist_to_bytes(xs):
    """Convert a list of integer byte values into a byte string."""
    if not xs:
        return b''
    # chr() only returns a byte string on Python 2
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    return ''.join([chr(x) for x in xs])
1099
1100
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f): via LockFileEx /
# UnlockFileEx of kernel32 on Windows, via fcntl.flock everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout of the OVERLAPPED argument of LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORD of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock on the open file f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1164
1165
class locked_file(object):
    """File opened with io.open that is locked while used as a context manager.

    Mode 'r' takes a shared lock, 'a' and 'w' an exclusive one.  The file is
    opened in the constructor, locked on __enter__, and unlocked and closed
    on __exit__.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file raises OSError, which is not an IOError
            # on Python 2; close the file in both cases so it does not leak
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1195
1196
def shell_quote(args):
    """Return args joined into one shell-escaped command line string.

    Byte string arguments are decoded with the filesystem encoding
    (UTF-8 if that is unknown) before quoting.
    """
    # pipes is deprecated and removed in Python 3.13; shlex.quote is the
    # same function on Python 3, pipes.quote remains the Python 2 fallback
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1208
1209
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
        (the first element so that Not pred(e)) """
    for item in seq:
        yield item
        if pred(item):
            continue
        # item was the first one failing pred: it has been yielded, stop here
        break
1217
1218
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # data travels JSON-encoded in the URL fragment
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return u'#'.join((url, compat_urllib_parse.urlencode(payload)))
1225
1226
def unsmuggle_url(smug_url, default=None):
    """Split a URL made by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The payload is everything after the last '#'
    parts = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(parts[2])[u'__youtubedl_smuggle'][0]
    return parts[0], json.loads(encoded)
1234
1235
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    bytes may be a number, a numeric string or None (giving u'N/A').  The
    value is scaled to the largest unit from B to YiB that fits and shown
    with two decimals.  Negative values raise ValueError (from math.log).
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: values below 1 would
        # otherwise index from the end, huge values past it
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1248
1249
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is used if it holds an integer;
    otherwise the output of "stty size" is consulted.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Unusable COLUMNS value: fall back to stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (no stty, no tty, unexpected output), but do not
        # swallow KeyboardInterrupt/SystemExit like a bare except would
        pass
    return None
1264
1265
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    # Unknown names (including abbreviations) give None
    return None
1276
1277
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Ampersands that already start a predefined entity or a numeric
    # character reference are left alone
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1284
1285
def setproctitle(title):
    """Set the process name to title through prctl of libc.so.6.

    Does nothing if libc.so.6 cannot be loaded or has no prctl symbol.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes makes it one item longer and
    # NUL-terminated; a buffer of exactly len(title) bytes has no terminator
    # and prctl would read past its end
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME in linux/prctl.h
    except AttributeError:
        return  # Strange libc, just skip this
1299
1300
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it has no such prefix."""
    return s[len(start):] if s.startswith(start) else s
1305
1306
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if it has no such suffix."""
    # An empty suffix must be skipped: s[:-0] is the empty string, not s
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1311
1312
def url_basename(url):
    """Return the last segment of the path of url ('' for an empty path)."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed_path.rpartition(u'/')[2]
1316
1317
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method always reports HEAD."""

    def get_method(self):
        return 'HEAD'
1321
1322
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default if v is None or ''.

    With get_attr, the attribute of that name is read from v first (a
    missing attribute counts as None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1330
1331
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1334
1335
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and plus signs before converting
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
1342
1343
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1346
1347
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3h11m53s' or '49s' into seconds.

    Returns an int, a float when a fractional part is present, or None if s
    is None or not a recognised duration.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    hours, mins, secs, fraction = m.group('hours', 'mins', 'secs', 'ms')
    total = int(secs)
    if mins:
        total += int(mins) * 60
        # Hours are only counted together with minutes
        if hours:
            total += int(hours) * 60 * 60
    if fraction:
        total += float(fraction)
    return total
1366
1367
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename."""
    pieces = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(pieces[0], ext, pieces[1])
1371
1372
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False if the binary cannot be started."""
    # None instead of a shared mutable default; list() also accepts tuples
    cmd = [exe] + list(args or [])
    try:
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1381
1382
class PagedList(object):
    """Sequence whose entries are fetched page by page on demand.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum; pagesize is the number of entries on a full
    page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries with index start <= i < end as a list.

        end=None reads up to the last page.  Only pages from the one
        containing start onwards are requested from pagefunc.
        """
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Indices of the first entry of this page and of the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start inside this page (0 on all later pages)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry if end falls on this
            # page, None to take the rest of the page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1428
1429
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape in s by the character it names."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, consumed length)
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1436
# Probe whether struct accepts a unicode format string on this interpreter
# and define struct_pack/struct_unpack accordingly
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that ASCII-encodes a compat_str format spec first."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that ASCII-encodes a compat_str format spec first."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Unicode specs work: use the struct functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1453
1454
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file, one per line, and close batch_fd.

    Lines may be text or UTF-8 encoded bytes.  A leading byte order mark and
    surrounding whitespace are removed; empty lines and lines starting with
    '#', ';' or ']' are skipped.
    """
    # A BOM shows up as U+FEFF when the line was decoded as UTF-8, and as
    # the three characters below when its bytes were decoded one by one
    BOMS = (u'\xef\xbb\xbf', u'\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1469
1470
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1473
1474
# etree_iter(node) iterates over the elements of a tree; where Element has
# no iter method, findall('.//*') is used instead
# NOTE(review): the fallback presumably yields descendants only, while
# Element.iter also yields the node itself - confirm if that matters
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
1479
1480
def parse_xml(s):
    """Parse the XML text s and return its root element.

    Doctype declarations are ignored.  On Python 2, byte string text nodes
    are decoded from UTF-8.
    """
    etree = xml.etree.ElementTree

    class DoctypeIgnoringBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=DoctypeIgnoringBuilder())
    # The custom parser is only passed on Python >= 2.7
    if sys.version_info >= (2, 7):
        tree = etree.XML(s.encode('utf-8'), parser=parser)
    else:
        tree = etree.XML(s.encode('utf-8'))
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
1496
1497
# compat_getpass: getpass.getpass, except on Python 2 under Windows, where a
# compat_str prompt is first encoded with the preferred encoding
if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        """Call getpass.getpass with a compat_str prompt encoded to bytes."""
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
1505
1506
# Numeric value for each US rating label (presumably the minimum viewer age;
# confirm against the callers)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1514
1515
def strip_jsonp(code):
    """Return the argument of a JSONP call "callback(...);", or code unchanged
    if it does not look like one."""
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_call, r'\1', code)
1518
1519
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Keys following '{' or ',' get double quotes, single-quoted keys and
    values are re-quoted, and a comma directly before ']' is dropped.
    """
    def requote(token):
        # Turn a 'single-quoted' token into a "double-quoted" one
        assert token.endswith("'")
        assert '"' not in token
        return '"%s"' % token[1:-1]

    def fix_kv(m):
        prefix, key, separator, value = m.groups()
        if key.startswith("'"):
            key = requote(key)
        elif not key.startswith('"'):
            # Bare identifier key
            key = '"%s"' % key

        if value.startswith("'"):
            value = requote(value)

        return prefix + key + separator + value

    with_fixed_pairs = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    # JSON does not allow a trailing comma in arrays
    return re.sub(r',(\s*\])', lambda m: m.group(1), with_fixed_pairs)
1546
1547
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def rank(qid):
        # Position in quality_ids, -1 for unknown ids
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return rank
1556
1557
# Default output template; the %(title)s, %(id)s and %(ext)s fields are
# presumably filled in by the code that consumes it (not visible here)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1559
# subprocess.check_output where available, otherwise a minimal substitute
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return its stdout.

        Takes the arguments of subprocess.Popen (except stdout and input).
        Raises subprocess.CalledProcessError, with the captured stdout in
        its output attribute, if the command exits with a non-zero status.
        """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Interpreters without check_output have neither Popen.args nor
            # the output parameter of CalledProcessError, so take the
            # command from the call arguments and attach output afterwards
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output