_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import gzip
  14 import itertools
  15 import io
  16 import json
  17 import locale
  18 import math
  19 import os
  20 import pipes
  21 import platform
  22 import re
  23 import ssl
  24 import socket
  25 import struct
  26 import subprocess
  27 import sys
  28 import tempfile
  29 import traceback
  30 import xml.etree.ElementTree
  31 import zlib
  32
  33 from .compat import (
  34     compat_chr,
  35     compat_getenv,
  36     compat_html_entities,
  37     compat_html_parser,
  38     compat_parse_qs,
  39     compat_str,
  40     compat_urllib_error,
  41     compat_urllib_parse,
  42     compat_urllib_parse_urlparse,
  43     compat_urllib_request,
  44     compat_urlparse,
  45 )
  46
  47
# This is not clearly defined otherwise
# (type of the objects returned by re.compile(), taken from a real pattern)
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds them to requests
# that do not already carry a header of the same name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  58
  59 def preferredencoding():
  60     """Get preferred encoding.
  61
  62     Returns the best encoding scheme for the system, based on
  63     locale.getpreferredencoding() and some further tweaks.
  64     """
  65     try:
  66         pref = locale.getpreferredencoding()
  67         u'TEST'.encode(pref)
  68     except:
  69         pref = 'UTF-8'
  70
  71     return pref
  72
  73
  74 def write_json_file(obj, fn):
  75     """ Encode obj as JSON and write it to fn, atomically """
  76
  77     args = {
  78         'suffix': '.tmp',
  79         'prefix': os.path.basename(fn) + '.',
  80         'dir': os.path.dirname(fn),
  81         'delete': False,
  82     }
  83
  84     # In Python 2.x, json.dump expects a bytestream.
  85     # In Python 3.x, it writes to a character stream
  86     if sys.version_info < (3, 0):
  87         args['mode'] = 'wb'
  88     else:
  89         args.update({
  90             'mode': 'w',
  91             'encoding': 'utf-8',
  92         })
  93
  94     tf = tempfile.NamedTemporaryFile(**args)
  95
  96     try:
  97         with tf:
  98             json.dump(obj, tf)
  99         os.rename(tf.name, fn)
 100     except:
 101         try:
 102             os.remove(tf.name)
 103         except OSError:
 104             pass
 105         raise
 106
 107
 108 if sys.version_info >= (2, 7):
 109     def find_xpath_attr(node, xpath, key, val):
 110         """ Find the xpath xpath[@key=val] """
 111         assert re.match(r'^[a-zA-Z-]+$', key)
 112         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 113         expr = xpath + u"[@%s='%s']" % (key, val)
 114         return node.find(expr)
 115 else:
 116     def find_xpath_attr(node, xpath, key, val):
 117         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 118         # .//node does not match if a node is a direct child of . !
 119         if isinstance(xpath, unicode):
 120             xpath = xpath.encode('ascii')
 121
 122         for f in node.findall(xpath):
 123             if f.attrib.get(key) == val:
 124                 return f
 125         return None
 126
 127 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 128 # the namespace parameter
 129 def xpath_with_ns(path, ns_map):
 130     components = [c.split(':') for c in path.split('/')]
 131     replaced = []
 132     for c in components:
 133         if len(c) == 1:
 134             replaced.append(c[0])
 135         else:
 136             ns, tag = c
 137             replaced.append('{%s}%s' % (ns_map[ns], tag))
 138     return '/'.join(replaced)
 139
 140
 141 def xpath_text(node, xpath, name=None, fatal=False):
 142     if sys.version_info < (2, 7):  # Crazy 2.6
 143         xpath = xpath.encode('ascii')
 144
 145     n = node.find(xpath)
 146     if n is None:
 147         if fatal:
 148             name = xpath if name is None else name
 149             raise ExtractorError('Could not find XML element %s' % name)
 150         else:
 151             return None
 152     return n.text
 153
 154
# Replace the locatestarttagend regex of the HTML parser module with the
# pattern below, so that the parser classes defined in this file use it.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 156 class BaseHTMLParser(compat_html_parser.HTMLParser):
 157     def __init(self):
 158         compat_html_parser.HTMLParser.__init__(self)
 159         self.html = None
 160
 161     def loads(self, html):
 162         self.html = html
 163         self.feed(html)
 164         self.close()
 165
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start position, end position] once a match is complete
        self.result = None
        # True while the parser is inside the matched element
        self.started = False
        # Open-tag counter per tag name, maintained while inside the match
        self.depth = {}
        # True between the matching start tag and the next parser event
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by resuming at a later line.

        Gives up (raises HTMLParseError) after more than 10 errors, or when
        the error occurs inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        """Start tracking on the matching tag and count nested open tags"""
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Record the end position once the matched tag's counter is back to 0"""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event also just records the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None if no complete match was recorded"""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        # Positions are (line, column) pairs as returned by getpos()
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line: the end column has to be
            # shifted by what was just cut off the front
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On Python older than 2.7.3, the literal text "</scr'+'ipt>" is skipped over
# instead of being handed to the stock end-tag parser; everything else is
# delegated to HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 231
 232 def get_element_by_id(id, html):
 233     """Return the content of the tag with the specified ID in the passed HTML document"""
 234     return get_element_by_attribute("id", id, html)
 235
 236 def get_element_by_attribute(attribute, value, html):
 237     """Return the content of the tag with the specified attribute in the passed HTML document"""
 238     parser = AttrParser(attribute, value)
 239     try:
 240         parser.loads(html)
 241     except compat_html_parser.HTMLParseError:
 242         pass
 243     return parser.get_result()
 244
 245 class MetaParser(BaseHTMLParser):
 246     """
 247     Modified HTMLParser that isolates a meta tag with the specified name
 248     attribute.
 249     """
 250     def __init__(self, name):
 251         BaseHTMLParser.__init__(self)
 252         self.name = name
 253         self.content = None
 254         self.result = None
 255
 256     def handle_starttag(self, tag, attrs):
 257         if tag != 'meta':
 258             return
 259         attrs = dict(attrs)
 260         if attrs.get('name') == self.name:
 261             self.result = attrs.get('content')
 262
 263     def get_result(self):
 264         return self.result
 265
 266 def get_meta_content(name, html):
 267     """
 268     Return the content attribute from the meta tag with the given name attribute.
 269     """
 270     parser = MetaParser(name)
 271     try:
 272         parser.loads(html)
 273     except compat_html_parser.HTMLParseError:
 274         pass
 275     return parser.get_result()
 276
 277
 278 def clean_html(html):
 279     """Clean an HTML snippet into a readable string"""
 280     # Newline vs <br />
 281     html = html.replace('\n', ' ')
 282     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 283     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 284     # Strip html tags
 285     html = re.sub('<.*?>', '', html)
 286     # Replace html entities
 287     html = unescapeHTML(html)
 288     return html.strip()
 289
 290
 291 def sanitize_open(filename, open_mode):
 292     """Try to open the given filename, and slightly tweak it if this fails.
 293
 294     Attempts to open the given filename. If this fails, it tries to change
 295     the filename slightly, step by step, until it's either able to open it
 296     or it fails and raises a final exception, like the standard open()
 297     function.
 298
 299     It returns the tuple (stream, definitive_file_name).
 300     """
 301     try:
 302         if filename == u'-':
 303             if sys.platform == 'win32':
 304                 import msvcrt
 305                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 306             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 307         stream = open(encodeFilename(filename), open_mode)
 308         return (stream, filename)
 309     except (IOError, OSError) as err:
 310         if err.errno in (errno.EACCES,):
 311             raise
 312
 313         # In case of error, try to remove win32 forbidden chars
 314         alt_filename = os.path.join(
 315                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 316                         for path_part in os.path.split(filename)
 317                        )
 318         if alt_filename == filename:
 319             raise
 320         else:
 321             # An exception here should be caught in the caller
 322             stream = open(encodeFilename(filename), open_mode)
 323             return (stream, alt_filename)
 324
 325
 326 def timeconvert(timestr):
 327     """Convert RFC 2822 defined time string into system timestamp"""
 328     timestamp = None
 329     timetuple = email.utils.parsedate_tz(timestr)
 330     if timetuple is not None:
 331         timestamp = email.utils.mktime_tz(timetuple)
 332     return timestamp
 333
 334 def sanitize_filename(s, restricted=False, is_id=False):
 335     """Sanitizes a string so it could be used as part of a filename.
 336     If restricted is set, use a stricter subset of allowed characters.
 337     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 338     """
 339     def replace_insane(char):
 340         if char == '?' or ord(char) < 32 or ord(char) == 127:
 341             return ''
 342         elif char == '"':
 343             return '' if restricted else '\''
 344         elif char == ':':
 345             return '_-' if restricted else ' -'
 346         elif char in '\\/|*<>':
 347             return '_'
 348         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 349             return '_'
 350         if restricted and ord(char) > 127:
 351             return '_'
 352         return char
 353
 354     result = u''.join(map(replace_insane, s))
 355     if not is_id:
 356         while '__' in result:
 357             result = result.replace('__', '_')
 358         result = result.strip('_')
 359         # Common case of "Foreign band name - English song title"
 360         if restricted and result.startswith('-_'):
 361             result = result[2:]
 362         if not result:
 363             result = '_'
 364     return result
 365
 366 def orderedSet(iterable):
 367     """ Remove all duplicates from the input iterable """
 368     res = []
 369     for el in iterable:
 370         if el not in res:
 371             res.append(el)
 372     return res
 373
 374
 375 def _htmlentity_transform(entity):
 376     """Transforms an HTML entity to a character."""
 377     # Known non-numeric HTML entity
 378     if entity in compat_html_entities.name2codepoint:
 379         return compat_chr(compat_html_entities.name2codepoint[entity])
 380
 381     mobj = re.match(r'#(x?[0-9]+)', entity)
 382     if mobj is not None:
 383         numstr = mobj.group(1)
 384         if numstr.startswith(u'x'):
 385             base = 16
 386             numstr = u'0%s' % numstr
 387         else:
 388             base = 10
 389         return compat_chr(int(numstr, base))
 390
 391     # Unknown entity in name, return its literal representation
 392     return (u'&%s;' % entity)
 393
 394
 395 def unescapeHTML(s):
 396     if s is None:
 397         return None
 398     assert type(s) == compat_str
 399
 400     return re.sub(
 401         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 402
 403
 404 def encodeFilename(s, for_subprocess=False):
 405     """
 406     @param s The name of the file
 407     """
 408
 409     assert type(s) == compat_str
 410
 411     # Python 3 has a Unicode API
 412     if sys.version_info >= (3, 0):
 413         return s
 414
 415     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 416         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 417         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 418         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 419         if not for_subprocess:
 420             return s
 421         else:
 422             # For subprocess calls, encode with locale encoding
 423             # Refer to http://stackoverflow.com/a/9951851/35070
 424             encoding = preferredencoding()
 425     else:
 426         encoding = sys.getfilesystemencoding()
 427     if encoding is None:
 428         encoding = 'utf-8'
 429     return s.encode(encoding, 'ignore')
 430
 431
 432 def encodeArgument(s):
 433     if not isinstance(s, compat_str):
 434         # Legacy code that uses byte strings
 435         # Uncomment the following line after fixing all post processors
 436         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 437         s = s.decode('ascii')
 438     return encodeFilename(s, True)
 439
 440
 441 def decodeOption(optval):
 442     if optval is None:
 443         return optval
 444     if isinstance(optval, bytes):
 445         optval = optval.decode(preferredencoding())
 446
 447     assert isinstance(optval, compat_str)
 448     return optval
 449
 450 def formatSeconds(secs):
 451     if secs > 3600:
 452         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 453     elif secs > 60:
 454         return '%d:%02d' % (secs // 60, secs % 60)
 455     else:
 456         return '%d' % secs
 457
 458
 459 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 460     if sys.version_info < (3, 2):
 461         import httplib
 462
 463         class HTTPSConnectionV3(httplib.HTTPSConnection):
 464             def __init__(self, *args, **kwargs):
 465                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 466
 467             def connect(self):
 468                 sock = socket.create_connection((self.host, self.port), self.timeout)
 469                 if getattr(self, '_tunnel_host', False):
 470                     self.sock = sock
 471                     self._tunnel()
 472                 try:
 473                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 474                 except ssl.SSLError:
 475                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 476
 477         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 478             def https_open(self, req):
 479                 return self.do_open(HTTPSConnectionV3, req)
 480         return HTTPSHandlerV3(**kwargs)
 481     elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
 482         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 483         context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
 484         if opts_no_check_certificate:
 485             context.verify_mode = ssl.CERT_NONE
 486         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 487     else:  # Python < 3.4
 488         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 489         context.verify_mode = (ssl.CERT_NONE
 490                                if opts_no_check_certificate
 491                                else ssl.CERT_REQUIRED)
 492         context.set_default_verify_paths()
 493         try:
 494             context.load_default_certs()
 495         except AttributeError:
 496             pass  # Python < 3.4
 497         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 498
 499 class ExtractorError(Exception):
 500     """Error during info extraction."""
 501     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 502         """ tb, if given, is the original traceback (so that it can be printed out).
 503         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 504         """
 505
 506         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 507             expected = True
 508         if video_id is not None:
 509             msg = video_id + ': ' + msg
 510         if cause:
 511             msg += u' (caused by %r)' % cause
 512         if not expected:
 513             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 514         super(ExtractorError, self).__init__(msg)
 515
 516         self.traceback = tb
 517         self.exc_info = sys.exc_info()  # preserve original exception
 518         self.cause = cause
 519         self.video_id = video_id
 520
 521     def format_traceback(self):
 522         if self.traceback is None:
 523             return None
 524         return u''.join(traceback.format_tb(self.traceback))
 525
 526
 527 class RegexNotFoundError(ExtractorError):
 528     """Error when a regex didn't match"""
 529     pass
 530
 531
 532 class DownloadError(Exception):
 533     """Download Error exception.
 534
 535     This exception may be thrown by FileDownloader objects if they are not
 536     configured to continue on errors. They will contain the appropriate
 537     error message.
 538     """
 539     def __init__(self, msg, exc_info=None):
 540         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 541         super(DownloadError, self).__init__(msg)
 542         self.exc_info = exc_info
 543
 544
 545 class SameFileError(Exception):
 546     """Same File exception.
 547
 548     This exception will be thrown by FileDownloader objects if they detect
 549     multiple files would have to be downloaded to the same file on disk.
 550     """
 551     pass
 552
 553
 554 class PostProcessingError(Exception):
 555     """Post Processing exception.
 556
 557     This exception may be raised by PostProcessor's .run() method to
 558     indicate an error in the postprocessing task.
 559     """
 560     def __init__(self, msg):
 561         self.msg = msg
 562
 563 class MaxDownloadsReached(Exception):
 564     """ --max-downloads limit has been reached. """
 565     pass
 566
 567
 568 class UnavailableVideoError(Exception):
 569     """Unavailable Format exception.
 570
 571     This exception will be thrown when a video is requested
 572     in a format that is not available for that video.
 573     """
 574     pass
 575
 576
 577 class ContentTooShortError(Exception):
 578     """Content Too Short exception.
 579
 580     This exception may be raised by FileDownloader objects when a file they
 581     download is too small for what the server announced first, indicating
 582     the connection was probably interrupted.
 583     """
 584     # Both in bytes
 585     downloaded = None
 586     expected = None
 587
 588     def __init__(self, downloaded, expected):
 589         self.downloaded = downloaded
 590         self.expected = expected
 591
 592 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 593     """Handler for HTTP requests and responses.
 594
 595     This class, when installed with an OpenerDirector, automatically adds
 596     the standard headers to every HTTP request and handles gzipped and
 597     deflated responses from web servers. If compression is to be avoided in
 598     a particular request, the original request in the program code only has
 599     to include the HTTP header "Youtubedl-No-Compression", which will be
 600     removed before making the real request.
 601
 602     Part of this code was copied from:
 603
 604     http://techknack.net/python-urllib2-handlers/
 605
 606     Andrew Rowls, the author of that code, agreed to release it to the
 607     public domain.
 608     """
 609
 610     @staticmethod
 611     def deflate(data):
 612         try:
 613             return zlib.decompress(data, -zlib.MAX_WBITS)
 614         except zlib.error:
 615             return zlib.decompress(data)
 616
 617     @staticmethod
 618     def addinfourl_wrapper(stream, headers, url, code):
 619         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 620             return compat_urllib_request.addinfourl(stream, headers, url, code)
 621         ret = compat_urllib_request.addinfourl(stream, headers, url)
 622         ret.code = code
 623         return ret
 624
 625     def http_request(self, req):
 626         for h, v in std_headers.items():
 627             if h not in req.headers:
 628                 req.add_header(h, v)
 629         if 'Youtubedl-no-compression' in req.headers:
 630             if 'Accept-encoding' in req.headers:
 631                 del req.headers['Accept-encoding']
 632             del req.headers['Youtubedl-no-compression']
 633         if 'Youtubedl-user-agent' in req.headers:
 634             if 'User-agent' in req.headers:
 635                 del req.headers['User-agent']
 636             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 637             del req.headers['Youtubedl-user-agent']
 638
 639         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 640             # Python 2.6 is brain-dead when it comes to fragments
 641             req._Request__original = req._Request__original.partition('#')[0]
 642             req._Request__r_type = req._Request__r_type.partition('#')[0]
 643
 644         return req
 645
 646     def http_response(self, req, resp):
 647         old_resp = resp
 648         # gzip
 649         if resp.headers.get('Content-encoding', '') == 'gzip':
 650             content = resp.read()
 651             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 652             try:
 653                 uncompressed = io.BytesIO(gz.read())
 654             except IOError as original_ioerror:
 655                 # There may be junk add the end of the file
 656                 # See http://stackoverflow.com/q/4928560/35070 for details
 657                 for i in range(1, 1024):
 658                     try:
 659                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 660                         uncompressed = io.BytesIO(gz.read())
 661                     except IOError:
 662                         continue
 663                     break
 664                 else:
 665                     raise original_ioerror
 666             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 667             resp.msg = old_resp.msg
 668         # deflate
 669         if resp.headers.get('Content-encoding', '') == 'deflate':
 670             gz = io.BytesIO(self.deflate(resp.read()))
 671             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 672             resp.msg = old_resp.msg
 673         return resp
 674
 675     https_request = http_request
 676     https_response = http_response
 677
 678
 679 def parse_iso8601(date_str, delimiter='T'):
 680     """ Return a UNIX timestamp from the given date """
 681
 682     if date_str is None:
 683         return None
 684
 685     m = re.search(
 686         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 687         date_str)
 688     if not m:
 689         timezone = datetime.timedelta()
 690     else:
 691         date_str = date_str[:-len(m.group(0))]
 692         if not m.group('sign'):
 693             timezone = datetime.timedelta()
 694         else:
 695             sign = 1 if m.group('sign') == '+' else -1
 696             timezone = datetime.timedelta(
 697                 hours=sign * int(m.group('hours')),
 698                 minutes=sign * int(m.group('minutes')))
 699     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 700     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 701     return calendar.timegm(dt.timetuple())
 702
 703
 704 def unified_strdate(date_str):
 705     """Return a string with the date in the format YYYYMMDD"""
 706
 707     if date_str is None:
 708         return None
 709
 710     upload_date = None
 711     #Replace commas
 712     date_str = date_str.replace(',', ' ')
 713     # %z (UTC offset) is only supported in python>=3.2
 714     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 715     format_expressions = [
 716         '%d %B %Y',
 717         '%d %b %Y',
 718         '%B %d %Y',
 719         '%b %d %Y',
 720         '%b %dst %Y %I:%M%p',
 721         '%b %dnd %Y %I:%M%p',
 722         '%b %dth %Y %I:%M%p',
 723         '%Y-%m-%d',
 724         '%Y/%m/%d',
 725         '%d.%m.%Y',
 726         '%d/%m/%Y',
 727         '%d/%m/%y',
 728         '%Y/%m/%d %H:%M:%S',
 729         '%d/%m/%Y %H:%M:%S',
 730         '%Y-%m-%d %H:%M:%S',
 731         '%Y-%m-%d %H:%M:%S.%f',
 732         '%d.%m.%Y %H:%M',
 733         '%d.%m.%Y %H.%M',
 734         '%Y-%m-%dT%H:%M:%SZ',
 735         '%Y-%m-%dT%H:%M:%S.%fZ',
 736         '%Y-%m-%dT%H:%M:%S.%f0Z',
 737         '%Y-%m-%dT%H:%M:%S',
 738         '%Y-%m-%dT%H:%M:%S.%f',
 739         '%Y-%m-%dT%H:%M',
 740     ]
 741     for expression in format_expressions:
 742         try:
 743             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 744         except ValueError:
 745             pass
 746     if upload_date is None:
 747         timetuple = email.utils.parsedate_tz(date_str)
 748         if timetuple:
 749             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 750     return upload_date
 751
 752 def determine_ext(url, default_ext=u'unknown_video'):
 753     if url is None:
 754         return default_ext
 755     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 756     if re.match(r'^[A-Za-z0-9]+$', guess):
 757         return guess
 758     else:
 759         return default_ext
 760
 761 def subtitles_filename(filename, sub_lang, sub_format):
 762     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 763
 764 def date_from_str(date_str):
 765     """
 766     Return a datetime object from a string in the format YYYYMMDD or
 767     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 768     today = datetime.date.today()
 769     if date_str == 'now'or date_str == 'today':
 770         return today
 771     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 772     if match is not None:
 773         sign = match.group('sign')
 774         time = int(match.group('time'))
 775         if sign == '-':
 776             time = -time
 777         unit = match.group('unit')
 778         #A bad aproximation?
 779         if unit == 'month':
 780             unit = 'day'
 781             time *= 30
 782         elif unit == 'year':
 783             unit = 'day'
 784             time *= 365
 785         unit += 's'
 786         delta = datetime.timedelta(**{unit: time})
 787         return today + delta
 788     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 789
 790 def hyphenate_date(date_str):
 791     """
 792     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 793     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 794     if match is not None:
 795         return '-'.join(match.groups())
 796     else:
 797         return date_str
 798
 799 class DateRange(object):
 800     """Represents a time interval between two dates"""
 801     def __init__(self, start=None, end=None):
 802         """start and end must be strings in the format accepted by date"""
 803         if start is not None:
 804             self.start = date_from_str(start)
 805         else:
 806             self.start = datetime.datetime.min.date()
 807         if end is not None:
 808             self.end = date_from_str(end)
 809         else:
 810             self.end = datetime.datetime.max.date()
 811         if self.start > self.end:
 812             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 813     @classmethod
 814     def day(cls, day):
 815         """Returns a range that only contains the given day"""
 816         return cls(day,day)
 817     def __contains__(self, date):
 818         """Check if the date is in the range"""
 819         if not isinstance(date, datetime.date):
 820             date = date_from_str(date)
 821         return self.start <= date <= self.end
 822     def __str__(self):
 823         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 824
 825
 826 def platform_name():
 827     """ Returns the platform name as a compat_str """
 828     res = platform.platform()
 829     if isinstance(res, bytes):
 830         res = res.decode(preferredencoding())
 831
 832     assert isinstance(res, compat_str)
 833     return res
 834
 835
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the Win32 WriteConsoleW call; it is only used
    when out has a file descriptor of 1 or 2 and the matching standard
    handle passes the console checks below.
    Raises OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 and -12 are the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE values)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Bind the kernel32 functions with explicit prototypes
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for a missing/invalid handle, for a handle whose file type
        # (ignoring the REMOTE flag) is not a character device, or when
        # GetConsoleMode fails on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, ending each chunk before
    # the first character above U+FFFF.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a character above U+FFFF; a length
        # of 2 is passed for it (presumably its two UTF-16 code units --
        # TODO confirm).
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Only drop what was actually written; the rest is retried
            s = s[written.value:]
    return True
 906
 907
 908 def write_string(s, out=None, encoding=None):
 909     if out is None:
 910         out = sys.stderr
 911     assert type(s) == compat_str
 912
 913     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 914         if _windows_write_string(s, out):
 915             return
 916
 917     if ('b' in getattr(out, 'mode', '') or
 918             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 919         byt = s.encode(encoding or preferredencoding(), 'ignore')
 920         out.write(byt)
 921     elif hasattr(out, 'buffer'):
 922         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 923         byt = s.encode(enc, 'ignore')
 924         out.buffer.write(byt)
 925     else:
 926         out.write(s)
 927     out.flush()
 928
 929
 930 def bytes_to_intlist(bs):
 931     if not bs:
 932         return []
 933     if isinstance(bs[0], int):  # Python 3
 934         return list(bs)
 935     else:
 936         return [ord(c) for c in bs]
 937
 938
 939 def intlist_to_bytes(xs):
 940     if not xs:
 941         return b''
 942     if isinstance(chr(0), bytes):  # Python 2
 943         return ''.join([chr(x) for x in xs])
 944     else:
 945         return bytes(xs)
 946
 947
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx via ctypes on win32, fcntl.flock elsewhere.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the structure passed as the last argument of
    # LockFileEx / UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORD of the byte count passed to both calls; together
    # with the zero offset below they describe the locked range.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f via LockFileEx.

        Raises OSError if the call fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same
        # OVERLAPPED pointer again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on f.

        Raises OSError if UnlockFileEx fails."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f with flock: LOCK_EX when exclusive, LOCK_SH otherwise."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1011
1012
class locked_file(object):
    """Text file wrapper that holds a file lock while used in a with block.

    The file is opened on construction. Entering the context takes a
    shared lock for mode 'r' and an exclusive lock for 'a' and 'w';
    leaving it releases the lock and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        """Open filename with io.open; mode must be 'r', 'a' or 'w'."""
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file raises OSError, which
            # is not an IOError on Python 2; catch both so the file is
            # always closed before the error propagates.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        """Iterate over the lines of the underlying file."""
        return iter(self.f)

    def write(self, *args):
        """Forward to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the underlying file's read()."""
        return self.f.read(*args)
1042
1043
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1047
1048
def shell_quote(args):
    """Return args as one shell-quoted, space-separated command string.

    Byte string arguments are decoded with the filesystem encoding first."""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return u' '.join([pipes.quote(as_text(a)) for a in args])
1058
1059
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the last evaluated element
        (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1067
1068
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded, URL-encoded under the '__youtubedl_smuggle' key
    and appended to url after a '#'. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1075
1076
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    URLs without the '#__youtubedl_smuggle' marker are returned unchanged
    together with default."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    payload = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1084
1085
def format_bytes(bytes):
    """Format a byte count with a binary suffix, e.g. 1536 -> '1.50KiB'.

    bytes may be a number or a numeric string; None yields 'N/A'.
    Values below 1 are shown in 'B' and values beyond the largest suffix
    in 'YiB'. Negative values still raise ValueError (from math.log).
    """
    if bytes is None:
        return u'N/A'
    # The parameter shadows the builtin, hence type(b''). Both text types
    # are accepted so that unicode strings work on Python 2 as well.
    if isinstance(bytes, (type(b''), type(u''))):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp: a negative exponent (0 < bytes < 1) would index the list
        # from the end, and a too large one would raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1098
1099
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    A non-empty COLUMNS environment variable takes precedence; otherwise
    the second field of the output of `stty size` is used.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out = sp.communicate()[0]
        return int(out.split()[1])
    except Exception:
        # Best effort: a missing stty or unparsable output means "unknown".
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        pass
    return None
1114
1115
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None if name is not one of the twelve full English names. """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1126
1127
def fix_xml_ampersands(xml_str):
    """Replace bare '&' by '&amp;' in XML

    Ampersands that already start one of the five predefined entities or
    a numeric character reference are left alone."""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub(u'&amp;', xml_str)
1134
1135
def setproctitle(title):
    """Set the process name via prctl, if libc.so.6 is available.

    title must be a compat_str. The call silently does nothing when libc
    cannot be loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title and NUL-terminated. A buffer sized to exactly len(title)
    # has no room for the terminator, which prctl relies on.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1149
1150
def remove_start(s, start):
    """Return s without the prefix start, or s unchanged if it has none."""
    return s[len(start):] if s.startswith(start) else s
1155
1156
def remove_end(s, end):
    """Return s without the suffix end, or s unchanged if it has none."""
    # An empty suffix must be skipped explicitly: every string "ends with"
    # '' and s[:-0] is the empty string, not s.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1161
1162
def url_basename(url):
    """Return the last '/'-separated segment of the path of url."""
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
1166
1167
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        """Return the HTTP method name used for this request."""
        return 'HEAD'
1171
1172
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default for None or ''.

    If get_attr is given and v is not None, the attribute of that name
    (None when missing) is converted instead of v itself."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1180
1181
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1184
1185
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before conversion;
    None is passed through. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
1192
1193
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1196
1197
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min 5 s' or '12.5' to seconds.

    Returns None for None or unparsable input. The result is an int
    unless the input has a fractional seconds part."""
    if s is None:
        return None

    mobj = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if mobj is None:
        return None

    parts = mobj.groupdict()
    duration = int(parts['secs'])
    if parts['mins']:
        duration += 60 * int(parts['mins'])
        if parts['hours']:
            duration += 60 * 60 * int(parts['hours'])
    if parts['ms']:
        duration += float(parts['ms'])
    return duration
1216
1217
def prepend_extension(filename, ext):
    """Insert ext before the real extension: ('a.mp4', 'tmp') -> 'a.tmp.mp4'."""
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1221
1222
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False if the binary cannot be started. """
    cmd = [exe] + args
    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1231
1232
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized=u'present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The version is group 1 of version_re searched in the first line of the
    combined stdout/stderr output of running exe with args. If the pattern
    does not match, unrecognized is returned.

    The hyphen in the default pattern is placed first in the character
    class so that it is a literal; written as '_-a' it formed a range and
    versions such as '1.2-beta3' were cut off at the hyphen. """
    try:
        out = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()[0]
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    else:
        return unrecognized
1250
1251
class PagedList(object):
    """Base class for paged lists; getslice() is expected from subclasses."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1256
1257
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily while a slice is collected.

    pagefunc(pagenum) returns an iterable with the entries of the
    zero-based page pagenum; pagesize is the number of entries on a full
    page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices start <= i < end as a list.

        end=None means "up to the last entry". Pages are requested in
        order until the slice is complete or a page comes back short.
        """
        res = []
        # An empty slice needs no page at all. Without this guard a request
        # like getslice(10, 10) with pagesize 5 took the whole first page
        # and then kept fetching until the last page.
        if end is not None and end <= start:
            return res
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry, if the slice ends on
            # this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1299
1300
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known up front.

    pagefunc(pagenum) returns an iterable with the entries of the
    zero-based page pagenum."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (excluding) end."""
        first_page = start // self._pagesize
        # Entries to drop from the front of the first fetched page
        to_skip = start - first_page * self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start

        collected = []
        for pagenum in range(first_page, stop_page):
            entries = list(self._pagefunc(pagenum))
            if to_skip:
                entries = entries[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
1328
1329
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape in s by the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(mobj):
        text, _consumed = decode(mobj.group(0))
        return text

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1336
1337
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that quote() must leave untouched
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
1343
1344
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    parsed = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parsed.path)
    params = escape_rfc3986(parsed.params)
    query = escape_rfc3986(parsed.query)
    fragment = escape_rfc3986(parsed.fragment)
    escaped = parsed._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1354
# struct_pack / struct_unpack accept a text format string on every
# supported Python version.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # Text format strings work natively; use the plain functions
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1371
1372
def read_batch_urls(batch_fd):
    """Read the URLs from the batch file object batch_fd and close it.

    Each line is decoded as UTF-8 if it is not text yet, freed of a
    leading byte order mark and stripped. Empty lines and lines starting
    with '#', ';' or ']' are skipped. Returns the list of remaining lines.
    """
    # A UTF-8 BOM shows up as U+FEFF when the data was decoded as UTF-8, and
    # as the three characters below when its bytes were decoded one by one.
    # Only the latter form used to be removed, which left U+FEFF glued to
    # the first URL of properly decoded files.
    BOMS = (u'\ufeff', u'\xef\xbb\xbf')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1387
1388
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1391
1392
# etree_iter(node) iterates over an element tree node and its descendants
etree_iter = getattr(xml.etree.ElementTree.Element, 'iter', None)
if etree_iter is None:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1397
1398
def parse_xml(s):
    """Parse the XML text s and return its root element.

    Doctype declarations are ignored. On Python 2, byte string node texts
    are decoded to text."""
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = etree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return tree
1414
1415
# Maps US content rating labels to a numeric age limit; used as the
# fallback lookup in parse_age_limit.
# NOTE(review): 'NC' presumably stands for NC-17 -- confirm against callers.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1423
1424
def parse_age_limit(s):
    """Return the age limit denoted by s as an int, or None.

    s is either one or two digits with an optional trailing '+', or a
    key of US_RATINGS."""
    if s is None:
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    return US_RATINGS.get(s, None)
1430
1431
def strip_jsonp(code):
    """Return the argument text of a JSONP call such as 'callback({...});'.

    Text that does not look like such a call is returned unchanged."""
    jsonp_call = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return jsonp_call.sub(r'\1', code)
1434
1435
def js_to_json(code):
    """Convert JavaScript object literal code to JSON text.

    Single-quoted strings become double-quoted, bare identifiers other
    than true/false/null are quoted, and a comma directly before a
    closing ']' is removed."""
    # Rewrites applied inside single-quoted strings
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda esc: requote[esc.group(0)], token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), converted)
1459
1460
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps an id to its index in quality_ids,
    or to -1 if the id is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1469
1470
# Default output template: a %-style format string with the named fields
# title, id and ext.
# NOTE(review): presumably the default for output file names -- confirm
# against the callers.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1472
1473
def limit_length(s, length):
    """ Shorten overly long strings to length characters, ending in '...'

    Strings of at most length characters (and None) are returned
    unchanged. If length leaves no room for the ellipsis, the string is
    simply cut to length characters. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    if keep < 0:
        # A negative slice bound would count from the end of s and give a
        # result longer than requested
        return s[:max(length, 0)]
    return s[:keep] + ELLIPSES
1482
1483
def version_tuple(v):
    """Split the dotted version string v into a list of ints."""
    return list(map(int, v.split('.')))
1486
1487
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    For a missing or unparsable version the answer is the opposite of
    assume_new."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current = version_tuple(version)
        minimum = version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum