_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import gzip
  14 import itertools
  15 import io
  16 import json
  17 import locale
  18 import math
  19 import os
  20 import pipes
  21 import platform
  22 import re
  23 import ssl
  24 import socket
  25 import struct
  26 import subprocess
  27 import sys
  28 import tempfile
  29 import traceback
  30 import xml.etree.ElementTree
  31 import zlib
  32
  33 from .compat import (
  34     compat_chr,
  35     compat_getenv,
  36     compat_html_entities,
  37     compat_html_parser,
  38     compat_parse_qs,
  39     compat_str,
  40     compat_urllib_error,
  41     compat_urllib_parse,
  42     compat_urllib_parse_urlparse,
  43     compat_urllib_request,
  44     compat_urlparse,
  45 )
  46
  47
# The re module does not expose the type of a compiled pattern under a
# portable name, so take it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request() adds each of
# these to every request that does not already carry a header of that name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  58
  59 def preferredencoding():
  60     """Get preferred encoding.
  61
  62     Returns the best encoding scheme for the system, based on
  63     locale.getpreferredencoding() and some further tweaks.
  64     """
  65     try:
  66         pref = locale.getpreferredencoding()
  67         u'TEST'.encode(pref)
  68     except:
  69         pref = 'UTF-8'
  70
  71     return pref
  72
  73
  74 def write_json_file(obj, fn):
  75     """ Encode obj as JSON and write it to fn, atomically """
  76
  77     args = {
  78         'suffix': '.tmp',
  79         'prefix': os.path.basename(fn) + '.',
  80         'dir': os.path.dirname(fn),
  81         'delete': False,
  82     }
  83
  84     # In Python 2.x, json.dump expects a bytestream.
  85     # In Python 3.x, it writes to a character stream
  86     if sys.version_info < (3, 0):
  87         args['mode'] = 'wb'
  88     else:
  89         args.update({
  90             'mode': 'w',
  91             'encoding': 'utf-8',
  92         })
  93
  94     tf = tempfile.NamedTemporaryFile(**args)
  95
  96     try:
  97         with tf:
  98             json.dump(obj, tf)
  99         os.rename(tf.name, fn)
 100     except:
 101         try:
 102             os.remove(tf.name)
 103         except OSError:
 104             pass
 105         raise
 106
 107
 108 if sys.version_info >= (2, 7):
 109     def find_xpath_attr(node, xpath, key, val):
 110         """ Find the xpath xpath[@key=val] """
 111         assert re.match(r'^[a-zA-Z-]+$', key)
 112         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 113         expr = xpath + u"[@%s='%s']" % (key, val)
 114         return node.find(expr)
 115 else:
 116     def find_xpath_attr(node, xpath, key, val):
 117         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 118         # .//node does not match if a node is a direct child of . !
 119         if isinstance(xpath, unicode):
 120             xpath = xpath.encode('ascii')
 121
 122         for f in node.findall(xpath):
 123             if f.attrib.get(key) == val:
 124                 return f
 125         return None
 126
 127 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 128 # the namespace parameter
 129 def xpath_with_ns(path, ns_map):
 130     components = [c.split(':') for c in path.split('/')]
 131     replaced = []
 132     for c in components:
 133         if len(c) == 1:
 134             replaced.append(c[0])
 135         else:
 136             ns, tag = c
 137             replaced.append('{%s}%s' % (ns_map[ns], tag))
 138     return '/'.join(replaced)
 139
 140
 141 def xpath_text(node, xpath, name=None, fatal=False):
 142     if sys.version_info < (2, 7):  # Crazy 2.6
 143         xpath = xpath.encode('ascii')
 144
 145     n = node.find(xpath)
 146     if n is None:
 147         if fatal:
 148             name = xpath if name is None else name
 149             raise ExtractorError('Could not find XML element %s' % name)
 150         else:
 151             return None
 152     return n.text
 153
 154
if sys.version_info < (2, 7):
    # On Python 2.6 only: replace the HTML parser module's module-level
    # start-tag-end locator regex with the pattern below (a backported
    # bugfix, as the trailing comment says).
    compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 157
 158 class BaseHTMLParser(compat_html_parser.HTMLParser):
 159     def __init(self):
 160         compat_html_parser.HTMLParser.__init__(self)
 161         self.html = None
 162
 163     def loads(self, html):
 164         self.html = html
 165         self.feed(html)
 166         self.close()
 167
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    While parsing, self.result grows into [tag, start_pos, end_pos], where
    the positions are the (line, column) pairs returned by getpos().
    get_result() then cuts the corresponding text out of the document.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # None until the wanted tag is seen, then [tag, start_pos, end_pos]
        self.result = None
        # True while the parser is inside the wanted element
        self.started = False
        # Per-tag-name count of start tags minus end tags seen since the match
        self.depth = {}
        # True between the matching start tag and the next parser event
        self.watch_startpos = False
        # Number of parse errors skipped so far (see error())
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Skip past a parse error, unless inside the result or after too many errors"""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        # Drop everything up to and including the current line and resume
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        """Start tracking on the matching tag; count start tags while tracking"""
        attrs = dict(attrs)
        if self.started:
            # A child tag right after the matching tag marks the content start
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Count end tags; record the end position once the wanted tag is balanced"""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # self.result[0] is the name of the tag that matched
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event also just records the start position if pending
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None if no complete [tag, start, end] result exists"""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Position line numbers are 1-based: keep start line through end line
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        # Cut off everything before the start column on the first line
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line that was already shifted left by the
            # start column, so the end column has to be shifted as well
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On Python < 2.7.3 only: when the remaining input starts with the literal
# text </scr'+'ipt> , step over it by its length instead of handing it to
# HTMLParser.parse_endtag; everything else is parsed as usual.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 233
 234 def get_element_by_id(id, html):
 235     """Return the content of the tag with the specified ID in the passed HTML document"""
 236     return get_element_by_attribute("id", id, html)
 237
 238 def get_element_by_attribute(attribute, value, html):
 239     """Return the content of the tag with the specified attribute in the passed HTML document"""
 240     parser = AttrParser(attribute, value)
 241     try:
 242         parser.loads(html)
 243     except compat_html_parser.HTMLParseError:
 244         pass
 245     return parser.get_result()
 246
 247 class MetaParser(BaseHTMLParser):
 248     """
 249     Modified HTMLParser that isolates a meta tag with the specified name
 250     attribute.
 251     """
 252     def __init__(self, name):
 253         BaseHTMLParser.__init__(self)
 254         self.name = name
 255         self.content = None
 256         self.result = None
 257
 258     def handle_starttag(self, tag, attrs):
 259         if tag != 'meta':
 260             return
 261         attrs = dict(attrs)
 262         if attrs.get('name') == self.name:
 263             self.result = attrs.get('content')
 264
 265     def get_result(self):
 266         return self.result
 267
 268 def get_meta_content(name, html):
 269     """
 270     Return the content attribute from the meta tag with the given name attribute.
 271     """
 272     parser = MetaParser(name)
 273     try:
 274         parser.loads(html)
 275     except compat_html_parser.HTMLParseError:
 276         pass
 277     return parser.get_result()
 278
 279
 280 def clean_html(html):
 281     """Clean an HTML snippet into a readable string"""
 282     # Newline vs <br />
 283     html = html.replace('\n', ' ')
 284     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 285     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 286     # Strip html tags
 287     html = re.sub('<.*?>', '', html)
 288     # Replace html entities
 289     html = unescapeHTML(html)
 290     return html.strip()
 291
 292
 293 def sanitize_open(filename, open_mode):
 294     """Try to open the given filename, and slightly tweak it if this fails.
 295
 296     Attempts to open the given filename. If this fails, it tries to change
 297     the filename slightly, step by step, until it's either able to open it
 298     or it fails and raises a final exception, like the standard open()
 299     function.
 300
 301     It returns the tuple (stream, definitive_file_name).
 302     """
 303     try:
 304         if filename == u'-':
 305             if sys.platform == 'win32':
 306                 import msvcrt
 307                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 308             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 309         stream = open(encodeFilename(filename), open_mode)
 310         return (stream, filename)
 311     except (IOError, OSError) as err:
 312         if err.errno in (errno.EACCES,):
 313             raise
 314
 315         # In case of error, try to remove win32 forbidden chars
 316         alt_filename = os.path.join(
 317                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 318                         for path_part in os.path.split(filename)
 319                        )
 320         if alt_filename == filename:
 321             raise
 322         else:
 323             # An exception here should be caught in the caller
 324             stream = open(encodeFilename(filename), open_mode)
 325             return (stream, alt_filename)
 326
 327
 328 def timeconvert(timestr):
 329     """Convert RFC 2822 defined time string into system timestamp"""
 330     timestamp = None
 331     timetuple = email.utils.parsedate_tz(timestr)
 332     if timetuple is not None:
 333         timestamp = email.utils.mktime_tz(timetuple)
 334     return timestamp
 335
 336 def sanitize_filename(s, restricted=False, is_id=False):
 337     """Sanitizes a string so it could be used as part of a filename.
 338     If restricted is set, use a stricter subset of allowed characters.
 339     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 340     """
 341     def replace_insane(char):
 342         if char == '?' or ord(char) < 32 or ord(char) == 127:
 343             return ''
 344         elif char == '"':
 345             return '' if restricted else '\''
 346         elif char == ':':
 347             return '_-' if restricted else ' -'
 348         elif char in '\\/|*<>':
 349             return '_'
 350         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 351             return '_'
 352         if restricted and ord(char) > 127:
 353             return '_'
 354         return char
 355
 356     result = u''.join(map(replace_insane, s))
 357     if not is_id:
 358         while '__' in result:
 359             result = result.replace('__', '_')
 360         result = result.strip('_')
 361         # Common case of "Foreign band name - English song title"
 362         if restricted and result.startswith('-_'):
 363             result = result[2:]
 364         if not result:
 365             result = '_'
 366     return result
 367
 368 def orderedSet(iterable):
 369     """ Remove all duplicates from the input iterable """
 370     res = []
 371     for el in iterable:
 372         if el not in res:
 373             res.append(el)
 374     return res
 375
 376
 377 def _htmlentity_transform(entity):
 378     """Transforms an HTML entity to a character."""
 379     # Known non-numeric HTML entity
 380     if entity in compat_html_entities.name2codepoint:
 381         return compat_chr(compat_html_entities.name2codepoint[entity])
 382
 383     mobj = re.match(r'#(x?[0-9]+)', entity)
 384     if mobj is not None:
 385         numstr = mobj.group(1)
 386         if numstr.startswith(u'x'):
 387             base = 16
 388             numstr = u'0%s' % numstr
 389         else:
 390             base = 10
 391         return compat_chr(int(numstr, base))
 392
 393     # Unknown entity in name, return its literal representation
 394     return (u'&%s;' % entity)
 395
 396
 397 def unescapeHTML(s):
 398     if s is None:
 399         return None
 400     assert type(s) == compat_str
 401
 402     return re.sub(
 403         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 404
 405
 406 def encodeFilename(s, for_subprocess=False):
 407     """
 408     @param s The name of the file
 409     """
 410
 411     assert type(s) == compat_str
 412
 413     # Python 3 has a Unicode API
 414     if sys.version_info >= (3, 0):
 415         return s
 416
 417     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 418         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 419         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 420         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 421         if not for_subprocess:
 422             return s
 423         else:
 424             # For subprocess calls, encode with locale encoding
 425             # Refer to http://stackoverflow.com/a/9951851/35070
 426             encoding = preferredencoding()
 427     else:
 428         encoding = sys.getfilesystemencoding()
 429     if encoding is None:
 430         encoding = 'utf-8'
 431     return s.encode(encoding, 'ignore')
 432
 433
 434 def encodeArgument(s):
 435     if not isinstance(s, compat_str):
 436         # Legacy code that uses byte strings
 437         # Uncomment the following line after fixing all post processors
 438         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 439         s = s.decode('ascii')
 440     return encodeFilename(s, True)
 441
 442
 443 def decodeOption(optval):
 444     if optval is None:
 445         return optval
 446     if isinstance(optval, bytes):
 447         optval = optval.decode(preferredencoding())
 448
 449     assert isinstance(optval, compat_str)
 450     return optval
 451
 452 def formatSeconds(secs):
 453     if secs > 3600:
 454         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 455     elif secs > 60:
 456         return '%d:%02d' % (secs // 60, secs % 60)
 457     else:
 458         return '%d' % secs
 459
 460
 461 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 462     if sys.version_info < (3, 2):
 463         import httplib
 464
 465         class HTTPSConnectionV3(httplib.HTTPSConnection):
 466             def __init__(self, *args, **kwargs):
 467                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 468
 469             def connect(self):
 470                 sock = socket.create_connection((self.host, self.port), self.timeout)
 471                 if getattr(self, '_tunnel_host', False):
 472                     self.sock = sock
 473                     self._tunnel()
 474                 try:
 475                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 476                 except ssl.SSLError:
 477                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 478
 479         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 480             def https_open(self, req):
 481                 return self.do_open(HTTPSConnectionV3, req)
 482         return HTTPSHandlerV3(**kwargs)
 483     elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
 484         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 485         context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
 486         if opts_no_check_certificate:
 487             context.verify_mode = ssl.CERT_NONE
 488         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 489     else:  # Python < 3.4
 490         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 491         context.verify_mode = (ssl.CERT_NONE
 492                                if opts_no_check_certificate
 493                                else ssl.CERT_REQUIRED)
 494         context.set_default_verify_paths()
 495         try:
 496             context.load_default_certs()
 497         except AttributeError:
 498             pass  # Python < 3.4
 499         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 500
 501 class ExtractorError(Exception):
 502     """Error during info extraction."""
 503     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 504         """ tb, if given, is the original traceback (so that it can be printed out).
 505         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 506         """
 507
 508         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 509             expected = True
 510         if video_id is not None:
 511             msg = video_id + ': ' + msg
 512         if cause:
 513             msg += u' (caused by %r)' % cause
 514         if not expected:
 515             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 516         super(ExtractorError, self).__init__(msg)
 517
 518         self.traceback = tb
 519         self.exc_info = sys.exc_info()  # preserve original exception
 520         self.cause = cause
 521         self.video_id = video_id
 522
 523     def format_traceback(self):
 524         if self.traceback is None:
 525             return None
 526         return u''.join(traceback.format_tb(self.traceback))
 527
 528
 529 class RegexNotFoundError(ExtractorError):
 530     """Error when a regex didn't match"""
 531     pass
 532
 533
 534 class DownloadError(Exception):
 535     """Download Error exception.
 536
 537     This exception may be thrown by FileDownloader objects if they are not
 538     configured to continue on errors. They will contain the appropriate
 539     error message.
 540     """
 541     def __init__(self, msg, exc_info=None):
 542         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 543         super(DownloadError, self).__init__(msg)
 544         self.exc_info = exc_info
 545
 546
 547 class SameFileError(Exception):
 548     """Same File exception.
 549
 550     This exception will be thrown by FileDownloader objects if they detect
 551     multiple files would have to be downloaded to the same file on disk.
 552     """
 553     pass
 554
 555
 556 class PostProcessingError(Exception):
 557     """Post Processing exception.
 558
 559     This exception may be raised by PostProcessor's .run() method to
 560     indicate an error in the postprocessing task.
 561     """
 562     def __init__(self, msg):
 563         self.msg = msg
 564
 565 class MaxDownloadsReached(Exception):
 566     """ --max-downloads limit has been reached. """
 567     pass
 568
 569
 570 class UnavailableVideoError(Exception):
 571     """Unavailable Format exception.
 572
 573     This exception will be thrown when a video is requested
 574     in a format that is not available for that video.
 575     """
 576     pass
 577
 578
 579 class ContentTooShortError(Exception):
 580     """Content Too Short exception.
 581
 582     This exception may be raised by FileDownloader objects when a file they
 583     download is too small for what the server announced first, indicating
 584     the connection was probably interrupted.
 585     """
 586     # Both in bytes
 587     downloaded = None
 588     expected = None
 589
 590     def __init__(self, downloaded, expected):
 591         self.downloaded = downloaded
 592         self.expected = expected
 593
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request. In the same way, a
    "Youtubedl-User-Agent" header replaces the User-Agent of the request
    and is itself removed.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, first as a raw stream, then with zlib header"""
        try:
            # Negative window bits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code"""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        # addinfourl without getcode does not get the code passed to its
        # constructor here, so set it as an attribute afterwards
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers and process the Youtubedl-* pseudo headers"""
        for h, v in std_headers.items():
            if h not in req.headers:
                req.add_header(h, v)
        # Header names are looked up in req.headers with only their first
        # letter in upper case
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # Cut the fragment off the request's private URL attributes
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body transparently decompressed"""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; the first
                # length that decompresses cleanly wins
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic is processed exactly like HTTP traffic
    https_request = http_request
    https_response = http_response
 679
 680
 681 def parse_iso8601(date_str, delimiter='T'):
 682     """ Return a UNIX timestamp from the given date """
 683
 684     if date_str is None:
 685         return None
 686
 687     m = re.search(
 688         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 689         date_str)
 690     if not m:
 691         timezone = datetime.timedelta()
 692     else:
 693         date_str = date_str[:-len(m.group(0))]
 694         if not m.group('sign'):
 695             timezone = datetime.timedelta()
 696         else:
 697             sign = 1 if m.group('sign') == '+' else -1
 698             timezone = datetime.timedelta(
 699                 hours=sign * int(m.group('hours')),
 700                 minutes=sign * int(m.group('minutes')))
 701     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 702     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 703     return calendar.timegm(dt.timetuple())
 704
 705
 706 def unified_strdate(date_str):
 707     """Return a string with the date in the format YYYYMMDD"""
 708
 709     if date_str is None:
 710         return None
 711
 712     upload_date = None
 713     #Replace commas
 714     date_str = date_str.replace(',', ' ')
 715     # %z (UTC offset) is only supported in python>=3.2
 716     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 717     format_expressions = [
 718         '%d %B %Y',
 719         '%d %b %Y',
 720         '%B %d %Y',
 721         '%b %d %Y',
 722         '%b %dst %Y %I:%M%p',
 723         '%b %dnd %Y %I:%M%p',
 724         '%b %dth %Y %I:%M%p',
 725         '%Y-%m-%d',
 726         '%Y/%m/%d',
 727         '%d.%m.%Y',
 728         '%d/%m/%Y',
 729         '%d/%m/%y',
 730         '%Y/%m/%d %H:%M:%S',
 731         '%d/%m/%Y %H:%M:%S',
 732         '%Y-%m-%d %H:%M:%S',
 733         '%Y-%m-%d %H:%M:%S.%f',
 734         '%d.%m.%Y %H:%M',
 735         '%d.%m.%Y %H.%M',
 736         '%Y-%m-%dT%H:%M:%SZ',
 737         '%Y-%m-%dT%H:%M:%S.%fZ',
 738         '%Y-%m-%dT%H:%M:%S.%f0Z',
 739         '%Y-%m-%dT%H:%M:%S',
 740         '%Y-%m-%dT%H:%M:%S.%f',
 741         '%Y-%m-%dT%H:%M',
 742     ]
 743     for expression in format_expressions:
 744         try:
 745             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 746         except ValueError:
 747             pass
 748     if upload_date is None:
 749         timetuple = email.utils.parsedate_tz(date_str)
 750         if timetuple:
 751             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 752     return upload_date
 753
 754 def determine_ext(url, default_ext=u'unknown_video'):
 755     if url is None:
 756         return default_ext
 757     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 758     if re.match(r'^[A-Za-z0-9]+$', guess):
 759         return guess
 760     else:
 761         return default_ext
 762
 763 def subtitles_filename(filename, sub_lang, sub_format):
 764     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 765
 766 def date_from_str(date_str):
 767     """
 768     Return a datetime object from a string in the format YYYYMMDD or
 769     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 770     today = datetime.date.today()
 771     if date_str == 'now'or date_str == 'today':
 772         return today
 773     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 774     if match is not None:
 775         sign = match.group('sign')
 776         time = int(match.group('time'))
 777         if sign == '-':
 778             time = -time
 779         unit = match.group('unit')
 780         #A bad aproximation?
 781         if unit == 'month':
 782             unit = 'day'
 783             time *= 30
 784         elif unit == 'year':
 785             unit = 'day'
 786             time *= 365
 787         unit += 's'
 788         delta = datetime.timedelta(**{unit: time})
 789         return today + delta
 790     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 791
 792 def hyphenate_date(date_str):
 793     """
 794     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 795     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 796     if match is not None:
 797         return '-'.join(match.groups())
 798     else:
 799         return date_str
 800
 801 class DateRange(object):
 802     """Represents a time interval between two dates"""
 803     def __init__(self, start=None, end=None):
 804         """start and end must be strings in the format accepted by date"""
 805         if start is not None:
 806             self.start = date_from_str(start)
 807         else:
 808             self.start = datetime.datetime.min.date()
 809         if end is not None:
 810             self.end = date_from_str(end)
 811         else:
 812             self.end = datetime.datetime.max.date()
 813         if self.start > self.end:
 814             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 815     @classmethod
 816     def day(cls, day):
 817         """Returns a range that only contains the given day"""
 818         return cls(day,day)
 819     def __contains__(self, date):
 820         """Check if the date is in the range"""
 821         if not isinstance(date, datetime.date):
 822             date = date_from_str(date)
 823         return self.start <= date <= self.end
 824     def __str__(self):
 825         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 826
 827
 828 def platform_name():
 829     """ Returns the platform name as a compat_str """
 830     res = platform.platform()
 831     if isinstance(res, bytes):
 832         res = res.decode(preferredencoding())
 833
 834     assert isinstance(res, compat_str)
 835     return res
 836
 837
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the Win32 WriteConsoleW call, used only when out is
    backed by file descriptor 1 or 2 and that descriptor's standard handle
    passes the console checks below.  Raises OSError if WriteConsoleW reports
    a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> identifier handed to GetStdHandle
    # (-11 / -12 are STD_OUTPUT_HANDLE / STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Build typed prototypes for the kernel32 functions used below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Out-parameter that WriteConsoleW fills in on every call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid/missing, when its file type (with
        # the FILE_TYPE_REMOTE bit masked out) is not FILE_TYPE_CHAR, or when
        # GetConsoleMode returns 0 for it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, stopping before any non-BMP
    # character; such a character is then written on its own with a count of 2
    # (presumably its UTF-16 surrogate pair - see the assert below)
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Drop only what was actually written; the rest is retried
            s = s[written.value:]
    return True
 908
 909
 910 def write_string(s, out=None, encoding=None):
 911     if out is None:
 912         out = sys.stderr
 913     assert type(s) == compat_str
 914
 915     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 916         if _windows_write_string(s, out):
 917             return
 918
 919     if ('b' in getattr(out, 'mode', '') or
 920             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 921         byt = s.encode(encoding or preferredencoding(), 'ignore')
 922         out.write(byt)
 923     elif hasattr(out, 'buffer'):
 924         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 925         byt = s.encode(enc, 'ignore')
 926         out.buffer.write(byt)
 927     else:
 928         out.write(s)
 929     out.flush()
 930
 931
 932 def bytes_to_intlist(bs):
 933     if not bs:
 934         return []
 935     if isinstance(bs[0], int):  # Python 3
 936         return list(bs)
 937     else:
 938         return [ord(c) for c in bs]
 939
 940
 941 def intlist_to_bytes(xs):
 942     if not xs:
 943         return b''
 944     if isinstance(chr(0), bytes):  # Python 2
 945         return ''.join([chr(x) for x in xs])
 946     else:
 947         return bytes(xs)
 948
 949
# Cross-platform file locking
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) - lock the open file f (exclusive or shared)
#   _unlock_file(f)          - release the lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes mirror of the structure passed as the last argument of
    # LockFileEx / UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count handed to LockFileEx/UnlockFileEx;
    # combined with the zero offset set below they are meant to cover the
    # whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f via LockFileEx; raises OSError if the call fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same
        # structure back
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 requests an exclusive lock (LOCKFILE_EXCLUSIVE_LOCK in the
        # Win32 API); 0x0 a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        # Only valid after _lock_file has been called on this file object
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f with flock(): LOCK_EX when exclusive, LOCK_SH otherwise."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock() lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1013
1014
class locked_file(object):
    """File wrapper that holds a lock on the file while used in a with block.

    The file is opened immediately by the constructor.  Entering the context
    takes a shared lock for mode 'r' and an exclusive lock for 'a' and 'w';
    leaving it releases the lock and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        """Open filename with io.open; mode must be 'r', 'a' or 'w'."""
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is a different class
            # from IOError on Python 2; close the file in both cases so the
            # handle does not leak when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Always close the file, even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the underlying file's read()."""
        return self.f.read(*args)
1044
1045
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1049
1050
def shell_quote(args):
    """Return args as one space-separated string, each item quoted for a shell."""
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return u' '.join([pipes.quote(as_text(arg)) for arg in args])
1060
1061
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that stops the
        iteration (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1069
1070
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as a URL-encoded
    fragment under the key __youtubedl_smuggle. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1077
1078
def unsmuggle_url(smug_url, default=None):
    """Split a URL carrying smuggled data into (url, data).

    URLs without the '#__youtubedl_smuggle' marker are returned untouched,
    paired with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    payload = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1086
1087
def format_bytes(bytes):
    """Format a byte count as a string with a binary (1024-based) suffix.

    bytes may be None (giving 'N/A'), a number, or a text/byte string
    holding a number.  The result has two decimals, e.g. '1.50KiB'.
    Magnitudes below 1 (including 0) are reported in 'B', negative values
    keep their sign, and values beyond the largest suffix are reported in
    'YiB' instead of failing.
    """
    if bytes is None:
        return u'N/A'
    # The parameter shadows the builtin, hence type(b'') for the bytes type
    if isinstance(bytes, (type(b''), type(u''))):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    magnitude = abs(bytes)
    if magnitude < 1:
        # log() is undefined for 0 and negative for fractions
        exponent = 0
    else:
        exponent = min(int(math.log(magnitude, 1024.0)), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1100
1101
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    A non-empty COLUMNS environment variable takes precedence; otherwise the
    second field of the output of `stty size` is used.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output), but do
        # not swallow KeyboardInterrupt/SystemExit like a bare except would
        return None
1116
1117
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Unknown names give None. """
    english_names = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name in english_names:
        return english_names.index(name) + 1
    return None
1128
1129
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity by '&amp;' in XML"""
    # An ampersand followed by one of the predefined entities or by a numeric
    # character reference is left alone
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub(u'&amp;', xml_str)
1136
1137
def setproctitle(title):
    """Set the process name to title through libc's prctl, where available.

    title must be a compat_str.  Nothing happens when libc.so.6 cannot be
    loaded or has no prctl symbol.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # item for the terminating NUL; a buffer sized exactly len(title_bytes)
    # would hand prctl an unterminated string
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1151
1152
def remove_start(s, start):
    """Return s without the prefix start; s is returned as-is if it lacks it."""
    return s[len(start):] if s.startswith(start) else s
1157
1158
def remove_end(s, end):
    """Return s without the suffix end; s is returned as-is if it lacks it.

    An empty end leaves s untouched (s[:-0] would otherwise be empty).
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1163
1164
def url_basename(url):
    """Return the last path segment of url (surrounding slashes ignored)."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip(u'/').split(u'/')
    return segments[-1]
1168
1169
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD"""

    def get_method(self):
        """Report "HEAD" as the method of this request."""
        return "HEAD"
1173
1174
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default when there is no value.

    With get_attr set, the value is first read from that attribute of v
    (a missing attribute counts as no value).  None and '' count as no
    value.  The integer is multiplied by invscale and floor-divided by
    scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1182
1183
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1186
1187
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are dropped before conversion; None is
    passed through. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
1194
1195
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1198
1199
def parse_duration(s):
    """Parse a duration such as '1:23:45', '3 min 10s' or '9.5' into seconds.

    Returns None for None or for text that does not match; the result is a
    float only when a fractional part is present.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    # The pattern only allows hours when minutes are present as well
    hours, mins, fraction = m.group('hours'), m.group('mins'), m.group('ms')
    total = int(m.group('secs'))
    total += int(mins or 0) * 60
    total += int(hours or 0) * 60 * 60
    if fraction:
        total += float(fraction)
    return total
1218
1219
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension: 'a.mp4' -> 'a.<ext>.mp4'."""
    stem, real_ext = os.path.splitext(filename)
    pieces = (stem, ext, real_ext)
    return u'{0}.{1}{2}'.format(*pieces)
1223
1224
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when the binary cannot be started. """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1233
1234
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized=u'present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    exe is run with args and only the first line of its combined
    stdout/stderr output is searched with version_re, whose first group is
    the version.  When the line does not match, unrecognized is returned.
    The hyphen comes first in the character class of the default pattern so
    that it is a literal; written as '_-a' it formed a range and versions
    such as '2.1-beta' were cut at the hyphen. """
    try:
        out, _ = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    return unrecognized
1252
1253
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1258
1259
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one at a time, only as far as needed.

    pagefunc is called with a zero-based page number and must return an
    iterable with that page's entries; pagesize is the number of entries a
    full page holds.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return a list with the entries from index start up to, but not
        including, end (or up to the last available entry when end is None).

        Pages are requested in order starting with the one containing start;
        fetching stops at the first page that is not full or once the page
        holding end has been consumed."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Indices of the first entry of this page and of the next one
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Skip pages that end before start (not expected to trigger, as
            # iteration begins at the page containing start)
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start inside this page; 0 for every later page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry when end falls inside
            # this page; None means "take the page to its end"
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1301
1302
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known in advance.

    pagefunc is called with a zero-based page number and must return an
    iterable with that page's entries.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return a list with the entries from index start up to, but not
        including, end (all remaining pages when end is None)."""
        first_page, to_skip = divmod(start, self._pagesize)
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start

        collected = []
        for pagenum in range(first_page, stop_page):
            entries = list(self._pagefunc(pagenum))
            if to_skip:
                # Only the first fetched page starts before `start`
                entries = entries[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the requested slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
1330
1331
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape (8 hex digits) in s by the
    character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, consumed length) pair
        text, _ = decode(match.group(0))
        return text

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1338
1339
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, unicode text is UTF-8 encoded before quoting
    is_py2_unicode = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_unicode:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1345
1346
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1356
# Probe whether struct accepts a text format string on this interpreter
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that also accepts a text format string."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that also accepts a text format string."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # Text format strings work: use the library functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1373
1374
def read_batch_urls(batch_fd):
    """Return the list of URLs found in the file object batch_fd.

    Each line is one entry.  Byte lines are decoded as UTF-8, a leading byte
    order mark is removed and surrounding whitespace is stripped.  Empty
    lines and lines starting with '#', ';' or ']' are skipped.  batch_fd is
    closed before returning.
    """
    # A BOM shows up as U+FEFF once UTF-8 has been decoded, or as the three
    # characters below when the BOM bytes were read through a single-byte
    # encoding such as Latin-1
    BOMS = (u'\ufeff', u'\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1389
1390
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1393
1394
# Element.iter is missing on Python <=2.6; fall back to a findall query there
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:
    def etree_iter(n):
        return n.findall('.//*')
1399
1400
def parse_xml(s):
    """Parse the XML text s and return the root element.

    Doctype declarations are ignored.  On Python 2, element texts that come
    back as byte strings are decoded as UTF-8.
    """
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    xml_kwargs = {}
    if sys.version_info >= (2, 7):
        # The custom parser is only passed on from Python 2.7 on
        xml_kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **xml_kwargs)

    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
1416
1417
# Age limit that parse_age_limit returns for each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Return the age limit described by s as an int, or None.

    s is either one or two digits with an optional trailing '+' (e.g.
    '18+') or one of the labels in US_RATINGS; anything else, including
    None, gives None.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1432
1433
def strip_jsonp(code):
    """Return the payload of a JSONP call such as 'callback({...});'.

    Text that does not look like a single callback invocation is returned
    unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_call, r'\1', code)
1436
1437
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers are turned into double-quoted
    strings (true, false and null are kept), and a comma directly before a
    closing ']' is removed.
    """
    # What each escape found inside a single-quoted string becomes once the
    # string is wrapped in double quotes
    requoted = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda esc: requoted[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop a trailing comma in front of a closing bracket
    return re.sub(r',(\s*\])', lambda m: m.group(1), converted)
1461
1462
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps an id to its position in quality_ids, or to
    -1 when the id is not listed. """
    def rank(quality_id):
        try:
            position = quality_ids.index(quality_id)
        except ValueError:
            position = -1
        return position
    return rank
1471
1472
# %-style mapping template built from the title, id and ext fields
# (presumably the default output file name template - confirm against callers)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1474
1475
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that the result, including the
    trailing '...', is length characters long; None is passed through. """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ellipses = '...'
    return s[:length - len(ellipses)] + ellipses
1484
1485
def version_tuple(v):
    """Split a dotted version string into a list of ints ('1.2.3' -> [1, 2, 3])."""
    return list(map(int, v.split('.')))
1488
1489
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    Both are dotted version strings compared through version_tuple.  When
    version is empty or either string cannot be parsed, the answer is
    `not assume_new`.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum