2 # -*- coding: utf-8 -*-
28 import xml.etree.ElementTree
40 compat_urllib_parse_urlparse,
41 compat_urllib_request,
46 # This is not clearly defined otherwise
47 compiled_regex_type = type(re.compile(''))
50 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
51 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
52 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
53 'Accept-Encoding': 'gzip, deflate',
54 'Accept-Language': 'en-us,en;q=0.5',
57 def preferredencoding():
58 """Get preferred encoding.
60 Returns the best encoding scheme for the system, based on
61 locale.getpreferredencoding() and some further tweaks.
64 pref = locale.getpreferredencoding()
72 def write_json_file(obj, fn):
73 """ Encode obj as JSON and write it to fn, atomically """
77 'prefix': os.path.basename(fn) + '.',
78 'dir': os.path.dirname(fn),
82 # In Python 2.x, json.dump expects a bytestream.
83 # In Python 3.x, it writes to a character stream
84 if sys.version_info < (3, 0):
92 tf = tempfile.NamedTemporaryFile(**args)
97 os.rename(tf.name, fn)
106 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Locate the first element matching xpath[@key=val] below node."""
    # Only plain attribute names and simple values are supported here,
    # so reject anything the naive string interpolation could not handle.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    query = u"%s[@%s='%s']" % (xpath, key, val)
    return node.find(query)
114 def find_xpath_attr(node, xpath, key, val):
115 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
116 # .//node does not match if a node is a direct child of . !
117 if isinstance(xpath, unicode):
118 xpath = xpath.encode('ascii')
120 for f in node.findall(xpath):
121 if f.attrib.get(key) == val:
125 # On python2.6 the xml.etree.ElementTree.Element methods don't support
126 # the namespace parameter
127 def xpath_with_ns(path, ns_map):
128 components = [c.split(':') for c in path.split('/')]
132 replaced.append(c[0])
135 replaced.append('{%s}%s' % (ns_map[ns], tag))
136 return '/'.join(replaced)
139 def xpath_text(node, xpath, name=None, fatal=False):
140 if sys.version_info < (2, 7): # Crazy 2.6
141 xpath = xpath.encode('ascii')
146 name = xpath if name is None else name
147 raise ExtractorError('Could not find XML element %s' % name)
153 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
154 class BaseHTMLParser(compat_html_parser.HTMLParser):
156 compat_html_parser.HTMLParser.__init__(self)
159 def loads(self, html):
164 class AttrParser(BaseHTMLParser):
165 """Modified HTMLParser that isolates a tag with the specified attribute"""
166 def __init__(self, attribute, value):
167 self.attribute = attribute
172 self.watch_startpos = False
174 BaseHTMLParser.__init__(self)
176 def error(self, message):
177 if self.error_count > 10 or self.started:
178 raise compat_html_parser.HTMLParseError(message, self.getpos())
179 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
180 self.error_count += 1
183 def handle_starttag(self, tag, attrs):
186 self.find_startpos(None)
187 if self.attribute in attrs and attrs[self.attribute] == self.value:
190 self.watch_startpos = True
192 if not tag in self.depth: self.depth[tag] = 0
195 def handle_endtag(self, tag):
197 if tag in self.depth: self.depth[tag] -= 1
198 if self.depth[self.result[0]] == 0:
200 self.result.append(self.getpos())
202 def find_startpos(self, x):
203 """Needed to put the start position of the result (self.result[1])
204 after the opening tag with the requested id"""
205 if self.watch_startpos:
206 self.watch_startpos = False
207 self.result.append(self.getpos())
208 handle_entityref = handle_charref = handle_data = handle_comment = \
209 handle_decl = handle_pi = unknown_decl = find_startpos
211 def get_result(self):
212 if self.result is None:
214 if len(self.result) != 3:
216 lines = self.html.split('\n')
217 lines = lines[self.result[1][0]-1:self.result[2][0]]
218 lines[0] = lines[0][self.result[1][1]:]
220 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
221 lines[-1] = lines[-1][:self.result[2][1]]
222 return '\n'.join(lines).strip()
223 # Hack for https://github.com/rg3/youtube-dl/issues/662
224 if sys.version_info < (2, 7, 3):
225 AttrParser.parse_endtag = (lambda self, i:
226 i + len("</scr'+'ipt>")
227 if self.rawdata[i:].startswith("</scr'+'ipt>")
228 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Find the tag carrying the given id attribute in html and return its content.

    NOTE: the parameter name shadows the builtin id(); it is kept unchanged
    for interface compatibility with existing callers.
    """
    return get_element_by_attribute("id", id, html)
234 def get_element_by_attribute(attribute, value, html):
235 """Return the content of the tag with the specified attribute in the passed HTML document"""
236 parser = AttrParser(attribute, value)
239 except compat_html_parser.HTMLParseError:
241 return parser.get_result()
243 class MetaParser(BaseHTMLParser):
245 Modified HTMLParser that isolates a meta tag with the specified name
248 def __init__(self, name):
249 BaseHTMLParser.__init__(self)
254 def handle_starttag(self, tag, attrs):
258 if attrs.get('name') == self.name:
259 self.result = attrs.get('content')
261 def get_result(self):
264 def get_meta_content(name, html):
266 Return the content attribute from the meta tag with the given name attribute.
268 parser = MetaParser(name)
271 except compat_html_parser.HTMLParseError:
273 return parser.get_result()
276 def clean_html(html):
277 """Clean an HTML snippet into a readable string"""
279 html = html.replace('\n', ' ')
280 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
281 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
283 html = re.sub('<.*?>', '', html)
284 # Replace html entities
285 html = unescapeHTML(html)
289 def sanitize_open(filename, open_mode):
290 """Try to open the given filename, and slightly tweak it if this fails.
292 Attempts to open the given filename. If this fails, it tries to change
293 the filename slightly, step by step, until it's either able to open it
294 or it fails and raises a final exception, like the standard open()
297 It returns the tuple (stream, definitive_file_name).
301 if sys.platform == 'win32':
303 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
304 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
305 stream = open(encodeFilename(filename), open_mode)
306 return (stream, filename)
307 except (IOError, OSError) as err:
308 if err.errno in (errno.EACCES,):
311 # In case of error, try to remove win32 forbidden chars
312 alt_filename = os.path.join(
313 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
314 for path_part in os.path.split(filename)
316 if alt_filename == filename:
319 # An exception here should be caught in the caller
320 stream = open(encodeFilename(filename), open_mode)
321 return (stream, alt_filename)
324 def timeconvert(timestr):
325 """Convert RFC 2822 defined time string into system timestamp"""
327 timetuple = email.utils.parsedate_tz(timestr)
328 if timetuple is not None:
329 timestamp = email.utils.mktime_tz(timetuple)
332 def sanitize_filename(s, restricted=False, is_id=False):
333 """Sanitizes a string so it could be used as part of a filename.
334 If restricted is set, use a stricter subset of allowed characters.
335 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
337 def replace_insane(char):
338 if char == '?' or ord(char) < 32 or ord(char) == 127:
341 return '' if restricted else '\''
343 return '_-' if restricted else ' -'
344 elif char in '\\/|*<>':
346 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
348 if restricted and ord(char) > 127:
352 result = u''.join(map(replace_insane, s))
354 while '__' in result:
355 result = result.replace('__', '_')
356 result = result.strip('_')
357 # Common case of "Foreign band name - English song title"
358 if restricted and result.startswith('-_'):
364 def orderedSet(iterable):
365 """ Remove all duplicates from the input iterable """
373 def _htmlentity_transform(entity):
374 """Transforms an HTML entity to a character."""
375 # Known non-numeric HTML entity
376 if entity in compat_html_entities.name2codepoint:
377 return compat_chr(compat_html_entities.name2codepoint[entity])
379 mobj = re.match(r'#(x?[0-9]+)', entity)
381 numstr = mobj.group(1)
382 if numstr.startswith(u'x'):
384 numstr = u'0%s' % numstr
387 return compat_chr(int(numstr, base))
389 # Unknown entity in name, return its literal representation
390 return (u'&%s;' % entity)
396 assert type(s) == compat_str
399 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
402 def encodeFilename(s, for_subprocess=False):
404 @param s The name of the file
407 assert type(s) == compat_str
409 # Python 3 has a Unicode API
410 if sys.version_info >= (3, 0):
413 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
414 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
415 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
416 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
417 if not for_subprocess:
420 # For subprocess calls, encode with locale encoding
421 # Refer to http://stackoverflow.com/a/9951851/35070
422 encoding = preferredencoding()
424 encoding = sys.getfilesystemencoding()
427 return s.encode(encoding, 'ignore')
430 def encodeArgument(s):
431 if not isinstance(s, compat_str):
432 # Legacy code that uses byte strings
433 # Uncomment the following line after fixing all post processors
434 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
435 s = s.decode('ascii')
436 return encodeFilename(s, True)
439 def decodeOption(optval):
442 if isinstance(optval, bytes):
443 optval = optval.decode(preferredencoding())
445 assert isinstance(optval, compat_str)
448 def formatSeconds(secs):
450 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
452 return '%d:%02d' % (secs // 60, secs % 60)
457 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
458 if sys.version_info < (3, 2):
461 class HTTPSConnectionV3(httplib.HTTPSConnection):
462 def __init__(self, *args, **kwargs):
463 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
466 sock = socket.create_connection((self.host, self.port), self.timeout)
467 if getattr(self, '_tunnel_host', False):
471 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
473 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
475 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
476 def https_open(self, req):
477 return self.do_open(HTTPSConnectionV3, req)
478 return HTTPSHandlerV3(**kwargs)
479 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
480 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
481 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
482 if opts_no_check_certificate:
483 context.verify_mode = ssl.CERT_NONE
484 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
486 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
487 context.verify_mode = (ssl.CERT_NONE
488 if opts_no_check_certificate
489 else ssl.CERT_REQUIRED)
490 context.set_default_verify_paths()
492 context.load_default_certs()
493 except AttributeError:
495 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
497 class ExtractorError(Exception):
498 """Error during info extraction."""
499 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
500 """ tb, if given, is the original traceback (so that it can be printed out).
501 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
504 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
506 if video_id is not None:
507 msg = video_id + ': ' + msg
509 msg += u' (caused by %r)' % cause
511 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
512 super(ExtractorError, self).__init__(msg)
515 self.exc_info = sys.exc_info() # preserve original exception
517 self.video_id = video_id
519 def format_traceback(self):
520 if self.traceback is None:
522 return u''.join(traceback.format_tb(self.traceback))
525 class RegexNotFoundError(ExtractorError):
526 """Error when a regex didn't match"""
530 class DownloadError(Exception):
531 """Download Error exception.
533 This exception may be thrown by FileDownloader objects if they are not
534 configured to continue on errors. They will contain the appropriate
537 def __init__(self, msg, exc_info=None):
538 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
539 super(DownloadError, self).__init__(msg)
540 self.exc_info = exc_info
543 class SameFileError(Exception):
544 """Same File exception.
546 This exception will be thrown by FileDownloader objects if they detect
547 multiple files would have to be downloaded to the same file on disk.
552 class PostProcessingError(Exception):
553 """Post Processing exception.
555 This exception may be raised by PostProcessor's .run() method to
556 indicate an error in the postprocessing task.
558 def __init__(self, msg):
561 class MaxDownloadsReached(Exception):
562 """ --max-downloads limit has been reached. """
566 class UnavailableVideoError(Exception):
567 """Unavailable Format exception.
569 This exception will be thrown when a video is requested
570 in a format that is not available for that video.
575 class ContentTooShortError(Exception):
576 """Content Too Short exception.
578 This exception may be raised by FileDownloader objects when a file they
579 download is too small for what the server announced first, indicating
580 the connection was probably interrupted.
586 def __init__(self, downloaded, expected):
587 self.downloaded = downloaded
588 self.expected = expected
590 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
591 """Handler for HTTP requests and responses.
593 This class, when installed with an OpenerDirector, automatically adds
594 the standard headers to every HTTP request and handles gzipped and
595 deflated responses from web servers. If compression is to be avoided in
596 a particular request, the original request in the program code only has
597 to include the HTTP header "Youtubedl-No-Compression", which will be
598 removed before making the real request.
600 Part of this code was copied from:
602 http://techknack.net/python-urllib2-handlers/
604 Andrew Rowls, the author of that code, agreed to release it to the
611 return zlib.decompress(data, -zlib.MAX_WBITS)
613 return zlib.decompress(data)
616 def addinfourl_wrapper(stream, headers, url, code):
617 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
618 return compat_urllib_request.addinfourl(stream, headers, url, code)
619 ret = compat_urllib_request.addinfourl(stream, headers, url)
623 def http_request(self, req):
624 for h, v in std_headers.items():
625 if h not in req.headers:
627 if 'Youtubedl-no-compression' in req.headers:
628 if 'Accept-encoding' in req.headers:
629 del req.headers['Accept-encoding']
630 del req.headers['Youtubedl-no-compression']
631 if 'Youtubedl-user-agent' in req.headers:
632 if 'User-agent' in req.headers:
633 del req.headers['User-agent']
634 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
635 del req.headers['Youtubedl-user-agent']
637 if sys.version_info < (2, 7) and '#' in req.get_full_url():
638 # Python 2.6 is brain-dead when it comes to fragments
639 req._Request__original = req._Request__original.partition('#')[0]
640 req._Request__r_type = req._Request__r_type.partition('#')[0]
644 def http_response(self, req, resp):
647 if resp.headers.get('Content-encoding', '') == 'gzip':
648 content = resp.read()
649 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
651 uncompressed = io.BytesIO(gz.read())
652 except IOError as original_ioerror:
653 # There may be junk add the end of the file
654 # See http://stackoverflow.com/q/4928560/35070 for details
655 for i in range(1, 1024):
657 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
658 uncompressed = io.BytesIO(gz.read())
663 raise original_ioerror
664 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
665 resp.msg = old_resp.msg
667 if resp.headers.get('Content-encoding', '') == 'deflate':
668 gz = io.BytesIO(self.deflate(resp.read()))
669 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
670 resp.msg = old_resp.msg
673 https_request = http_request
674 https_response = http_response
677 def parse_iso8601(date_str, delimiter='T'):
678 """ Return a UNIX timestamp from the given date """
684 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
687 timezone = datetime.timedelta()
689 date_str = date_str[:-len(m.group(0))]
690 if not m.group('sign'):
691 timezone = datetime.timedelta()
693 sign = 1 if m.group('sign') == '+' else -1
694 timezone = datetime.timedelta(
695 hours=sign * int(m.group('hours')),
696 minutes=sign * int(m.group('minutes')))
697 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
698 dt = datetime.datetime.strptime(date_str, date_format) - timezone
699 return calendar.timegm(dt.timetuple())
702 def unified_strdate(date_str):
703 """Return a string with the date in the format YYYYMMDD"""
710 date_str = date_str.replace(',', ' ')
711 # %z (UTC offset) is only supported in python>=3.2
712 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
713 format_expressions = [
718 '%b %dst %Y %I:%M%p',
719 '%b %dnd %Y %I:%M%p',
720 '%b %dth %Y %I:%M%p',
729 '%Y-%m-%d %H:%M:%S.%f',
732 '%Y-%m-%dT%H:%M:%SZ',
733 '%Y-%m-%dT%H:%M:%S.%fZ',
734 '%Y-%m-%dT%H:%M:%S.%f0Z',
736 '%Y-%m-%dT%H:%M:%S.%f',
739 for expression in format_expressions:
741 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
744 if upload_date is None:
745 timetuple = email.utils.parsedate_tz(date_str)
747 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
750 def determine_ext(url, default_ext=u'unknown_video'):
753 guess = url.partition(u'?')[0].rpartition(u'.')[2]
754 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name of the form <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
762 def date_from_str(date_str):
764 Return a datetime object from a string in the format YYYYMMDD or
765 (now|today)[+-][0-9](day|week|month|year)(s)?"""
766 today = datetime.date.today()
767 if date_str == 'now'or date_str == 'today':
769 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
770 if match is not None:
771 sign = match.group('sign')
772 time = int(match.group('time'))
775 unit = match.group('unit')
784 delta = datetime.timedelta(**{unit: time})
786 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
788 def hyphenate_date(date_str):
790 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
791 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
792 if match is not None:
793 return '-'.join(match.groups())
797 class DateRange(object):
798 """Represents a time interval between two dates"""
799 def __init__(self, start=None, end=None):
800 """start and end must be strings in the format accepted by date"""
801 if start is not None:
802 self.start = date_from_str(start)
804 self.start = datetime.datetime.min.date()
806 self.end = date_from_str(end)
808 self.end = datetime.datetime.max.date()
809 if self.start > self.end:
810 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
813 """Returns a range that only contains the given day"""
815 def __contains__(self, date):
816 """Check if the date is in the range"""
817 if not isinstance(date, datetime.date):
818 date = date_from_str(date)
819 return self.start <= date <= self.end
821 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
825 """ Returns the platform name as a compat_str """
826 res = platform.platform()
827 if isinstance(res, bytes):
828 res = res.decode(preferredencoding())
830 assert isinstance(res, compat_str)
834 def _windows_write_string(s, out):
835 """ Returns True if the string was written using special methods,
836 False if it has yet to be written out."""
837 # Adapted from http://stackoverflow.com/a/3259271/35070
840 import ctypes.wintypes
848 fileno = out.fileno()
849 except AttributeError:
850 # If the output stream doesn't have a fileno, it's virtual
852 if fileno not in WIN_OUTPUT_IDS:
855 GetStdHandle = ctypes.WINFUNCTYPE(
856 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
857 ("GetStdHandle", ctypes.windll.kernel32))
858 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
860 WriteConsoleW = ctypes.WINFUNCTYPE(
861 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
862 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
863 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
864 written = ctypes.wintypes.DWORD(0)
866 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
867 FILE_TYPE_CHAR = 0x0002
868 FILE_TYPE_REMOTE = 0x8000
869 GetConsoleMode = ctypes.WINFUNCTYPE(
870 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
871 ctypes.POINTER(ctypes.wintypes.DWORD))(
872 ("GetConsoleMode", ctypes.windll.kernel32))
873 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
875 def not_a_console(handle):
876 if handle == INVALID_HANDLE_VALUE or handle is None:
878 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
879 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
884 def next_nonbmp_pos(s):
886 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
887 except StopIteration:
891 count = min(next_nonbmp_pos(s), 1024)
894 h, s, count if count else 2, ctypes.byref(written), None)
896 raise OSError('Failed to write string')
897 if not count: # We just wrote a non-BMP character
898 assert written.value == 2
901 assert written.value > 0
902 s = s[written.value:]
906 def write_string(s, out=None, encoding=None):
909 assert type(s) == compat_str
911 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
912 if _windows_write_string(s, out):
915 if ('b' in getattr(out, 'mode', '') or
916 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
917 byt = s.encode(encoding or preferredencoding(), 'ignore')
919 elif hasattr(out, 'buffer'):
920 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
921 byt = s.encode(enc, 'ignore')
922 out.buffer.write(byt)
928 def bytes_to_intlist(bs):
931 if isinstance(bs[0], int): # Python 3
934 return [ord(c) for c in bs]
937 def intlist_to_bytes(xs):
940 if isinstance(chr(0), bytes): # Python 2
941 return ''.join([chr(x) for x in xs])
946 # Cross-platform file locking
947 if sys.platform == 'win32':
948 import ctypes.wintypes
951 class OVERLAPPED(ctypes.Structure):
953 ('Internal', ctypes.wintypes.LPVOID),
954 ('InternalHigh', ctypes.wintypes.LPVOID),
955 ('Offset', ctypes.wintypes.DWORD),
956 ('OffsetHigh', ctypes.wintypes.DWORD),
957 ('hEvent', ctypes.wintypes.HANDLE),
960 kernel32 = ctypes.windll.kernel32
961 LockFileEx = kernel32.LockFileEx
962 LockFileEx.argtypes = [
963 ctypes.wintypes.HANDLE, # hFile
964 ctypes.wintypes.DWORD, # dwFlags
965 ctypes.wintypes.DWORD, # dwReserved
966 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
967 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
968 ctypes.POINTER(OVERLAPPED) # Overlapped
970 LockFileEx.restype = ctypes.wintypes.BOOL
971 UnlockFileEx = kernel32.UnlockFileEx
972 UnlockFileEx.argtypes = [
973 ctypes.wintypes.HANDLE, # hFile
974 ctypes.wintypes.DWORD, # dwReserved
975 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
976 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
977 ctypes.POINTER(OVERLAPPED) # Overlapped
979 UnlockFileEx.restype = ctypes.wintypes.BOOL
980 whole_low = 0xffffffff
981 whole_high = 0x7fffffff
983 def _lock_file(f, exclusive):
984 overlapped = OVERLAPPED()
985 overlapped.Offset = 0
986 overlapped.OffsetHigh = 0
987 overlapped.hEvent = 0
988 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
989 handle = msvcrt.get_osfhandle(f.fileno())
990 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
991 whole_low, whole_high, f._lock_file_overlapped_p):
992 raise OSError('Locking file failed: %r' % ctypes.FormatError())
995 assert f._lock_file_overlapped_p
996 handle = msvcrt.get_osfhandle(f.fileno())
997 if not UnlockFileEx(handle, 0,
998 whole_low, whole_high, f._lock_file_overlapped_p):
999 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
def _lock_file(f, exclusive):
    """Acquire an advisory flock() on f: exclusive when requested, shared otherwise."""
    if exclusive:
        fcntl.flock(f, fcntl.LOCK_EX)
    else:
        fcntl.flock(f, fcntl.LOCK_SH)
def _unlock_file(f):
    """Release the advisory flock() held on f."""
    fcntl.flock(f, fcntl.LOCK_UN)
1011 class locked_file(object):
1012 def __init__(self, filename, mode, encoding=None):
1013 assert mode in ['r', 'a', 'w']
1014 self.f = io.open(filename, mode, encoding=encoding)
1017 def __enter__(self):
1018 exclusive = self.mode != 'r'
1020 _lock_file(self.f, exclusive)
1026 def __exit__(self, etype, value, traceback):
1028 _unlock_file(self.f)
1035 def write(self, *args):
1036 return self.f.write(*args)
1038 def read(self, *args):
1039 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to utf-8 when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1047 def shell_quote(args):
1049 encoding = get_filesystem_encoding()
1051 if isinstance(a, bytes):
1052 # We may get a filename encoded with 'encodeFilename'
1053 a = a.decode(encoding)
1054 quoted_args.append(pipes.quote(a))
1055 return u' '.join(quoted_args)
1058 def takewhile_inclusive(pred, seq):
1059 """ Like itertools.takewhile, but include the latest evaluated element
1060 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload is JSON-encoded, then URL-encoded under a reserved key,
    # and appended as the fragment so ordinary URL handling ignores it.
    smuggled = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'#'.join([url, smuggled])
1075 def unsmuggle_url(smug_url, default=None):
1076 if not '#__youtubedl_smuggle' in smug_url:
1077 return smug_url, default
1078 url, _, sdata = smug_url.rpartition(u'#')
1079 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1080 data = json.loads(jsond)
1084 def format_bytes(bytes):
1087 if type(bytes) is str:
1088 bytes = float(bytes)
1092 exponent = int(math.log(bytes, 1024.0))
1093 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1094 converted = float(bytes) / float(1024 ** exponent)
1095 return u'%.2f%s' % (converted, suffix)
1098 def get_term_width():
1099 columns = compat_getenv('COLUMNS', None)
1104 sp = subprocess.Popen(
1106 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1107 out, err = sp.communicate()
1108 return int(out.split()[1])
1114 def month_by_name(name):
1115 """ Return the number of a month by (locale-independently) English name """
1118 u'January', u'February', u'March', u'April', u'May', u'June',
1119 u'July', u'August', u'September', u'October', u'November', u'December']
1121 return ENGLISH_NAMES.index(name) + 1
1126 def fix_xml_ampersands(xml_str):
1127 """Replace all the '&' by '&' in XML"""
1129 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1134 def setproctitle(title):
1135 assert isinstance(title, compat_str)
1137 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1140 title_bytes = title.encode('utf-8')
1141 buf = ctypes.create_string_buffer(len(title_bytes))
1142 buf.value = title_bytes
1144 libc.prctl(15, buf, 0, 0, 0)
1145 except AttributeError:
1146 return # Strange libc, just skip this
1149 def remove_start(s, start):
1150 if s.startswith(start):
1151 return s[len(start):]
1155 def remove_end(s, end):
1157 return s[:-len(end)]
def url_basename(url):
    """Return the last path segment of url (query string and fragment ignored)."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip(u'/').split(u'/')
    return segments[-1]
1166 class HEADRequest(compat_urllib_request.Request):
1167 def get_method(self):
1171 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1174 v = getattr(v, get_attr, None)
1177 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Stringify v via compat_str, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1184 def str_to_int(int_str):
1185 """ A more relaxed version of int_or_none """
1188 int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale/scale, or return default for None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1196 def parse_duration(s):
1203 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
1206 res = int(m.group('secs'))
1208 res += int(m.group('mins')) * 60
1209 if m.group('hours'):
1210 res += int(m.group('hours')) * 60 * 60
1212 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext before the file's real extension: a.mp4 -> a.<ext>.mp4."""
    base, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (base, ext, real_ext)
1221 def check_executable(exe, args=[]):
1222 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1223 args can be a list of arguments for a short output (like -version) """
1225 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1231 def get_exe_version(exe, args=['--version'],
1232 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1233 unrecognized=u'present'):
1234 """ Returns the version of the specified executable,
1235 or False if the executable is not present """
1237 out, err = subprocess.Popen(
1239 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1242 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1243 m = re.search(version_re, firstline)
1250 class PagedList(object):
1252 # This is only useful for tests
1253 return len(self.getslice())
1256 class OnDemandPagedList(PagedList):
1257 def __init__(self, pagefunc, pagesize):
1258 self._pagefunc = pagefunc
1259 self._pagesize = pagesize
1261 def getslice(self, start=0, end=None):
1263 for pagenum in itertools.count(start // self._pagesize):
1264 firstid = pagenum * self._pagesize
1265 nextfirstid = pagenum * self._pagesize + self._pagesize
1266 if start >= nextfirstid:
1269 page_results = list(self._pagefunc(pagenum))
1272 start % self._pagesize
1273 if firstid <= start < nextfirstid
1277 ((end - 1) % self._pagesize) + 1
1278 if (end is not None and firstid <= end <= nextfirstid)
1281 if startv != 0 or endv is not None:
1282 page_results = page_results[startv:endv]
1283 res.extend(page_results)
1285 # A little optimization - if current page is not "full", ie. does
1286 # not contain page_size videos then we can assume that this page
1287 # is the last one - there are no more ids on further pages -
1288 # i.e. no need to query again.
1289 if len(page_results) + startv < self._pagesize:
1292 # If we got the whole page, but the next page is not interesting,
1293 # break out early as well
1294 if end == nextfirstid:
1299 class InAdvancePagedList(PagedList):
1300 def __init__(self, pagefunc, pagecount, pagesize):
1301 self._pagefunc = pagefunc
1302 self._pagecount = pagecount
1303 self._pagesize = pagesize
1305 def getslice(self, start=0, end=None):
1307 start_page = start // self._pagesize
1309 self._pagecount if end is None else (end // self._pagesize + 1))
1310 skip_elems = start - start_page * self._pagesize
1311 only_more = None if end is None else end - start
1312 for pagenum in range(start_page, end_page):
1313 page = list(self._pagefunc(pagenum))
1315 page = page[skip_elems:]
1317 if only_more is not None:
1318 if len(page) < only_more:
1319 only_more -= len(page)
1321 page = page[:only_more]
1328 def uppercase_escape(s):
1329 unicode_escape = codecs.getdecoder('unicode_escape')
1331 r'\\U[0-9a-fA-F]{8}',
1332 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() needs a byte string, so encode unicode input first.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    safe_chars = "%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1343 def escape_url(url):
1344 """Escape URL as suggested by RFC 3986"""
1345 url_parsed = compat_urllib_parse_urlparse(url)
1346 return url_parsed._replace(
1347 path=escape_rfc3986(url_parsed.path),
1348 params=escape_rfc3986(url_parsed.params),
1349 query=escape_rfc3986(url_parsed.query),
1350 fragment=escape_rfc3986(url_parsed.fragment)
1354 struct.pack(u'!I', 0)
1356 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1357 def struct_pack(spec, *args):
1358 if isinstance(spec, compat_str):
1359 spec = spec.encode('ascii')
1360 return struct.pack(spec, *args)
1362 def struct_unpack(spec, *args):
1363 if isinstance(spec, compat_str):
1364 spec = spec.encode('ascii')
1365 return struct.unpack(spec, *args)
1367 struct_pack = struct.pack
1368 struct_unpack = struct.unpack
1371 def read_batch_urls(batch_fd):
1373 if not isinstance(url, compat_str):
1374 url = url.decode('utf-8', 'replace')
1375 BOM_UTF8 = u'\xef\xbb\xbf'
1376 if url.startswith(BOM_UTF8):
1377 url = url[len(BOM_UTF8):]
1379 if url.startswith(('#', ';', ']')):
1383 with contextlib.closing(batch_fd) as fd:
1384 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1392 etree_iter = xml.etree.ElementTree.Element.iter
1393 except AttributeError: # Python <=2.6
1394 etree_iter = lambda n: n.findall('.//*')
1398 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1399 def doctype(self, name, pubid, system):
1400 pass # Ignore doctypes
1402 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1403 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1404 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1405 # Fix up XML parser in Python 2.x
1406 if sys.version_info < (3, 0):
1407 for n in etree_iter(tree):
1408 if n.text is not None:
1409 if not isinstance(n.text, compat_str):
1410 n.text = n.text.decode('utf-8')
1423 def parse_age_limit(s):
1426 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1427 return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper (identifier + parentheses + optional ';')
    from code and return the inner payload.

    Input that does not look like a JSONP call simply does not match the
    pattern and is returned unchanged.
    """
    # The original pattern ended in '\s*?\s*$': a lazy '\s*?' immediately
    # followed by the same greedy '\s*' is redundant, so it is collapsed
    # into a single '\s*$' with identical matching behavior.
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*$',
        r'\1', code)
1434 def js_to_json(code):
1437 if v in ('true', 'false', 'null'):
1439 if v.startswith('"'):
1441 if v.startswith("'"):
1443 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1450 res = re.sub(r'''(?x)
1451 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1452 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1453 [a-zA-Z_][a-zA-Z_0-9]*
1455 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1459 def qualities(quality_ids):
1460 """ Get a numeric quality value out of a list of possible values """
1463 return quality_ids.index(qid)
1469 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1472 def limit_length(s, length):
1473 """ Add ellipses to overly long strings """
1478 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted version string into its integer components.

    NOTE: despite the name this returns a list, not a tuple; the return
    type is kept as-is for compatibility with existing comparisons.
    """
    return list(map(int, v.split('.')))
1486 def is_outdated_version(version, limit, assume_new=True):
1488 return not assume_new
1490 return version_tuple(version) < version_tuple(limit)
1492 return not assume_new