_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import email.utils
   6 import errno
   7 import gzip
   8 import io
   9 import json
  10 import locale
  11 import os
  12 import pipes
  13 import platform
  14 import re
  15 import ssl
  16 import socket
  17 import sys
  18 import traceback
  19 import zlib
  20
  21 try:
  22     import urllib.request as compat_urllib_request
  23 except ImportError: # Python 2
  24     import urllib2 as compat_urllib_request
  25
  26 try:
  27     import urllib.error as compat_urllib_error
  28 except ImportError: # Python 2
  29     import urllib2 as compat_urllib_error
  30
  31 try:
  32     import urllib.parse as compat_urllib_parse
  33 except ImportError: # Python 2
  34     import urllib as compat_urllib_parse
  35
  36 try:
  37     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  38 except ImportError: # Python 2
  39     from urlparse import urlparse as compat_urllib_parse_urlparse
  40
  41 try:
  42     import urllib.parse as compat_urlparse
  43 except ImportError: # Python 2
  44     import urlparse as compat_urlparse
  45
  46 try:
  47     import http.cookiejar as compat_cookiejar
  48 except ImportError: # Python 2
  49     import cookielib as compat_cookiejar
  50
  51 try:
  52     import html.entities as compat_html_entities
  53 except ImportError: # Python 2
  54     import htmlentitydefs as compat_html_entities
  55
  56 try:
  57     import html.parser as compat_html_parser
  58 except ImportError: # Python 2
  59     import HTMLParser as compat_html_parser
  60
  61 try:
  62     import http.client as compat_http_client
  63 except ImportError: # Python 2
  64     import httplib as compat_http_client
  65
  66 try:
  67     from urllib.error import HTTPError as compat_HTTPError
  68 except ImportError:  # Python 2
  69     from urllib2 import HTTPError as compat_HTTPError
  70
  71 try:
  72     from urllib.request import urlretrieve as compat_urlretrieve
  73 except ImportError:  # Python 2
  74     from urllib import urlretrieve as compat_urlretrieve
  75
  76
# compat_subprocess_get_DEVNULL() returns an object usable as a "discard
# everything" stream for subprocess calls.  subprocess.DEVNULL is used where
# it can be imported; otherwise os.devnull is opened for writing.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    # NOTE: this fallback opens a new file object on every call.
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  82
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string with the characters they encode.

        Runs of consecutive escapes are collected as raw bytes and decoded
        together with `encoding`/`errors`, so multi-byte characters survive.
        An escape that is not valid hex is kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 only: str.decode('hex') turns two hex digits into one byte
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: put the '%' back and keep the text as is
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'.  Fields with an empty value are
        dropped unless keep_blank_values is set; with strict_parsing a field
        without '=' raises ValueError.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' encodes a space in query strings; undo it before unquoting
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        Values of a repeated name are appended in the order they appear.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
 161
# compat_str is the text (unicode) string type: `unicode` where that builtin
# exists, `str` otherwise.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr maps a code point to a text character: `unichr` where that
# builtin exists, `chr` otherwise.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 171
 172 def compat_ord(c):
 173     if type(c) is int: return c
 174     else: return ord(c)
 175
# This is not clearly defined otherwise
# (the type of compiled pattern objects, obtained by compiling an empty regex)
compiled_regex_type = type(re.compile(''))

# Default request headers; YoutubeDLHandler.http_request() sets them on
# every outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 186
 187 def preferredencoding():
 188     """Get preferred encoding.
 189
 190     Returns the best encoding scheme for the system, based on
 191     locale.getpreferredencoding() and some further tweaks.
 192     """
 193     try:
 194         pref = locale.getpreferredencoding()
 195         u'TEST'.encode(pref)
 196     except:
 197         pref = 'UTF-8'
 198
 199     return pref
 200
 201 if sys.version_info < (3,0):
 202     def compat_print(s):
 203         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 204 else:
 205     def compat_print(s):
 206         assert type(s) == type(u'')
 207         print(s)
 208
 209 # In Python 2.x, json.dump expects a bytestream.
 210 # In Python 3.x, it writes to a character stream
 211 if sys.version_info < (3,0):
 212     def write_json_file(obj, fn):
 213         with open(fn, 'wb') as f:
 214             json.dump(obj, f)
 215 else:
 216     def write_json_file(obj, fn):
 217         with open(fn, 'w', encoding='utf-8') as f:
 218             json.dump(obj, f)
 219
 220 if sys.version_info >= (2,7):
 221     def find_xpath_attr(node, xpath, key, val):
 222         """ Find the xpath xpath[@key=val] """
 223         assert re.match(r'^[a-zA-Z]+$', key)
 224         assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
 225         expr = xpath + u"[@%s='%s']" % (key, val)
 226         return node.find(expr)
 227 else:
 228     def find_xpath_attr(node, xpath, key, val):
 229         for f in node.findall(xpath):
 230             if f.attrib.get(key) == val:
 231                 return f
 232         return None
 233
 234 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 235 # the namespace parameter
 236 def xpath_with_ns(path, ns_map):
 237     components = [c.split(':') for c in path.split('/')]
 238     replaced = []
 239     for c in components:
 240         if len(c) == 1:
 241             replaced.append(c[0])
 242         else:
 243             ns, tag = c
 244             replaced.append('{%s}%s' % (ns_map[ns], tag))
 245     return '/'.join(replaced)
 246
 247 def htmlentity_transform(matchobj):
 248     """Transforms an HTML entity to a character.
 249
 250     This function receives a match object and is intended to be used with
 251     the re.sub() function.
 252     """
 253     entity = matchobj.group(1)
 254
 255     # Known non-numeric HTML entity
 256     if entity in compat_html_entities.name2codepoint:
 257         return compat_chr(compat_html_entities.name2codepoint[entity])
 258
 259     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 260     if mobj is not None:
 261         numstr = mobj.group(1)
 262         if numstr.startswith(u'x'):
 263             base = 16
 264             numstr = u'0%s' % numstr
 265         else:
 266             base = 10
 267         return compat_chr(int(numstr, base))
 268
 269     # Unknown entity in name, return its literal representation
 270     return (u'&%s;' % entity)
 271
 272 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 273 class BaseHTMLParser(compat_html_parser.HTMLParser):
 274     def __init(self):
 275         compat_html_parser.HTMLParser.__init__(self)
 276         self.html = None
 277
 278     def loads(self, html):
 279         self.html = html
 280         self.feed(html)
 281         self.close()
 282
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    After loads(), get_result() returns the text between the opening tag of
    the first element whose `attribute` equals `value` and its matching
    closing tag, or None if no complete element was found.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_pos, end_pos] once the element has been seen
        self.result = None
        # True while the parser is inside the wanted element
        self.started = False
        # Nesting count per tag name, tracked while inside the element
        self.depth = {}
        # True right after the wanted opening tag, until the next parser
        # event records where the element's content starts
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Try to recover from a parse error by skipping ahead one line.

        Gives up (raises HTMLParseError) after more than 10 errors, or if
        the error happens inside the wanted element.
        NOTE(review): presumably invoked by the base parser on malformed
        markup - confirm against the HTMLParser version in use.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # Any start tag inside the element also marks the content start
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element ends when its own tag name is balanced again
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event can be the first thing after the opening tag,
    # so all of them record the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped content of the matched element, or None."""
        if self.result is None:
            return None
        # Need tag name, start position and end position
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are (line, column) pairs from getpos(); the line is
        # used 1-based here
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column must be
            # shifted by the prefix that was just cut off
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, step over the literal text
# "</scr'+'ipt>" instead of handing it to HTMLParser.parse_endtag; every
# other end tag is still parsed by the base implementation.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 348
 349 def get_element_by_id(id, html):
 350     """Return the content of the tag with the specified ID in the passed HTML document"""
 351     return get_element_by_attribute("id", id, html)
 352
 353 def get_element_by_attribute(attribute, value, html):
 354     """Return the content of the tag with the specified attribute in the passed HTML document"""
 355     parser = AttrParser(attribute, value)
 356     try:
 357         parser.loads(html)
 358     except compat_html_parser.HTMLParseError:
 359         pass
 360     return parser.get_result()
 361
 362 class MetaParser(BaseHTMLParser):
 363     """
 364     Modified HTMLParser that isolates a meta tag with the specified name
 365     attribute.
 366     """
 367     def __init__(self, name):
 368         BaseHTMLParser.__init__(self)
 369         self.name = name
 370         self.content = None
 371         self.result = None
 372
 373     def handle_starttag(self, tag, attrs):
 374         if tag != 'meta':
 375             return
 376         attrs = dict(attrs)
 377         if attrs.get('name') == self.name:
 378             self.result = attrs.get('content')
 379
 380     def get_result(self):
 381         return self.result
 382
 383 def get_meta_content(name, html):
 384     """
 385     Return the content attribute from the meta tag with the given name attribute.
 386     """
 387     parser = MetaParser(name)
 388     try:
 389         parser.loads(html)
 390     except compat_html_parser.HTMLParseError:
 391         pass
 392     return parser.get_result()
 393
 394
 395 def clean_html(html):
 396     """Clean an HTML snippet into a readable string"""
 397     # Newline vs <br />
 398     html = html.replace('\n', ' ')
 399     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 400     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 401     # Strip html tags
 402     html = re.sub('<.*?>', '', html)
 403     # Replace html entities
 404     html = unescapeHTML(html)
 405     return html.strip()
 406
 407
 408 def sanitize_open(filename, open_mode):
 409     """Try to open the given filename, and slightly tweak it if this fails.
 410
 411     Attempts to open the given filename. If this fails, it tries to change
 412     the filename slightly, step by step, until it's either able to open it
 413     or it fails and raises a final exception, like the standard open()
 414     function.
 415
 416     It returns the tuple (stream, definitive_file_name).
 417     """
 418     try:
 419         if filename == u'-':
 420             if sys.platform == 'win32':
 421                 import msvcrt
 422                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 423             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 424         stream = open(encodeFilename(filename), open_mode)
 425         return (stream, filename)
 426     except (IOError, OSError) as err:
 427         if err.errno in (errno.EACCES,):
 428             raise
 429
 430         # In case of error, try to remove win32 forbidden chars
 431         alt_filename = os.path.join(
 432                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 433                         for path_part in os.path.split(filename)
 434                        )
 435         if alt_filename == filename:
 436             raise
 437         else:
 438             # An exception here should be caught in the caller
 439             stream = open(encodeFilename(filename), open_mode)
 440             return (stream, alt_filename)
 441
 442
 443 def timeconvert(timestr):
 444     """Convert RFC 2822 defined time string into system timestamp"""
 445     timestamp = None
 446     timetuple = email.utils.parsedate_tz(timestr)
 447     if timetuple is not None:
 448         timestamp = email.utils.mktime_tz(timetuple)
 449     return timestamp
 450
 451 def sanitize_filename(s, restricted=False, is_id=False):
 452     """Sanitizes a string so it could be used as part of a filename.
 453     If restricted is set, use a stricter subset of allowed characters.
 454     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 455     """
 456     def replace_insane(char):
 457         if char == '?' or ord(char) < 32 or ord(char) == 127:
 458             return ''
 459         elif char == '"':
 460             return '' if restricted else '\''
 461         elif char == ':':
 462             return '_-' if restricted else ' -'
 463         elif char in '\\/|*<>':
 464             return '_'
 465         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 466             return '_'
 467         if restricted and ord(char) > 127:
 468             return '_'
 469         return char
 470
 471     result = u''.join(map(replace_insane, s))
 472     if not is_id:
 473         while '__' in result:
 474             result = result.replace('__', '_')
 475         result = result.strip('_')
 476         # Common case of "Foreign band name - English song title"
 477         if restricted and result.startswith('-_'):
 478             result = result[2:]
 479         if not result:
 480             result = '_'
 481     return result
 482
 483 def orderedSet(iterable):
 484     """ Remove all duplicates from the input iterable """
 485     res = []
 486     for el in iterable:
 487         if el not in res:
 488             res.append(el)
 489     return res
 490
 491 def unescapeHTML(s):
 492     """
 493     @param s a string
 494     """
 495     assert type(s) == type(u'')
 496
 497     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 498     return result
 499
 500 def encodeFilename(s):
 501     """
 502     @param s The name of the file
 503     """
 504
 505     assert type(s) == type(u'')
 506
 507     # Python 3 has a Unicode API
 508     if sys.version_info >= (3, 0):
 509         return s
 510
 511     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 512         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 513         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 514         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 515         return s
 516     else:
 517         encoding = sys.getfilesystemencoding()
 518         if encoding is None:
 519             encoding = 'utf-8'
 520         return s.encode(encoding, 'ignore')
 521
 522 def decodeOption(optval):
 523     if optval is None:
 524         return optval
 525     if isinstance(optval, bytes):
 526         optval = optval.decode(preferredencoding())
 527
 528     assert isinstance(optval, compat_str)
 529     return optval
 530
 531 def formatSeconds(secs):
 532     if secs > 3600:
 533         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 534     elif secs > 60:
 535         return '%d:%02d' % (secs // 60, secs % 60)
 536     else:
 537         return '%d' % secs
 538
 539 def make_HTTPS_handler(opts_no_check_certificate):
 540     if sys.version_info < (3, 2):
 541         import httplib
 542
 543         class HTTPSConnectionV3(httplib.HTTPSConnection):
 544             def __init__(self, *args, **kwargs):
 545                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 546
 547             def connect(self):
 548                 sock = socket.create_connection((self.host, self.port), self.timeout)
 549                 if self._tunnel_host:
 550                     self.sock = sock
 551                     self._tunnel()
 552                 try:
 553                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 554                 except ssl.SSLError as e:
 555                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 556
 557         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 558             def https_open(self, req):
 559                 return self.do_open(HTTPSConnectionV3, req)
 560         return HTTPSHandlerV3()
 561     else:
 562         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 563         context.set_default_verify_paths()
 564
 565         context.verify_mode = (ssl.CERT_NONE
 566                                if opts_no_check_certificate
 567                                else ssl.CERT_REQUIRED)
 568         return compat_urllib_request.HTTPSHandler(context=context)
 569
 570 class ExtractorError(Exception):
 571     """Error during info extraction."""
 572     def __init__(self, msg, tb=None, expected=False, cause=None):
 573         """ tb, if given, is the original traceback (so that it can be printed out).
 574         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 575         """
 576
 577         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 578             expected = True
 579         if not expected:
 580             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 581         super(ExtractorError, self).__init__(msg)
 582
 583         self.traceback = tb
 584         self.exc_info = sys.exc_info()  # preserve original exception
 585         self.cause = cause
 586
 587     def format_traceback(self):
 588         if self.traceback is None:
 589             return None
 590         return u''.join(traceback.format_tb(self.traceback))
 591
 592
 593 class RegexNotFoundError(ExtractorError):
 594     """Error when a regex didn't match"""
 595     pass
 596
 597
 598 class DownloadError(Exception):
 599     """Download Error exception.
 600
 601     This exception may be thrown by FileDownloader objects if they are not
 602     configured to continue on errors. They will contain the appropriate
 603     error message.
 604     """
 605     def __init__(self, msg, exc_info=None):
 606         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 607         super(DownloadError, self).__init__(msg)
 608         self.exc_info = exc_info
 609
 610
 611 class SameFileError(Exception):
 612     """Same File exception.
 613
 614     This exception will be thrown by FileDownloader objects if they detect
 615     multiple files would have to be downloaded to the same file on disk.
 616     """
 617     pass
 618
 619
 620 class PostProcessingError(Exception):
 621     """Post Processing exception.
 622
 623     This exception may be raised by PostProcessor's .run() method to
 624     indicate an error in the postprocessing task.
 625     """
 626     def __init__(self, msg):
 627         self.msg = msg
 628
 629 class MaxDownloadsReached(Exception):
 630     """ --max-downloads limit has been reached. """
 631     pass
 632
 633
 634 class UnavailableVideoError(Exception):
 635     """Unavailable Format exception.
 636
 637     This exception will be thrown when a video is requested
 638     in a format that is not available for that video.
 639     """
 640     pass
 641
 642
 643 class ContentTooShortError(Exception):
 644     """Content Too Short exception.
 645
 646     This exception may be raised by FileDownloader objects when a file they
 647     download is too small for what the server announced first, indicating
 648     the connection was probably interrupted.
 649     """
 650     # Both in bytes
 651     downloaded = None
 652     expected = None
 653
 654     def __init__(self, downloaded, expected):
 655         self.downloaded = downloaded
 656         self.expected = expected
 657
 658 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 659     """Handler for HTTP requests and responses.
 660
 661     This class, when installed with an OpenerDirector, automatically adds
 662     the standard headers to every HTTP request and handles gzipped and
 663     deflated responses from web servers. If compression is to be avoided in
 664     a particular request, the original request in the program code only has
 665     to include the HTTP header "Youtubedl-No-Compression", which will be
 666     removed before making the real request.
 667
 668     Part of this code was copied from:
 669
 670     http://techknack.net/python-urllib2-handlers/
 671
 672     Andrew Rowls, the author of that code, agreed to release it to the
 673     public domain.
 674     """
 675
 676     @staticmethod
 677     def deflate(data):
 678         try:
 679             return zlib.decompress(data, -zlib.MAX_WBITS)
 680         except zlib.error:
 681             return zlib.decompress(data)
 682
 683     @staticmethod
 684     def addinfourl_wrapper(stream, headers, url, code):
 685         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 686             return compat_urllib_request.addinfourl(stream, headers, url, code)
 687         ret = compat_urllib_request.addinfourl(stream, headers, url)
 688         ret.code = code
 689         return ret
 690
 691     def http_request(self, req):
 692         for h,v in std_headers.items():
 693             if h in req.headers:
 694                 del req.headers[h]
 695             req.add_header(h, v)
 696         if 'Youtubedl-no-compression' in req.headers:
 697             if 'Accept-encoding' in req.headers:
 698                 del req.headers['Accept-encoding']
 699             del req.headers['Youtubedl-no-compression']
 700         if 'Youtubedl-user-agent' in req.headers:
 701             if 'User-agent' in req.headers:
 702                 del req.headers['User-agent']
 703             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 704             del req.headers['Youtubedl-user-agent']
 705         return req
 706
 707     def http_response(self, req, resp):
 708         old_resp = resp
 709         # gzip
 710         if resp.headers.get('Content-encoding', '') == 'gzip':
 711             content = resp.read()
 712             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 713             try:
 714                 uncompressed = io.BytesIO(gz.read())
 715             except IOError as original_ioerror:
 716                 # There may be junk add the end of the file
 717                 # See http://stackoverflow.com/q/4928560/35070 for details
 718                 for i in range(1, 1024):
 719                     try:
 720                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 721                         uncompressed = io.BytesIO(gz.read())
 722                     except IOError:
 723                         continue
 724                     break
 725                 else:
 726                     raise original_ioerror
 727             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 728             resp.msg = old_resp.msg
 729         # deflate
 730         if resp.headers.get('Content-encoding', '') == 'deflate':
 731             gz = io.BytesIO(self.deflate(resp.read()))
 732             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 733             resp.msg = old_resp.msg
 734         return resp
 735
 736     https_request = http_request
 737     https_response = http_response
 738
 739 def unified_strdate(date_str):
 740     """Return a string with the date in the format YYYYMMDD"""
 741     upload_date = None
 742     #Replace commas
 743     date_str = date_str.replace(',',' ')
 744     # %z (UTC offset) is only supported in python>=3.2
 745     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
 746     format_expressions = [
 747         '%d %B %Y',
 748         '%B %d %Y',
 749         '%b %d %Y',
 750         '%Y-%m-%d',
 751         '%d/%m/%Y',
 752         '%Y/%m/%d %H:%M:%S',
 753         '%d.%m.%Y %H:%M',
 754         '%Y-%m-%dT%H:%M:%SZ',
 755         '%Y-%m-%dT%H:%M:%S.%fZ',
 756         '%Y-%m-%dT%H:%M:%S.%f0Z',
 757         '%Y-%m-%dT%H:%M:%S',
 758     ]
 759     for expression in format_expressions:
 760         try:
 761             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 762         except:
 763             pass
 764     return upload_date
 765
 766 def determine_ext(url, default_ext=u'unknown_video'):
 767     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 768     if re.match(r'^[A-Za-z0-9]+$', guess):
 769         return guess
 770     else:
 771         return default_ext
 772
 773 def subtitles_filename(filename, sub_lang, sub_format):
 774     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 775
 776 def date_from_str(date_str):
 777     """
 778     Return a datetime object from a string in the format YYYYMMDD or
 779     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 780     today = datetime.date.today()
 781     if date_str == 'now'or date_str == 'today':
 782         return today
 783     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 784     if match is not None:
 785         sign = match.group('sign')
 786         time = int(match.group('time'))
 787         if sign == '-':
 788             time = -time
 789         unit = match.group('unit')
 790         #A bad aproximation?
 791         if unit == 'month':
 792             unit = 'day'
 793             time *= 30
 794         elif unit == 'year':
 795             unit = 'day'
 796             time *= 365
 797         unit += 's'
 798         delta = datetime.timedelta(**{unit: time})
 799         return today + delta
 800     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 801
 802 class DateRange(object):
 803     """Represents a time interval between two dates"""
 804     def __init__(self, start=None, end=None):
 805         """start and end must be strings in the format accepted by date"""
 806         if start is not None:
 807             self.start = date_from_str(start)
 808         else:
 809             self.start = datetime.datetime.min.date()
 810         if end is not None:
 811             self.end = date_from_str(end)
 812         else:
 813             self.end = datetime.datetime.max.date()
 814         if self.start > self.end:
 815             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 816     @classmethod
 817     def day(cls, day):
 818         """Returns a range that only contains the given day"""
 819         return cls(day,day)
 820     def __contains__(self, date):
 821         """Check if the date is in the range"""
 822         if not isinstance(date, datetime.date):
 823             date = date_from_str(date)
 824         return self.start <= date <= self.end
 825     def __str__(self):
 826         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 827
 828
 829 def platform_name():
 830     """ Returns the platform name as a compat_str """
 831     res = platform.platform()
 832     if isinstance(res, bytes):
 833         res = res.decode(preferredencoding())
 834
 835     assert isinstance(res, compat_str)
 836     return res
 837
 838
 839 def write_string(s, out=None):
 840     if out is None:
 841         out = sys.stderr
 842     assert type(s) == type(u'')
 843
 844     if ('b' in getattr(out, 'mode', '') or
 845             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 846         s = s.encode(preferredencoding(), 'ignore')
 847     out.write(s)
 848     out.flush()
 849
 850
 851 def bytes_to_intlist(bs):
 852     if not bs:
 853         return []
 854     if isinstance(bs[0], int):  # Python 3
 855         return list(bs)
 856     else:
 857         return [ord(c) for c in bs]
 858
 859
 860 def intlist_to_bytes(xs):
 861     if not xs:
 862         return b''
 863     if isinstance(chr(0), bytes):  # Python 2
 864         return ''.join([chr(x) for x in xs])
 865     else:
 866         return bytes(xs)
 867
 868
 869 def get_cachedir(params={}):
 870     cache_root = os.environ.get('XDG_CACHE_HOME',
 871                                 os.path.expanduser('~/.cache'))
 872     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
 873
 874
# Cross-platform file locking
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) -- lock the open file object f
#   _unlock_file(f)          -- release a lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes description of the structure passed as the last argument of
    # LockFileEx/UnlockFileEx (see the argtypes below)
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    # Declare the prototypes of the two kernel32 functions used below
    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORD of the byte count handed to LockFileEx/UnlockFileEx;
    # together they are meant to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f; raise OSError if LockFileEx fails."""
        # The locked region starts at offset 0
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer on the file object: _unlock_file passes the very
        # same OVERLAPPED pointer to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 is presumably LOCKFILE_EXCLUSIVE_LOCK and 0x0 a shared
        # lock -- confirm against the LockFileEx documentation
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock the file object f; raise OSError if UnlockFileEx fails.

        f must have been locked with _lock_file before (the OVERLAPPED
        pointer stored by it is reused here).
        """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) lock on f."""
        # No LOCK_NB flag is passed to lockf here
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
 938
 939
 940 class locked_file(object):
 941     def __init__(self, filename, mode, encoding=None):
 942         assert mode in ['r', 'a', 'w']
 943         self.f = io.open(filename, mode, encoding=encoding)
 944         self.mode = mode
 945
 946     def __enter__(self):
 947         exclusive = self.mode != 'r'
 948         try:
 949             _lock_file(self.f, exclusive)
 950         except IOError:
 951             self.f.close()
 952             raise
 953         return self
 954
 955     def __exit__(self, etype, value, traceback):
 956         try:
 957             _unlock_file(self.f)
 958         finally:
 959             self.f.close()
 960
 961     def __iter__(self):
 962         return iter(self.f)
 963
 964     def write(self, *args):
 965         return self.f.write(*args)
 966
 967     def read(self, *args):
 968         return self.f.read(*args)
 969
 970
 971 def shell_quote(args):
 972     quoted_args = []
 973     encoding = sys.getfilesystemencoding()
 974     if encoding is None:
 975         encoding = 'utf-8'
 976     for a in args:
 977         if isinstance(a, bytes):
 978             # We may get a filename encoded with 'encodeFilename'
 979             a = a.decode(encoding)
 980         quoted_args.append(pipes.quote(a))
 981     return u' '.join(quoted_args)
 982
 983
 984 def takewhile_inclusive(pred, seq):
 985     """ Like itertools.takewhile, but include the latest evaluated element
 986         (the first element so that Not pred(e)) """
 987     for e in seq:
 988         yield e
 989         if not pred(e):
 990             return
 991
 992
 993 def smuggle_url(url, data):
 994     """ Pass additional data in a URL for internal use. """
 995
 996     sdata = compat_urllib_parse.urlencode(
 997         {u'__youtubedl_smuggle': json.dumps(data)})
 998     return url + u'#' + sdata
 999
1000
def unsmuggle_url(smug_url):
    """ Split a URL into the plain URL and the data smuggled in it.

    Returns (smug_url, None) when the URL does not contain the
    '#__youtubedl_smuggle' marker, otherwise (url, data) where data is the
    JSON-decoded payload found after the last '#'.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    pieces = smug_url.rpartition(u'#')
    url, sdata = pieces[0], pieces[2]
    query = compat_parse_qs(sdata)
    data = json.loads(query[u'__youtubedl_smuggle'][0])
    return url, data