_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import email.utils
   6 import errno
   7 import gzip
   8 import io
   9 import json
  10 import locale
  11 import os
  12 import pipes
  13 import platform
  14 import re
  15 import socket
  16 import sys
  17 import traceback
  18 import zlib
  19
  20 try:
  21     import urllib.request as compat_urllib_request
  22 except ImportError: # Python 2
  23     import urllib2 as compat_urllib_request
  24
  25 try:
  26     import urllib.error as compat_urllib_error
  27 except ImportError: # Python 2
  28     import urllib2 as compat_urllib_error
  29
  30 try:
  31     import urllib.parse as compat_urllib_parse
  32 except ImportError: # Python 2
  33     import urllib as compat_urllib_parse
  34
  35 try:
  36     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  37 except ImportError: # Python 2
  38     from urlparse import urlparse as compat_urllib_parse_urlparse
  39
  40 try:
  41     import urllib.parse as compat_urlparse
  42 except ImportError: # Python 2
  43     import urlparse as compat_urlparse
  44
  45 try:
  46     import http.cookiejar as compat_cookiejar
  47 except ImportError: # Python 2
  48     import cookielib as compat_cookiejar
  49
  50 try:
  51     import html.entities as compat_html_entities
  52 except ImportError: # Python 2
  53     import htmlentitydefs as compat_html_entities
  54
  55 try:
  56     import html.parser as compat_html_parser
  57 except ImportError: # Python 2
  58     import HTMLParser as compat_html_parser
  59
  60 try:
  61     import http.client as compat_http_client
  62 except ImportError: # Python 2
  63     import httplib as compat_http_client
  64
  65 try:
  66     from urllib.error import HTTPError as compat_HTTPError
  67 except ImportError:  # Python 2
  68     from urllib2 import HTTPError as compat_HTTPError
  69
  70 try:
  71     from urllib.request import urlretrieve as compat_urlretrieve
  72 except ImportError:  # Python 2
  73     from urllib import urlretrieve as compat_urlretrieve
  74
  75
# compat_subprocess_get_DEVNULL() returns something usable as a "discard"
# target for subprocess streams.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    # subprocess.DEVNULL is not importable here: fall back to opening
    # os.devnull for writing. Note that every call opens a new file object.
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  81
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Consecutive escapes are collected as raw bytes and decoded together
        with the given encoding/errors, so multi-byte sequences survive.
        Malformed escapes are kept literally (including the '%').
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 only: str.decode('hex') turns two hex digits into a byte
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: keep the text, restoring the '%' that
                # split() removed
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'. Fields without a value are
        dropped unless keep_blank_values is set; with strict_parsing a field
        without '=' raises ValueError.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' encodes a space in query strings; undo that before
                # resolving the %xx escapes
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to the list of
        its values, in order of appearance."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
 160
# Text type: `unicode` where that builtin exists, `str` otherwise
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: `unichr` where that builtin
# exists, `chr` otherwise
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 170
 171 def compat_ord(c):
 172     if type(c) is int: return c
 173     else: return ord(c)
 174
 175 # This is not clearly defined otherwise
 176 compiled_regex_type = type(re.compile(''))
 177
# Default HTTP headers; YoutubeDLHandler.http_request() adds each of these to
# every request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 185
 186 def preferredencoding():
 187     """Get preferred encoding.
 188
 189     Returns the best encoding scheme for the system, based on
 190     locale.getpreferredencoding() and some further tweaks.
 191     """
 192     try:
 193         pref = locale.getpreferredencoding()
 194         u'TEST'.encode(pref)
 195     except:
 196         pref = 'UTF-8'
 197
 198     return pref
 199
 200 if sys.version_info < (3,0):
 201     def compat_print(s):
 202         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 203 else:
 204     def compat_print(s):
 205         assert type(s) == type(u'')
 206         print(s)
 207
 208 # In Python 2.x, json.dump expects a bytestream.
 209 # In Python 3.x, it writes to a character stream
 210 if sys.version_info < (3,0):
 211     def write_json_file(obj, fn):
 212         with open(fn, 'wb') as f:
 213             json.dump(obj, f)
 214 else:
 215     def write_json_file(obj, fn):
 216         with open(fn, 'w', encoding='utf-8') as f:
 217             json.dump(obj, f)
 218
 219 if sys.version_info >= (2,7):
 220     def find_xpath_attr(node, xpath, key, val):
 221         """ Find the xpath xpath[@key=val] """
 222         assert re.match(r'^[a-zA-Z]+$', key)
 223         assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
 224         expr = xpath + u"[@%s='%s']" % (key, val)
 225         return node.find(expr)
 226 else:
 227     def find_xpath_attr(node, xpath, key, val):
 228         for f in node.findall(xpath):
 229             if f.attrib.get(key) == val:
 230                 return f
 231         return None
 232
 233 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 234 # the namespace parameter
 235 def xpath_with_ns(path, ns_map):
 236     components = [c.split(':') for c in path.split('/')]
 237     replaced = []
 238     for c in components:
 239         if len(c) == 1:
 240             replaced.append(c[0])
 241         else:
 242             ns, tag = c
 243             replaced.append('{%s}%s' % (ns_map[ns], tag))
 244     return '/'.join(replaced)
 245
 246 def htmlentity_transform(matchobj):
 247     """Transforms an HTML entity to a character.
 248
 249     This function receives a match object and is intended to be used with
 250     the re.sub() function.
 251     """
 252     entity = matchobj.group(1)
 253
 254     # Known non-numeric HTML entity
 255     if entity in compat_html_entities.name2codepoint:
 256         return compat_chr(compat_html_entities.name2codepoint[entity])
 257
 258     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 259     if mobj is not None:
 260         numstr = mobj.group(1)
 261         if numstr.startswith(u'x'):
 262             base = 16
 263             numstr = u'0%s' % numstr
 264         else:
 265             base = 10
 266         return compat_chr(int(numstr, base))
 267
 268     # Unknown entity in name, return its literal representation
 269     return (u'&%s;' % entity)
 270
 271 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 272 class BaseHTMLParser(compat_html_parser.HTMLParser):
 273     def __init(self):
 274         compat_html_parser.HTMLParser.__init__(self)
 275         self.html = None
 276
 277     def loads(self, html):
 278         self.html = html
 279         self.feed(html)
 280         self.close()
 281
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    While parsing, self.result grows into [tag, start_pos, end_pos], where
    the positions are (line, column) pairs as returned by getpos();
    get_result() then cuts the text between them out of self.html.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None
        # True while the parser is inside the matched element
        self.started = False
        # Per tag name: number of currently open tags seen since the match
        self.depth = {}
        # True until the position right after the matched opening tag
        # has been recorded
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by resuming after the current line.

        Gives up (raises HTMLParseError) after more than 10 errors, or when
        the error happens inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # Content of the matched element begins here at the latest
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matched element is closed once the open count of its own
            # tag name drops back to zero
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any other kind of content following the opening tag marks the start
    # position as well
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None if no complete match was recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # The slice bounds treat the recorded line numbers as 1-based
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column has to be
            # shifted by what was just cut off at the front
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
 341 # Hack for https://github.com/rg3/youtube-dl/issues/662
 342 if sys.version_info < (2, 7, 3):
 343     AttrParser.parse_endtag = (lambda self, i:
 344         i + len("</scr'+'ipt>")
 345         if self.rawdata[i:].startswith("</scr'+'ipt>")
 346         else compat_html_parser.HTMLParser.parse_endtag(self, i))
 347
 348 def get_element_by_id(id, html):
 349     """Return the content of the tag with the specified ID in the passed HTML document"""
 350     return get_element_by_attribute("id", id, html)
 351
 352 def get_element_by_attribute(attribute, value, html):
 353     """Return the content of the tag with the specified attribute in the passed HTML document"""
 354     parser = AttrParser(attribute, value)
 355     try:
 356         parser.loads(html)
 357     except compat_html_parser.HTMLParseError:
 358         pass
 359     return parser.get_result()
 360
 361 class MetaParser(BaseHTMLParser):
 362     """
 363     Modified HTMLParser that isolates a meta tag with the specified name
 364     attribute.
 365     """
 366     def __init__(self, name):
 367         BaseHTMLParser.__init__(self)
 368         self.name = name
 369         self.content = None
 370         self.result = None
 371
 372     def handle_starttag(self, tag, attrs):
 373         if tag != 'meta':
 374             return
 375         attrs = dict(attrs)
 376         if attrs.get('name') == self.name:
 377             self.result = attrs.get('content')
 378
 379     def get_result(self):
 380         return self.result
 381
 382 def get_meta_content(name, html):
 383     """
 384     Return the content attribute from the meta tag with the given name attribute.
 385     """
 386     parser = MetaParser(name)
 387     try:
 388         parser.loads(html)
 389     except compat_html_parser.HTMLParseError:
 390         pass
 391     return parser.get_result()
 392
 393
 394 def clean_html(html):
 395     """Clean an HTML snippet into a readable string"""
 396     # Newline vs <br />
 397     html = html.replace('\n', ' ')
 398     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 399     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 400     # Strip html tags
 401     html = re.sub('<.*?>', '', html)
 402     # Replace html entities
 403     html = unescapeHTML(html)
 404     return html.strip()
 405
 406
 407 def sanitize_open(filename, open_mode):
 408     """Try to open the given filename, and slightly tweak it if this fails.
 409
 410     Attempts to open the given filename. If this fails, it tries to change
 411     the filename slightly, step by step, until it's either able to open it
 412     or it fails and raises a final exception, like the standard open()
 413     function.
 414
 415     It returns the tuple (stream, definitive_file_name).
 416     """
 417     try:
 418         if filename == u'-':
 419             if sys.platform == 'win32':
 420                 import msvcrt
 421                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 422             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 423         stream = open(encodeFilename(filename), open_mode)
 424         return (stream, filename)
 425     except (IOError, OSError) as err:
 426         if err.errno in (errno.EACCES,):
 427             raise
 428
 429         # In case of error, try to remove win32 forbidden chars
 430         alt_filename = os.path.join(
 431                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 432                         for path_part in os.path.split(filename)
 433                        )
 434         if alt_filename == filename:
 435             raise
 436         else:
 437             # An exception here should be caught in the caller
 438             stream = open(encodeFilename(filename), open_mode)
 439             return (stream, alt_filename)
 440
 441
 442 def timeconvert(timestr):
 443     """Convert RFC 2822 defined time string into system timestamp"""
 444     timestamp = None
 445     timetuple = email.utils.parsedate_tz(timestr)
 446     if timetuple is not None:
 447         timestamp = email.utils.mktime_tz(timetuple)
 448     return timestamp
 449
 450 def sanitize_filename(s, restricted=False, is_id=False):
 451     """Sanitizes a string so it could be used as part of a filename.
 452     If restricted is set, use a stricter subset of allowed characters.
 453     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 454     """
 455     def replace_insane(char):
 456         if char == '?' or ord(char) < 32 or ord(char) == 127:
 457             return ''
 458         elif char == '"':
 459             return '' if restricted else '\''
 460         elif char == ':':
 461             return '_-' if restricted else ' -'
 462         elif char in '\\/|*<>':
 463             return '_'
 464         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 465             return '_'
 466         if restricted and ord(char) > 127:
 467             return '_'
 468         return char
 469
 470     result = u''.join(map(replace_insane, s))
 471     if not is_id:
 472         while '__' in result:
 473             result = result.replace('__', '_')
 474         result = result.strip('_')
 475         # Common case of "Foreign band name - English song title"
 476         if restricted and result.startswith('-_'):
 477             result = result[2:]
 478         if not result:
 479             result = '_'
 480     return result
 481
 482 def orderedSet(iterable):
 483     """ Remove all duplicates from the input iterable """
 484     res = []
 485     for el in iterable:
 486         if el not in res:
 487             res.append(el)
 488     return res
 489
 490 def unescapeHTML(s):
 491     """
 492     @param s a string
 493     """
 494     assert type(s) == type(u'')
 495
 496     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 497     return result
 498
 499 def encodeFilename(s):
 500     """
 501     @param s The name of the file
 502     """
 503
 504     assert type(s) == type(u'')
 505
 506     # Python 3 has a Unicode API
 507     if sys.version_info >= (3, 0):
 508         return s
 509
 510     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 511         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 512         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 513         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 514         return s
 515     else:
 516         encoding = sys.getfilesystemencoding()
 517         if encoding is None:
 518             encoding = 'utf-8'
 519         return s.encode(encoding, 'ignore')
 520
 521 def decodeOption(optval):
 522     if optval is None:
 523         return optval
 524     if isinstance(optval, bytes):
 525         optval = optval.decode(preferredencoding())
 526
 527     assert isinstance(optval, compat_str)
 528     return optval
 529
 530 def formatSeconds(secs):
 531     if secs > 3600:
 532         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 533     elif secs > 60:
 534         return '%d:%02d' % (secs // 60, secs % 60)
 535     else:
 536         return '%d' % secs
 537
 538 def make_HTTPS_handler(opts):
 539     if sys.version_info < (3,2):
 540         # Python's 2.x handler is very simplistic
 541         return compat_urllib_request.HTTPSHandler()
 542     else:
 543         import ssl
 544         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 545         context.set_default_verify_paths()
 546
 547         context.verify_mode = (ssl.CERT_NONE
 548                                if opts.no_check_certificate
 549                                else ssl.CERT_REQUIRED)
 550         return compat_urllib_request.HTTPSHandler(context=context)
 551
 552 class ExtractorError(Exception):
 553     """Error during info extraction."""
 554     def __init__(self, msg, tb=None, expected=False, cause=None):
 555         """ tb, if given, is the original traceback (so that it can be printed out).
 556         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 557         """
 558
 559         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 560             expected = True
 561         if not expected:
 562             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 563         super(ExtractorError, self).__init__(msg)
 564
 565         self.traceback = tb
 566         self.exc_info = sys.exc_info()  # preserve original exception
 567         self.cause = cause
 568
 569     def format_traceback(self):
 570         if self.traceback is None:
 571             return None
 572         return u''.join(traceback.format_tb(self.traceback))
 573
 574
 575 class RegexNotFoundError(ExtractorError):
 576     """Error when a regex didn't match"""
 577     pass
 578
 579
 580 class DownloadError(Exception):
 581     """Download Error exception.
 582
 583     This exception may be thrown by FileDownloader objects if they are not
 584     configured to continue on errors. They will contain the appropriate
 585     error message.
 586     """
 587     def __init__(self, msg, exc_info=None):
 588         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 589         super(DownloadError, self).__init__(msg)
 590         self.exc_info = exc_info
 591
 592
 593 class SameFileError(Exception):
 594     """Same File exception.
 595
 596     This exception will be thrown by FileDownloader objects if they detect
 597     multiple files would have to be downloaded to the same file on disk.
 598     """
 599     pass
 600
 601
 602 class PostProcessingError(Exception):
 603     """Post Processing exception.
 604
 605     This exception may be raised by PostProcessor's .run() method to
 606     indicate an error in the postprocessing task.
 607     """
 608     def __init__(self, msg):
 609         self.msg = msg
 610
 611 class MaxDownloadsReached(Exception):
 612     """ --max-downloads limit has been reached. """
 613     pass
 614
 615
 616 class UnavailableVideoError(Exception):
 617     """Unavailable Format exception.
 618
 619     This exception will be thrown when a video is requested
 620     in a format that is not available for that video.
 621     """
 622     pass
 623
 624
 625 class ContentTooShortError(Exception):
 626     """Content Too Short exception.
 627
 628     This exception may be raised by FileDownloader objects when a file they
 629     download is too small for what the server announced first, indicating
 630     the connection was probably interrupted.
 631     """
 632     # Both in bytes
 633     downloaded = None
 634     expected = None
 635
 636     def __init__(self, downloaded, expected):
 637         self.downloaded = downloaded
 638         self.expected = expected
 639
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a
        zlib-wrapped stream if that fails."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, also on
        versions whose addinfourl does not take a code argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Apply std_headers and process the internal Youtubedl-* headers."""
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal marker header: drop Accept-encoding for this request
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: its value replaces the User-agent
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return resp with a gzip/deflate encoded body decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts succeeds, report the original error
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
 720
 721 def unified_strdate(date_str):
 722     """Return a string with the date in the format YYYYMMDD"""
 723     upload_date = None
 724     #Replace commas
 725     date_str = date_str.replace(',',' ')
 726     # %z (UTC offset) is only supported in python>=3.2
 727     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
 728     format_expressions = [
 729         '%d %B %Y',
 730         '%B %d %Y',
 731         '%b %d %Y',
 732         '%Y-%m-%d',
 733         '%d/%m/%Y',
 734         '%Y/%m/%d %H:%M:%S',
 735         '%d.%m.%Y %H:%M',
 736         '%Y-%m-%dT%H:%M:%SZ',
 737         '%Y-%m-%dT%H:%M:%S',
 738     ]
 739     for expression in format_expressions:
 740         try:
 741             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 742         except:
 743             pass
 744     return upload_date
 745
 746 def determine_ext(url, default_ext=u'unknown_video'):
 747     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 748     if re.match(r'^[A-Za-z0-9]+$', guess):
 749         return guess
 750     else:
 751         return default_ext
 752
 753 def subtitles_filename(filename, sub_lang, sub_format):
 754     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 755
 756 def date_from_str(date_str):
 757     """
 758     Return a datetime object from a string in the format YYYYMMDD or
 759     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 760     today = datetime.date.today()
 761     if date_str == 'now'or date_str == 'today':
 762         return today
 763     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 764     if match is not None:
 765         sign = match.group('sign')
 766         time = int(match.group('time'))
 767         if sign == '-':
 768             time = -time
 769         unit = match.group('unit')
 770         #A bad aproximation?
 771         if unit == 'month':
 772             unit = 'day'
 773             time *= 30
 774         elif unit == 'year':
 775             unit = 'day'
 776             time *= 365
 777         unit += 's'
 778         delta = datetime.timedelta(**{unit: time})
 779         return today + delta
 780     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 781
 782 class DateRange(object):
 783     """Represents a time interval between two dates"""
 784     def __init__(self, start=None, end=None):
 785         """start and end must be strings in the format accepted by date"""
 786         if start is not None:
 787             self.start = date_from_str(start)
 788         else:
 789             self.start = datetime.datetime.min.date()
 790         if end is not None:
 791             self.end = date_from_str(end)
 792         else:
 793             self.end = datetime.datetime.max.date()
 794         if self.start > self.end:
 795             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 796     @classmethod
 797     def day(cls, day):
 798         """Returns a range that only contains the given day"""
 799         return cls(day,day)
 800     def __contains__(self, date):
 801         """Check if the date is in the range"""
 802         if not isinstance(date, datetime.date):
 803             date = date_from_str(date)
 804         return self.start <= date <= self.end
 805     def __str__(self):
 806         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 807
 808
 809 def platform_name():
 810     """ Returns the platform name as a compat_str """
 811     res = platform.platform()
 812     if isinstance(res, bytes):
 813         res = res.decode(preferredencoding())
 814
 815     assert isinstance(res, compat_str)
 816     return res
 817
 818
 819 def write_string(s, out=None):
 820     if out is None:
 821         out = sys.stderr
 822     assert type(s) == type(u'')
 823
 824     if ('b' in getattr(out, 'mode', '') or
 825             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 826         s = s.encode(preferredencoding(), 'ignore')
 827     out.write(s)
 828     out.flush()
 829
 830
 831 def bytes_to_intlist(bs):
 832     if not bs:
 833         return []
 834     if isinstance(bs[0], int):  # Python 3
 835         return list(bs)
 836     else:
 837         return [ord(c) for c in bs]
 838
 839
 840 def intlist_to_bytes(xs):
 841     if not xs:
 842         return b''
 843     if isinstance(chr(0), bytes):  # Python 2
 844         return ''.join([chr(x) for x in xs])
 845     else:
 846         return bytes(xs)
 847
 848
 849 def get_cachedir(params={}):
 850     cache_root = os.environ.get('XDG_CACHE_HOME',
 851                                 os.path.expanduser('~/.cache'))
 852     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
 853
 854
# Cross-platform file locking
#
# Defines _lock_file(f, exclusive) and _unlock_file(f) with one implementation
# per platform: the Win32 LockFileEx/UnlockFileEx functions (called through
# ctypes) on Windows, and fcntl.lockf() everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the structure passed as the last argument of
    # LockFileEx and UnlockFileEx.
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    # Declare argument and result types of both functions explicitly so that
    # ctypes converts the Python values passed below.
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count handed to both calls;
    # presumably chosen large enough to cover a whole file of any size.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f with LockFileEx.

        The dwFlags argument is 0x2 when exclusive is true and 0x0 otherwise.
        Raises OSError if the call reports failure.
        """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file() can pass the same
        # pointer again.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file() on the open file f.

        Raises OSError if UnlockFileEx reports failure.
        """
        # Only valid after _lock_file() has stored the OVERLAPPED pointer
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f with fcntl.lockf(): LOCK_EX if exclusive, else LOCK_SH."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the fcntl.lockf() lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
 918
 919
 920 class locked_file(object):
 921     def __init__(self, filename, mode, encoding=None):
 922         assert mode in ['r', 'a', 'w']
 923         self.f = io.open(filename, mode, encoding=encoding)
 924         self.mode = mode
 925
 926     def __enter__(self):
 927         exclusive = self.mode != 'r'
 928         try:
 929             _lock_file(self.f, exclusive)
 930         except IOError:
 931             self.f.close()
 932             raise
 933         return self
 934
 935     def __exit__(self, etype, value, traceback):
 936         try:
 937             _unlock_file(self.f)
 938         finally:
 939             self.f.close()
 940
 941     def __iter__(self):
 942         return iter(self.f)
 943
 944     def write(self, *args):
 945         return self.f.write(*args)
 946
 947     def read(self, *args):
 948         return self.f.read(*args)
 949
 950
 951 def shell_quote(args):
 952     return ' '.join(map(pipes.quote, args))
 953
 954
 955 def takewhile_inclusive(pred, seq):
 956     """ Like itertools.takewhile, but include the latest evaluated element
 957         (the first element so that Not pred(e)) """
 958     for e in seq:
 959         yield e
 960         if not pred(e):
 961             return
 962
 963
 964 def smuggle_url(url, data):
 965     """ Pass additional data in a URL for internal use. """
 966
 967     sdata = compat_urllib_parse.urlencode(
 968         {u'__youtubedl_smuggle': json.dumps(data)})
 969     return url + u'#' + sdata
 970
 971
 972 def unsmuggle_url(smug_url):
 973     if not '#__youtubedl_smuggle' in smug_url:
 974         return smug_url, None
 975     url, _, sdata = smug_url.rpartition(u'#')
 976     jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
 977     data = json.loads(jsond)
 978     return url, data