_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import io
   6 import json
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import zlib
  12 import email.utils
  13 import json
  14
  15 try:
  16     import urllib.request as compat_urllib_request
  17 except ImportError: # Python 2
  18     import urllib2 as compat_urllib_request
  19
  20 try:
  21     import urllib.error as compat_urllib_error
  22 except ImportError: # Python 2
  23     import urllib2 as compat_urllib_error
  24
  25 try:
  26     import urllib.parse as compat_urllib_parse
  27 except ImportError: # Python 2
  28     import urllib as compat_urllib_parse
  29
  30 try:
  31     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  32 except ImportError: # Python 2
  33     from urlparse import urlparse as compat_urllib_parse_urlparse
  34
  35 try:
  36     import http.cookiejar as compat_cookiejar
  37 except ImportError: # Python 2
  38     import cookielib as compat_cookiejar
  39
  40 try:
  41     import html.entities as compat_html_entities
  42 except ImportError: # Python 2
  43     import htmlentitydefs as compat_html_entities
  44
  45 try:
  46     import html.parser as compat_html_parser
  47 except ImportError: # Python 2
  48     import HTMLParser as compat_html_parser
  49
  50 try:
  51     import http.client as compat_http_client
  52 except ImportError: # Python 2
  53     import httplib as compat_http_client
  54
  55 try:
  56     from subprocess import DEVNULL
  57     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  58 except ImportError:
  59     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  60
  61 try:
  62     from urllib.parse import parse_qs as compat_parse_qs
  63 except ImportError: # Python 2
  64     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  65     # Python 2's version is apparently totally broken
  66     def _unquote(string, encoding='utf-8', errors='replace'):
  67         if string == '':
  68             return string
  69         res = string.split('%')
  70         if len(res) == 1:
  71             return string
  72         if encoding is None:
  73             encoding = 'utf-8'
  74         if errors is None:
  75             errors = 'replace'
  76         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
  77         pct_sequence = b''
  78         string = res[0]
  79         for item in res[1:]:
  80             try:
  81                 if not item:
  82                     raise ValueError
  83                 pct_sequence += item[:2].decode('hex')
  84                 rest = item[2:]
  85                 if not rest:
  86                     # This segment was just a single percent-encoded character.
  87                     # May be part of a sequence of code units, so delay decoding.
  88                     # (Stored in pct_sequence).
  89                     continue
  90             except ValueError:
  91                 rest = '%' + item
  92             # Encountered non-percent-encoded characters. Flush the current
  93             # pct_sequence.
  94             string += pct_sequence.decode(encoding, errors) + rest
  95             pct_sequence = b''
  96         if pct_sequence:
  97             # Flush the final pct_sequence
  98             string += pct_sequence.decode(encoding, errors)
  99         return string
 100
 101     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 102                 encoding='utf-8', errors='replace'):
 103         qs, _coerce_result = qs, unicode
 104         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 105         r = []
 106         for name_value in pairs:
 107             if not name_value and not strict_parsing:
 108                 continue
 109             nv = name_value.split('=', 1)
 110             if len(nv) != 2:
 111                 if strict_parsing:
 112                     raise ValueError("bad query field: %r" % (name_value,))
 113                 # Handle case of a control-name with no equal sign
 114                 if keep_blank_values:
 115                     nv.append('')
 116                 else:
 117                     continue
 118             if len(nv[1]) or keep_blank_values:
 119                 name = nv[0].replace('+', ' ')
 120                 name = _unquote(name, encoding=encoding, errors=errors)
 121                 name = _coerce_result(name)
 122                 value = nv[1].replace('+', ' ')
 123                 value = _unquote(value, encoding=encoding, errors=errors)
 124                 value = _coerce_result(value)
 125                 r.append((name, value))
 126         return r
 127
 128     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 129                 encoding='utf-8', errors='replace'):
 130         parsed_result = {}
 131         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 132                         encoding=encoding, errors=errors)
 133         for name, value in pairs:
 134             if name in parsed_result:
 135                 parsed_result[name].append(value)
 136             else:
 137                 parsed_result[name] = [value]
 138         return parsed_result
 139
 140 try:
 141     compat_str = unicode # Python 2
 142 except NameError:
 143     compat_str = str
 144
 145 try:
 146     compat_chr = unichr # Python 2
 147 except NameError:
 148     compat_chr = chr
 149
 150 std_headers = {
 151     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
 152     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
 153     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 154     'Accept-Encoding': 'gzip, deflate',
 155     'Accept-Language': 'en-us,en;q=0.5',
 156 }
 157 def preferredencoding():
 158     """Get preferred encoding.
 159
 160     Returns the best encoding scheme for the system, based on
 161     locale.getpreferredencoding() and some further tweaks.
 162     """
 163     try:
 164         pref = locale.getpreferredencoding()
 165         u'TEST'.encode(pref)
 166     except:
 167         pref = 'UTF-8'
 168
 169     return pref
 170
 171 if sys.version_info < (3,0):
 172     def compat_print(s):
 173         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 174 else:
 175     def compat_print(s):
 176         assert type(s) == type(u'')
 177         print(s)
 178
 179 # In Python 2.x, json.dump expects a bytestream.
 180 # In Python 3.x, it writes to a character stream
 181 if sys.version_info < (3,0):
 182     def write_json_file(obj, fn):
 183         with open(fn, 'wb') as f:
 184             json.dump(obj, f)
 185 else:
 186     def write_json_file(obj, fn):
 187         with open(fn, 'w', encoding='utf-8') as f:
 188             json.dump(obj, f)
 189
 190
 191 def htmlentity_transform(matchobj):
 192     """Transforms an HTML entity to a character.
 193
 194     This function receives a match object and is intended to be used with
 195     the re.sub() function.
 196     """
 197     entity = matchobj.group(1)
 198
 199     # Known non-numeric HTML entity
 200     if entity in compat_html_entities.name2codepoint:
 201         return compat_chr(compat_html_entities.name2codepoint[entity])
 202
 203     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 204     if mobj is not None:
 205         numstr = mobj.group(1)
 206         if numstr.startswith(u'x'):
 207             base = 16
 208             numstr = u'0%s' % numstr
 209         else:
 210             base = 10
 211         return compat_chr(int(numstr, base))
 212
 213     # Unknown entity in name, return its literal representation
 214     return (u'&%s;' % entity)
 215
 216 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 217 class AttrParser(compat_html_parser.HTMLParser):
 218     """Modified HTMLParser that isolates a tag with the specified attribute"""
 219     def __init__(self, attribute, value):
 220         self.attribute = attribute
 221         self.value = value
 222         self.result = None
 223         self.started = False
 224         self.depth = {}
 225         self.html = None
 226         self.watch_startpos = False
 227         self.error_count = 0
 228         compat_html_parser.HTMLParser.__init__(self)
 229
 230     def error(self, message):
 231         if self.error_count > 10 or self.started:
 232             raise compat_html_parser.HTMLParseError(message, self.getpos())
 233         self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 234         self.error_count += 1
 235         self.goahead(1)
 236
 237     def loads(self, html):
 238         self.html = html
 239         self.feed(html)
 240         self.close()
 241
 242     def handle_starttag(self, tag, attrs):
 243         attrs = dict(attrs)
 244         if self.started:
 245             self.find_startpos(None)
 246         if self.attribute in attrs and attrs[self.attribute] == self.value:
 247             self.result = [tag]
 248             self.started = True
 249             self.watch_startpos = True
 250         if self.started:
 251             if not tag in self.depth: self.depth[tag] = 0
 252             self.depth[tag] += 1
 253
 254     def handle_endtag(self, tag):
 255         if self.started:
 256             if tag in self.depth: self.depth[tag] -= 1
 257             if self.depth[self.result[0]] == 0:
 258                 self.started = False
 259                 self.result.append(self.getpos())
 260
 261     def find_startpos(self, x):
 262         """Needed to put the start position of the result (self.result[1])
 263         after the opening tag with the requested id"""
 264         if self.watch_startpos:
 265             self.watch_startpos = False
 266             self.result.append(self.getpos())
 267     handle_entityref = handle_charref = handle_data = handle_comment = \
 268     handle_decl = handle_pi = unknown_decl = find_startpos
 269
 270     def get_result(self):
 271         if self.result is None:
 272             return None
 273         if len(self.result) != 3:
 274             return None
 275         lines = self.html.split('\n')
 276         lines = lines[self.result[1][0]-1:self.result[2][0]]
 277         lines[0] = lines[0][self.result[1][1]:]
 278         if len(lines) == 1:
 279             lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 280         lines[-1] = lines[-1][:self.result[2][1]]
 281         return '\n'.join(lines).strip()
 282
 283 def get_element_by_id(id, html):
 284     """Return the content of the tag with the specified ID in the passed HTML document"""
 285     return get_element_by_attribute("id", id, html)
 286
 287 def get_element_by_attribute(attribute, value, html):
 288     """Return the content of the tag with the specified attribute in the passed HTML document"""
 289     parser = AttrParser(attribute, value)
 290     try:
 291         parser.loads(html)
 292     except compat_html_parser.HTMLParseError:
 293         pass
 294     return parser.get_result()
 295
 296
 297 def clean_html(html):
 298     """Clean an HTML snippet into a readable string"""
 299     # Newline vs <br />
 300     html = html.replace('\n', ' ')
 301     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 302     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 303     # Strip html tags
 304     html = re.sub('<.*?>', '', html)
 305     # Replace html entities
 306     html = unescapeHTML(html)
 307     return html
 308
 309
 310 def sanitize_open(filename, open_mode):
 311     """Try to open the given filename, and slightly tweak it if this fails.
 312
 313     Attempts to open the given filename. If this fails, it tries to change
 314     the filename slightly, step by step, until it's either able to open it
 315     or it fails and raises a final exception, like the standard open()
 316     function.
 317
 318     It returns the tuple (stream, definitive_file_name).
 319     """
 320     try:
 321         if filename == u'-':
 322             if sys.platform == 'win32':
 323                 import msvcrt
 324                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 325             return (sys.stdout, filename)
 326         stream = open(encodeFilename(filename), open_mode)
 327         return (stream, filename)
 328     except (IOError, OSError) as err:
 329         # In case of error, try to remove win32 forbidden chars
 330         filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
 331
 332         # An exception here should be caught in the caller
 333         stream = open(encodeFilename(filename), open_mode)
 334         return (stream, filename)
 335
 336
 337 def timeconvert(timestr):
 338     """Convert RFC 2822 defined time string into system timestamp"""
 339     timestamp = None
 340     timetuple = email.utils.parsedate_tz(timestr)
 341     if timetuple is not None:
 342         timestamp = email.utils.mktime_tz(timetuple)
 343     return timestamp
 344
 345 def sanitize_filename(s, restricted=False, is_id=False):
 346     """Sanitizes a string so it could be used as part of a filename.
 347     If restricted is set, use a stricter subset of allowed characters.
 348     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 349     """
 350     def replace_insane(char):
 351         if char == '?' or ord(char) < 32 or ord(char) == 127:
 352             return ''
 353         elif char == '"':
 354             return '' if restricted else '\''
 355         elif char == ':':
 356             return '_-' if restricted else ' -'
 357         elif char in '\\/|*<>':
 358             return '_'
 359         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 360             return '_'
 361         if restricted and ord(char) > 127:
 362             return '_'
 363         return char
 364
 365     result = u''.join(map(replace_insane, s))
 366     if not is_id:
 367         while '__' in result:
 368             result = result.replace('__', '_')
 369         result = result.strip('_')
 370         # Common case of "Foreign band name - English song title"
 371         if restricted and result.startswith('-_'):
 372             result = result[2:]
 373         if not result:
 374             result = '_'
 375     return result
 376
 377 def orderedSet(iterable):
 378     """ Remove all duplicates from the input iterable """
 379     res = []
 380     for el in iterable:
 381         if el not in res:
 382             res.append(el)
 383     return res
 384
 385 def unescapeHTML(s):
 386     """
 387     @param s a string
 388     """
 389     assert type(s) == type(u'')
 390
 391     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 392     return result
 393
 394 def encodeFilename(s):
 395     """
 396     @param s The name of the file
 397     """
 398
 399     assert type(s) == type(u'')
 400
 401     # Python 3 has a Unicode API
 402     if sys.version_info >= (3, 0):
 403         return s
 404
 405     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 406         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 407         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 408         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 409         return s
 410     else:
 411         return s.encode(sys.getfilesystemencoding(), 'ignore')
 412
 413 def rsa_verify(message, signature, key):
 414     from struct import pack
 415     from hashlib import sha256
 416     from sys import version_info
 417     def b(x):
 418         if version_info[0] == 2: return x
 419         else: return x.encode('latin1')
 420     assert(type(message) == type(b('')))
 421     block_size = 0
 422     n = key[0]
 423     while n:
 424         block_size += 1
 425         n >>= 8
 426     signature = pow(int(signature, 16), key[1], key[0])
 427     raw_bytes = []
 428     while signature:
 429         raw_bytes.insert(0, pack("B", signature & 0xFF))
 430         signature >>= 8
 431     signature = (block_size - len(raw_bytes)) * b('\x00') + b('').join(raw_bytes)
 432     if signature[0:2] != b('\x00\x01'): return False
 433     signature = signature[2:]
 434     if not b('\x00') in signature: return False
 435     signature = signature[signature.index(b('\x00'))+1:]
 436     if not signature.startswith(b('\x30\x31\x30\x0D\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x01\x05\x00\x04\x20')): return False
 437     signature = signature[19:]
 438     if signature != sha256(message).digest(): return False
 439     return True
 440
 441 class DownloadError(Exception):
 442     """Download Error exception.
 443
 444     This exception may be thrown by FileDownloader objects if they are not
 445     configured to continue on errors. They will contain the appropriate
 446     error message.
 447     """
 448     pass
 449
 450
 451 class SameFileError(Exception):
 452     """Same File exception.
 453
 454     This exception will be thrown by FileDownloader objects if they detect
 455     multiple files would have to be downloaded to the same file on disk.
 456     """
 457     pass
 458
 459
 460 class PostProcessingError(Exception):
 461     """Post Processing exception.
 462
 463     This exception may be raised by PostProcessor's .run() method to
 464     indicate an error in the postprocessing task.
 465     """
 466     pass
 467
 468 class MaxDownloadsReached(Exception):
 469     """ --max-downloads limit has been reached. """
 470     pass
 471
 472
 473 class UnavailableVideoError(Exception):
 474     """Unavailable Format exception.
 475
 476     This exception will be thrown when a video is requested
 477     in a format that is not available for that video.
 478     """
 479     pass
 480
 481
 482 class ContentTooShortError(Exception):
 483     """Content Too Short exception.
 484
 485     This exception may be raised by FileDownloader objects when a file they
 486     download is too small for what the server announced first, indicating
 487     the connection was probably interrupted.
 488     """
 489     # Both in bytes
 490     downloaded = None
 491     expected = None
 492
 493     def __init__(self, downloaded, expected):
 494         self.downloaded = downloaded
 495         self.expected = expected
 496
 497 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 498     """Handler for HTTP requests and responses.
 499
 500     This class, when installed with an OpenerDirector, automatically adds
 501     the standard headers to every HTTP request and handles gzipped and
 502     deflated responses from web servers. If compression is to be avoided in
 503     a particular request, the original request in the program code only has
 504     to include the HTTP header "Youtubedl-No-Compression", which will be
 505     removed before making the real request.
 506
 507     Part of this code was copied from:
 508
 509     http://techknack.net/python-urllib2-handlers/
 510
 511     Andrew Rowls, the author of that code, agreed to release it to the
 512     public domain.
 513     """
 514
 515     @staticmethod
 516     def deflate(data):
 517         try:
 518             return zlib.decompress(data, -zlib.MAX_WBITS)
 519         except zlib.error:
 520             return zlib.decompress(data)
 521
 522     @staticmethod
 523     def addinfourl_wrapper(stream, headers, url, code):
 524         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 525             return compat_urllib_request.addinfourl(stream, headers, url, code)
 526         ret = compat_urllib_request.addinfourl(stream, headers, url)
 527         ret.code = code
 528         return ret
 529
 530     def http_request(self, req):
 531         for h in std_headers:
 532             if h in req.headers:
 533                 del req.headers[h]
 534             req.add_header(h, std_headers[h])
 535         if 'Youtubedl-no-compression' in req.headers:
 536             if 'Accept-encoding' in req.headers:
 537                 del req.headers['Accept-encoding']
 538             del req.headers['Youtubedl-no-compression']
 539         return req
 540
 541     def http_response(self, req, resp):
 542         old_resp = resp
 543         # gzip
 544         if resp.headers.get('Content-encoding', '') == 'gzip':
 545             gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 546             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 547             resp.msg = old_resp.msg
 548         # deflate
 549         if resp.headers.get('Content-encoding', '') == 'deflate':
 550             gz = io.BytesIO(self.deflate(resp.read()))
 551             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 552             resp.msg = old_resp.msg
 553         return resp
 554
 555     https_request = http_request
 556     https_response = http_response