_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import io
   6 import locale
   7 import os
   8 import re
   9 import sys
  10 import zlib
  11 import email.utils
  12 import json
  13
  14 try:
  15     import urllib.request as compat_urllib_request
  16 except ImportError: # Python 2
  17     import urllib2 as compat_urllib_request
  18
  19 try:
  20     import urllib.error as compat_urllib_error
  21 except ImportError: # Python 2
  22     import urllib2 as compat_urllib_error
  23
  24 try:
  25     import urllib.parse as compat_urllib_parse
  26 except ImportError: # Python 2
  27     import urllib as compat_urllib_parse
  28
  29 try:
  30     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  31 except ImportError: # Python 2
  32     from urlparse import urlparse as compat_urllib_parse_urlparse
  33
  34 try:
  35     import http.cookiejar as compat_cookiejar
  36 except ImportError: # Python 2
  37     import cookielib as compat_cookiejar
  38
  39 try:
  40     import html.entities as compat_html_entities
  41 except ImportError: # Python 2
  42     import htmlentitydefs as compat_html_entities
  43
  44 try:
  45     import html.parser as compat_html_parser
  46 except ImportError: # Python 2
  47     import HTMLParser as compat_html_parser
  48
  49 try:
  50     import http.client as compat_http_client
  51 except ImportError: # Python 2
  52     import httplib as compat_http_client
  53
  54 try:
  55     from urllib.parse import parse_qs as compat_parse_qs
  56 except ImportError: # Python 2
  57     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  58     # Python 2's version is apparently totally broken
  59     def _unquote(string, encoding='utf-8', errors='replace'):
  60         if string == '':
  61             return string
  62         res = string.split('%')
  63         if len(res) == 1:
  64             return string
  65         if encoding is None:
  66             encoding = 'utf-8'
  67         if errors is None:
  68             errors = 'replace'
  69         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
  70         pct_sequence = b''
  71         string = res[0]
  72         for item in res[1:]:
  73             try:
  74                 if not item:
  75                     raise ValueError
  76                 pct_sequence += item[:2].decode('hex')
  77                 rest = item[2:]
  78                 if not rest:
  79                     # This segment was just a single percent-encoded character.
  80                     # May be part of a sequence of code units, so delay decoding.
  81                     # (Stored in pct_sequence).
  82                     continue
  83             except ValueError:
  84                 rest = '%' + item
  85             # Encountered non-percent-encoded characters. Flush the current
  86             # pct_sequence.
  87             string += pct_sequence.decode(encoding, errors) + rest
  88             pct_sequence = b''
  89         if pct_sequence:
  90             # Flush the final pct_sequence
  91             string += pct_sequence.decode(encoding, errors)
  92         return string
  93
  94     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
  95                 encoding='utf-8', errors='replace'):
  96         qs, _coerce_result = qs, unicode
  97         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
  98         r = []
  99         for name_value in pairs:
 100             if not name_value and not strict_parsing:
 101                 continue
 102             nv = name_value.split('=', 1)
 103             if len(nv) != 2:
 104                 if strict_parsing:
 105                     raise ValueError("bad query field: %r" % (name_value,))
 106                 # Handle case of a control-name with no equal sign
 107                 if keep_blank_values:
 108                     nv.append('')
 109                 else:
 110                     continue
 111             if len(nv[1]) or keep_blank_values:
 112                 name = nv[0].replace('+', ' ')
 113                 name = _unquote(name, encoding=encoding, errors=errors)
 114                 name = _coerce_result(name)
 115                 value = nv[1].replace('+', ' ')
 116                 value = _unquote(value, encoding=encoding, errors=errors)
 117                 value = _coerce_result(value)
 118                 r.append((name, value))
 119         return r
 120
 121     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 122                 encoding='utf-8', errors='replace'):
 123         parsed_result = {}
 124         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 125                         encoding=encoding, errors=errors)
 126         for name, value in pairs:
 127             if name in parsed_result:
 128                 parsed_result[name].append(value)
 129             else:
 130                 parsed_result[name] = [value]
 131         return parsed_result
 132
# Text type alias: the `unicode` builtin where it exists (Python 2),
# otherwise `str`.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point to character alias: `unichr` where it exists (Python 2),
# otherwise `chr`.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 142
# Default HTTP headers. YoutubeDLHandler.http_request adds every entry of
# this dict to each request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 150 def preferredencoding():
 151     """Get preferred encoding.
 152
 153     Returns the best encoding scheme for the system, based on
 154     locale.getpreferredencoding() and some further tweaks.
 155     """
 156     try:
 157         pref = locale.getpreferredencoding()
 158         u'TEST'.encode(pref)
 159     except:
 160         pref = 'UTF-8'
 161
 162     return pref
 163
 164 if sys.version_info < (3,0):
 165     def compat_print(s):
 166         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 167 else:
 168     def compat_print(s):
 169         assert type(s) == type(u'')
 170         print(s)
 171
 172 def htmlentity_transform(matchobj):
 173     """Transforms an HTML entity to a character.
 174
 175     This function receives a match object and is intended to be used with
 176     the re.sub() function.
 177     """
 178     entity = matchobj.group(1)
 179
 180     # Known non-numeric HTML entity
 181     if entity in compat_html_entities.name2codepoint:
 182         return compat_chr(compat_html_entities.name2codepoint[entity])
 183
 184     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 185     if mobj is not None:
 186         numstr = mobj.group(1)
 187         if numstr.startswith(u'x'):
 188             base = 16
 189             numstr = u'0%s' % numstr
 190         else:
 191             base = 10
 192         return compat_chr(int(numstr, base))
 193
 194     # Unknown entity in name, return its literal representation
 195     return (u'&%s;' % entity)
 196
# Replace the html parser module's `locatestarttagend` pattern with the one
# below (marked by its author as a backported bugfix).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 198 class IDParser(compat_html_parser.HTMLParser):
 199     """Modified HTMLParser that isolates a tag with the specified id"""
 200     def __init__(self, id):
 201         self.id = id
 202         self.result = None
 203         self.started = False
 204         self.depth = {}
 205         self.html = None
 206         self.watch_startpos = False
 207         self.error_count = 0
 208         compat_html_parser.HTMLParser.__init__(self)
 209
 210     def error(self, message):
 211         if self.error_count > 10 or self.started:
 212             raise compat_html_parser.HTMLParseError(message, self.getpos())
 213         self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 214         self.error_count += 1
 215         self.goahead(1)
 216
 217     def loads(self, html):
 218         self.html = html
 219         self.feed(html)
 220         self.close()
 221
 222     def handle_starttag(self, tag, attrs):
 223         attrs = dict(attrs)
 224         if self.started:
 225             self.find_startpos(None)
 226         if 'id' in attrs and attrs['id'] == self.id:
 227             self.result = [tag]
 228             self.started = True
 229             self.watch_startpos = True
 230         if self.started:
 231             if not tag in self.depth: self.depth[tag] = 0
 232             self.depth[tag] += 1
 233
 234     def handle_endtag(self, tag):
 235         if self.started:
 236             if tag in self.depth: self.depth[tag] -= 1
 237             if self.depth[self.result[0]] == 0:
 238                 self.started = False
 239                 self.result.append(self.getpos())
 240
 241     def find_startpos(self, x):
 242         """Needed to put the start position of the result (self.result[1])
 243         after the opening tag with the requested id"""
 244         if self.watch_startpos:
 245             self.watch_startpos = False
 246             self.result.append(self.getpos())
 247     handle_entityref = handle_charref = handle_data = handle_comment = \
 248     handle_decl = handle_pi = unknown_decl = find_startpos
 249
 250     def get_result(self):
 251         if self.result is None:
 252             return None
 253         if len(self.result) != 3:
 254             return None
 255         lines = self.html.split('\n')
 256         lines = lines[self.result[1][0]-1:self.result[2][0]]
 257         lines[0] = lines[0][self.result[1][1]:]
 258         if len(lines) == 1:
 259             lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 260         lines[-1] = lines[-1][:self.result[2][1]]
 261         return '\n'.join(lines).strip()
 262
 263 def get_element_by_id(id, html):
 264     """Return the content of the tag with the specified id in the passed HTML document"""
 265     parser = IDParser(id)
 266     try:
 267         parser.loads(html)
 268     except compat_html_parser.HTMLParseError:
 269         pass
 270     return parser.get_result()
 271
 272
 273 def clean_html(html):
 274     """Clean an HTML snippet into a readable string"""
 275     # Newline vs <br />
 276     html = html.replace('\n', ' ')
 277     html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 278     # Strip html tags
 279     html = re.sub('<.*?>', '', html)
 280     # Replace html entities
 281     html = unescapeHTML(html)
 282     return html
 283
 284
 285 def sanitize_open(filename, open_mode):
 286     """Try to open the given filename, and slightly tweak it if this fails.
 287
 288     Attempts to open the given filename. If this fails, it tries to change
 289     the filename slightly, step by step, until it's either able to open it
 290     or it fails and raises a final exception, like the standard open()
 291     function.
 292
 293     It returns the tuple (stream, definitive_file_name).
 294     """
 295     try:
 296         if filename == u'-':
 297             if sys.platform == 'win32':
 298                 import msvcrt
 299                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 300             return (sys.stdout, filename)
 301         stream = open(encodeFilename(filename), open_mode)
 302         return (stream, filename)
 303     except (IOError, OSError) as err:
 304         # In case of error, try to remove win32 forbidden chars
 305         filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
 306
 307         # An exception here should be caught in the caller
 308         stream = open(encodeFilename(filename), open_mode)
 309         return (stream, filename)
 310
 311
 312 def timeconvert(timestr):
 313     """Convert RFC 2822 defined time string into system timestamp"""
 314     timestamp = None
 315     timetuple = email.utils.parsedate_tz(timestr)
 316     if timetuple is not None:
 317         timestamp = email.utils.mktime_tz(timetuple)
 318     return timestamp
 319
 320 def sanitize_filename(s, restricted=False, is_id=False):
 321     """Sanitizes a string so it could be used as part of a filename.
 322     If restricted is set, use a stricter subset of allowed characters.
 323     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 324     """
 325     def replace_insane(char):
 326         if char == '?' or ord(char) < 32 or ord(char) == 127:
 327             return ''
 328         elif char == '"':
 329             return '' if restricted else '\''
 330         elif char == ':':
 331             return '_-' if restricted else ' -'
 332         elif char in '\\/|*<>':
 333             return '_'
 334         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 335             return '_'
 336         if restricted and ord(char) > 127:
 337             return '_'
 338         return char
 339
 340     result = u''.join(map(replace_insane, s))
 341     if not is_id:
 342         while '__' in result:
 343             result = result.replace('__', '_')
 344         result = result.strip('_')
 345         # Common case of "Foreign band name - English song title"
 346         if restricted and result.startswith('-_'):
 347             result = result[2:]
 348         if not result:
 349             result = '_'
 350     return result
 351
 352 def orderedSet(iterable):
 353     """ Remove all duplicates from the input iterable """
 354     res = []
 355     for el in iterable:
 356         if el not in res:
 357             res.append(el)
 358     return res
 359
 360 def unescapeHTML(s):
 361     """
 362     @param s a string
 363     """
 364     assert type(s) == type(u'')
 365
 366     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 367     return result
 368
 369 def encodeFilename(s):
 370     """
 371     @param s The name of the file
 372     """
 373
 374     assert type(s) == type(u'')
 375
 376     # Python 3 has a Unicode API
 377     if sys.version_info >= (3, 0):
 378         return s
 379
 380     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 381         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 382         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 383         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 384         return s
 385     else:
 386         return s.encode(sys.getfilesystemencoding(), 'ignore')
 387
 388 class DownloadError(Exception):
 389     """Download Error exception.
 390
 391     This exception may be thrown by FileDownloader objects if they are not
 392     configured to continue on errors. They will contain the appropriate
 393     error message.
 394     """
 395     pass
 396
 397
 398 class SameFileError(Exception):
 399     """Same File exception.
 400
 401     This exception will be thrown by FileDownloader objects if they detect
 402     multiple files would have to be downloaded to the same file on disk.
 403     """
 404     pass
 405
 406
 407 class PostProcessingError(Exception):
 408     """Post Processing exception.
 409
 410     This exception may be raised by PostProcessor's .run() method to
 411     indicate an error in the postprocessing task.
 412     """
 413     pass
 414
 415 class MaxDownloadsReached(Exception):
 416     """ --max-downloads limit has been reached. """
 417     pass
 418
 419
 420 class UnavailableVideoError(Exception):
 421     """Unavailable Format exception.
 422
 423     This exception will be thrown when a video is requested
 424     in a format that is not available for that video.
 425     """
 426     pass
 427
 428
 429 class ContentTooShortError(Exception):
 430     """Content Too Short exception.
 431
 432     This exception may be raised by FileDownloader objects when a file they
 433     download is too small for what the server announced first, indicating
 434     the connection was probably interrupted.
 435     """
 436     # Both in bytes
 437     downloaded = None
 438     expected = None
 439
 440     def __init__(self, downloaded, expected):
 441         self.downloaded = downloaded
 442         self.expected = expected
 443
 444
 445 class Trouble(Exception):
 446     """Trouble helper exception
 447
 448     This is an exception to be handled with
 449     FileDownloader.trouble
 450     """
 451
 452 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 453     """Handler for HTTP requests and responses.
 454
 455     This class, when installed with an OpenerDirector, automatically adds
 456     the standard headers to every HTTP request and handles gzipped and
 457     deflated responses from web servers. If compression is to be avoided in
 458     a particular request, the original request in the program code only has
 459     to include the HTTP header "Youtubedl-No-Compression", which will be
 460     removed before making the real request.
 461
 462     Part of this code was copied from:
 463
 464     http://techknack.net/python-urllib2-handlers/
 465
 466     Andrew Rowls, the author of that code, agreed to release it to the
 467     public domain.
 468     """
 469
 470     @staticmethod
 471     def deflate(data):
 472         try:
 473             return zlib.decompress(data, -zlib.MAX_WBITS)
 474         except zlib.error:
 475             return zlib.decompress(data)
 476
 477     @staticmethod
 478     def addinfourl_wrapper(stream, headers, url, code):
 479         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 480             return compat_urllib_request.addinfourl(stream, headers, url, code)
 481         ret = compat_urllib_request.addinfourl(stream, headers, url)
 482         ret.code = code
 483         return ret
 484
 485     def http_request(self, req):
 486         for h in std_headers:
 487             if h in req.headers:
 488                 del req.headers[h]
 489             req.add_header(h, std_headers[h])
 490         if 'Youtubedl-no-compression' in req.headers:
 491             if 'Accept-encoding' in req.headers:
 492                 del req.headers['Accept-encoding']
 493             del req.headers['Youtubedl-no-compression']
 494         return req
 495
 496     def http_response(self, req, resp):
 497         old_resp = resp
 498         # gzip
 499         if resp.headers.get('Content-encoding', '') == 'gzip':
 500             gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 501             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 502             resp.msg = old_resp.msg
 503         # deflate
 504         if resp.headers.get('Content-encoding', '') == 'deflate':
 505             gz = io.BytesIO(self.deflate(resp.read()))
 506             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 507             resp.msg = old_resp.msg
 508         return resp
 509
 510     https_request = http_request
 511     https_response = http_response