_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import io
   6 import locale
   7 import os
   8 import re
   9 import sys
  10 import zlib
  11 import email.utils
  12 import json
  13
  14 try:
  15     import urllib.request as compat_urllib_request
  16 except ImportError: # Python 2
  17     import urllib2 as compat_urllib_request
  18
  19 try:
  20     import urllib.error as compat_urllib_error
  21 except ImportError: # Python 2
  22     import urllib2 as compat_urllib_error
  23
  24 try:
  25     import urllib.parse as compat_urllib_parse
  26 except ImportError: # Python 2
  27     import urllib as compat_urllib_parse
  28
  29 try:
  30     import http.cookiejar as compat_cookiejar
  31 except ImportError: # Python 2
  32     import cookielib as compat_cookiejar
  33
  34 try:
  35     import html.entities as compat_html_entities
  36 except ImportError: # Python 2
  37     import htmlentitydefs as compat_html_entities
  38
  39 try:
  40     import html.parser as compat_html_parser
  41 except ImportError: # Python 2
  42     import HTMLParser as compat_html_parser
  43
  44 try:
  45     import http.client as compat_http_client
  46 except ImportError: # Python 2
  47     import httplib as compat_http_client
  48
  49 try:
  50     from urllib.parse import parse_qs as compat_parse_qs
  51 except ImportError: # Python 2
  52     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  53     # Python 2's version is apparently totally broken
  54     def _unquote(string, encoding='utf-8', errors='replace'):
  55         if string == '':
  56             return string
  57         res = string.split('%')
  58         if len(res) == 1:
  59             return string
  60         if encoding is None:
  61             encoding = 'utf-8'
  62         if errors is None:
  63             errors = 'replace'
  64         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
  65         pct_sequence = b''
  66         string = res[0]
  67         for item in res[1:]:
  68             try:
  69                 if not item:
  70                     raise ValueError
  71                 pct_sequence += item[:2].decode('hex')
  72                 rest = item[2:]
  73                 if not rest:
  74                     # This segment was just a single percent-encoded character.
  75                     # May be part of a sequence of code units, so delay decoding.
  76                     # (Stored in pct_sequence).
  77                     continue
  78             except ValueError:
  79                 rest = '%' + item
  80             # Encountered non-percent-encoded characters. Flush the current
  81             # pct_sequence.
  82             string += pct_sequence.decode(encoding, errors) + rest
  83             pct_sequence = b''
  84         if pct_sequence:
  85             # Flush the final pct_sequence
  86             string += pct_sequence.decode(encoding, errors)
  87         return string
  88
  89     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
  90                 encoding='utf-8', errors='replace'):
  91         qs, _coerce_result = qs, unicode
  92         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
  93         r = []
  94         for name_value in pairs:
  95             if not name_value and not strict_parsing:
  96                 continue
  97             nv = name_value.split('=', 1)
  98             if len(nv) != 2:
  99                 if strict_parsing:
 100                     raise ValueError("bad query field: %r" % (name_value,))
 101                 # Handle case of a control-name with no equal sign
 102                 if keep_blank_values:
 103                     nv.append('')
 104                 else:
 105                     continue
 106             if len(nv[1]) or keep_blank_values:
 107                 name = nv[0].replace('+', ' ')
 108                 name = _unquote(name, encoding=encoding, errors=errors)
 109                 name = _coerce_result(name)
 110                 value = nv[1].replace('+', ' ')
 111                 value = _unquote(value, encoding=encoding, errors=errors)
 112                 value = _coerce_result(value)
 113                 r.append((name, value))
 114         return r
 115
 116     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 117                 encoding='utf-8', errors='replace'):
 118         parsed_result = {}
 119         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 120                         encoding=encoding, errors=errors)
 121         for name, value in pairs:
 122             if name in parsed_result:
 123                 parsed_result[name].append(value)
 124             else:
 125                 parsed_result[name] = [value]
 126         return parsed_result
 127
# compat_str: the text (unicode) string type. Python 2 provides it as
# `unicode`; where that name is undefined (NameError), `str` is used.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: code point -> one-character text string. Python 2 provides
# it as `unichr`; where that name is undefined (NameError), `chr` is used.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 137
# Default HTTP request headers. YoutubeDLHandler.http_request() sets every
# one of them on each request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 145 def preferredencoding():
 146     """Get preferred encoding.
 147
 148     Returns the best encoding scheme for the system, based on
 149     locale.getpreferredencoding() and some further tweaks.
 150     """
 151     try:
 152         pref = locale.getpreferredencoding()
 153         u'TEST'.encode(pref)
 154     except:
 155         pref = 'UTF-8'
 156
 157     return pref
 158
 159 if sys.version_info < (3,0):
 160     def compat_print(s):
 161         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 162 else:
 163     def compat_print(s):
 164         assert type(s) == type(u'')
 165         print(s)
 166
 167 def htmlentity_transform(matchobj):
 168     """Transforms an HTML entity to a character.
 169
 170     This function receives a match object and is intended to be used with
 171     the re.sub() function.
 172     """
 173     entity = matchobj.group(1)
 174
 175     # Known non-numeric HTML entity
 176     if entity in compat_html_entities.name2codepoint:
 177         return compat_chr(compat_html_entities.name2codepoint[entity])
 178
 179     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 180     if mobj is not None:
 181         numstr = mobj.group(1)
 182         if numstr.startswith(u'x'):
 183             base = 16
 184             numstr = u'0%s' % numstr
 185         else:
 186             base = 10
 187         return compat_chr(int(numstr, base))
 188
 189     # Unknown entity in name, return its literal representation
 190     return (u'&%s;' % entity)
 191
# Overrides the module-level `locatestarttagend` pattern of the HTML parser
# module with the regex below. The original author marks this as a
# backported bugfix (presumably taken from a newer stdlib release --
# TODO confirm which upstream fix this corresponds to).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 193 class IDParser(compat_html_parser.HTMLParser):
 194     """Modified HTMLParser that isolates a tag with the specified id"""
 195     def __init__(self, id):
 196         self.id = id
 197         self.result = None
 198         self.started = False
 199         self.depth = {}
 200         self.html = None
 201         self.watch_startpos = False
 202         self.error_count = 0
 203         compat_html_parser.HTMLParser.__init__(self)
 204
 205     def error(self, message):
 206         if self.error_count > 10 or self.started:
 207             raise compat_html_parser.HTMLParseError(message, self.getpos())
 208         self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 209         self.error_count += 1
 210         self.goahead(1)
 211
 212     def loads(self, html):
 213         self.html = html
 214         self.feed(html)
 215         self.close()
 216
 217     def handle_starttag(self, tag, attrs):
 218         attrs = dict(attrs)
 219         if self.started:
 220             self.find_startpos(None)
 221         if 'id' in attrs and attrs['id'] == self.id:
 222             self.result = [tag]
 223             self.started = True
 224             self.watch_startpos = True
 225         if self.started:
 226             if not tag in self.depth: self.depth[tag] = 0
 227             self.depth[tag] += 1
 228
 229     def handle_endtag(self, tag):
 230         if self.started:
 231             if tag in self.depth: self.depth[tag] -= 1
 232             if self.depth[self.result[0]] == 0:
 233                 self.started = False
 234                 self.result.append(self.getpos())
 235
 236     def find_startpos(self, x):
 237         """Needed to put the start position of the result (self.result[1])
 238         after the opening tag with the requested id"""
 239         if self.watch_startpos:
 240             self.watch_startpos = False
 241             self.result.append(self.getpos())
 242     handle_entityref = handle_charref = handle_data = handle_comment = \
 243     handle_decl = handle_pi = unknown_decl = find_startpos
 244
 245     def get_result(self):
 246         if self.result is None:
 247             return None
 248         if len(self.result) != 3:
 249             return None
 250         lines = self.html.split('\n')
 251         lines = lines[self.result[1][0]-1:self.result[2][0]]
 252         lines[0] = lines[0][self.result[1][1]:]
 253         if len(lines) == 1:
 254             lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 255         lines[-1] = lines[-1][:self.result[2][1]]
 256         return '\n'.join(lines).strip()
 257
 258 def get_element_by_id(id, html):
 259     """Return the content of the tag with the specified id in the passed HTML document"""
 260     parser = IDParser(id)
 261     try:
 262         parser.loads(html)
 263     except compat_html_parser.HTMLParseError:
 264         pass
 265     return parser.get_result()
 266
 267
 268 def clean_html(html):
 269     """Clean an HTML snippet into a readable string"""
 270     # Newline vs <br />
 271     html = html.replace('\n', ' ')
 272     html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 273     # Strip html tags
 274     html = re.sub('<.*?>', '', html)
 275     # Replace html entities
 276     html = unescapeHTML(html)
 277     return html
 278
 279
 280 def sanitize_open(filename, open_mode):
 281     """Try to open the given filename, and slightly tweak it if this fails.
 282
 283     Attempts to open the given filename. If this fails, it tries to change
 284     the filename slightly, step by step, until it's either able to open it
 285     or it fails and raises a final exception, like the standard open()
 286     function.
 287
 288     It returns the tuple (stream, definitive_file_name).
 289     """
 290     try:
 291         if filename == u'-':
 292             if sys.platform == 'win32':
 293                 import msvcrt
 294                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 295             return (sys.stdout, filename)
 296         stream = open(encodeFilename(filename), open_mode)
 297         return (stream, filename)
 298     except (IOError, OSError) as err:
 299         # In case of error, try to remove win32 forbidden chars
 300         filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
 301
 302         # An exception here should be caught in the caller
 303         stream = open(encodeFilename(filename), open_mode)
 304         return (stream, filename)
 305
 306
 307 def timeconvert(timestr):
 308     """Convert RFC 2822 defined time string into system timestamp"""
 309     timestamp = None
 310     timetuple = email.utils.parsedate_tz(timestr)
 311     if timetuple is not None:
 312         timestamp = email.utils.mktime_tz(timetuple)
 313     return timestamp
 314
 315 def sanitize_filename(s, restricted=False):
 316     """Sanitizes a string so it could be used as part of a filename.
 317     If restricted is set, use a stricter subset of allowed characters.
 318     """
 319     def replace_insane(char):
 320         if char == '?' or ord(char) < 32 or ord(char) == 127:
 321             return ''
 322         elif char == '"':
 323             return '' if restricted else '\''
 324         elif char == ':':
 325             return '_-' if restricted else ' -'
 326         elif char in '\\/|*<>':
 327             return '_'
 328         if restricted and (char in '!&\'' or char.isspace()):
 329             return '_'
 330         if restricted and ord(char) > 127:
 331             return '_'
 332         return char
 333
 334     result = u''.join(map(replace_insane, s))
 335     while '__' in result:
 336         result = result.replace('__', '_')
 337     result = result.strip('_')
 338     # Common case of "Foreign band name - English song title"
 339     if restricted and result.startswith('-_'):
 340         result = result[2:]
 341     if not result:
 342         result = '_'
 343     return result
 344
 345 def orderedSet(iterable):
 346     """ Remove all duplicates from the input iterable """
 347     res = []
 348     for el in iterable:
 349         if el not in res:
 350             res.append(el)
 351     return res
 352
 353 def unescapeHTML(s):
 354     """
 355     @param s a string
 356     """
 357     assert type(s) == type(u'')
 358
 359     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 360     return result
 361
 362 def encodeFilename(s):
 363     """
 364     @param s The name of the file
 365     """
 366
 367     assert type(s) == type(u'')
 368
 369     # Python 3 has a Unicode API
 370     if sys.version_info >= (3, 0):
 371         return s
 372
 373     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 374         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 375         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 376         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 377         return s
 378     else:
 379         return s.encode(sys.getfilesystemencoding(), 'ignore')
 380
 381 class DownloadError(Exception):
 382     """Download Error exception.
 383
 384     This exception may be thrown by FileDownloader objects if they are not
 385     configured to continue on errors. They will contain the appropriate
 386     error message.
 387     """
 388     pass
 389
 390
 391 class SameFileError(Exception):
 392     """Same File exception.
 393
 394     This exception will be thrown by FileDownloader objects if they detect
 395     multiple files would have to be downloaded to the same file on disk.
 396     """
 397     pass
 398
 399
 400 class PostProcessingError(Exception):
 401     """Post Processing exception.
 402
 403     This exception may be raised by PostProcessor's .run() method to
 404     indicate an error in the postprocessing task.
 405     """
 406     pass
 407
 408 class MaxDownloadsReached(Exception):
 409     """ --max-downloads limit has been reached. """
 410     pass
 411
 412
 413 class UnavailableVideoError(Exception):
 414     """Unavailable Format exception.
 415
 416     This exception will be thrown when a video is requested
 417     in a format that is not available for that video.
 418     """
 419     pass
 420
 421
 422 class ContentTooShortError(Exception):
 423     """Content Too Short exception.
 424
 425     This exception may be raised by FileDownloader objects when a file they
 426     download is too small for what the server announced first, indicating
 427     the connection was probably interrupted.
 428     """
 429     # Both in bytes
 430     downloaded = None
 431     expected = None
 432
 433     def __init__(self, downloaded, expected):
 434         self.downloaded = downloaded
 435         self.expected = expected
 436
 437
 438 class Trouble(Exception):
 439     """Trouble helper exception
 440
 441     This is an exception to be handled with
 442     FileDownloader.trouble
 443     """
 444
 445 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 446     """Handler for HTTP requests and responses.
 447
 448     This class, when installed with an OpenerDirector, automatically adds
 449     the standard headers to every HTTP request and handles gzipped and
 450     deflated responses from web servers. If compression is to be avoided in
 451     a particular request, the original request in the program code only has
 452     to include the HTTP header "Youtubedl-No-Compression", which will be
 453     removed before making the real request.
 454
 455     Part of this code was copied from:
 456
 457     http://techknack.net/python-urllib2-handlers/
 458
 459     Andrew Rowls, the author of that code, agreed to release it to the
 460     public domain.
 461     """
 462
 463     @staticmethod
 464     def deflate(data):
 465         try:
 466             return zlib.decompress(data, -zlib.MAX_WBITS)
 467         except zlib.error:
 468             return zlib.decompress(data)
 469
 470     @staticmethod
 471     def addinfourl_wrapper(stream, headers, url, code):
 472         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 473             return compat_urllib_request.addinfourl(stream, headers, url, code)
 474         ret = compat_urllib_request.addinfourl(stream, headers, url)
 475         ret.code = code
 476         return ret
 477
 478     def http_request(self, req):
 479         for h in std_headers:
 480             if h in req.headers:
 481                 del req.headers[h]
 482             req.add_header(h, std_headers[h])
 483         if 'Youtubedl-no-compression' in req.headers:
 484             if 'Accept-encoding' in req.headers:
 485                 del req.headers['Accept-encoding']
 486             del req.headers['Youtubedl-no-compression']
 487         return req
 488
 489     def http_response(self, req, resp):
 490         old_resp = resp
 491         # gzip
 492         if resp.headers.get('Content-encoding', '') == 'gzip':
 493             gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 494             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 495             resp.msg = old_resp.msg
 496         # deflate
 497         if resp.headers.get('Content-encoding', '') == 'deflate':
 498             gz = io.BytesIO(self.deflate(resp.read()))
 499             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 500             resp.msg = old_resp.msg
 501         return resp