_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import io
   6 import locale
   7 import os
   8 import re
   9 import sys
  10 import zlib
  11 import email.utils
  12 import json
  13
  14 try:
  15         import urllib.request as compat_urllib_request
  16 except ImportError: # Python 2
  17         import urllib2 as compat_urllib_request
  18
  19 try:
  20         import urllib.error as compat_urllib_error
  21 except ImportError: # Python 2
  22         import urllib2 as compat_urllib_error
  23
  24 try:
  25         import urllib.parse as compat_urllib_parse
  26 except ImportError: # Python 2
  27         import urllib as compat_urllib_parse
  28
  29 try:
  30         import http.cookiejar as compat_cookiejar
  31 except ImportError: # Python 2
  32         import cookielib as compat_cookiejar
  33
  34 try:
  35         import html.entities as compat_html_entities
  36 except ImportError: # Python 2
  37         import htmlentitydefs as compat_html_entities
  38
  39 try:
  40         import html.parser as compat_html_parser
  41 except ImportError: # Python 2
  42         import HTMLParser as compat_html_parser
  43
  44 try:
  45         import http.client as compat_http_client
  46 except ImportError: # Python 2
  47         import httplib as compat_http_client
  48
  49 try:
  50         from urllib.parse import parse_qs as compat_parse_qs
  51 except ImportError: # Python 2
  52         from urlparse import parse_qs as compat_parse_qs
  53
# Text string type: the ``unicode`` builtin where it exists (Python 2),
# otherwise ``str``. Looking up the missing name raises NameError, which
# selects the fallback.
try:
        compat_str = unicode # Python 2
except NameError:
        compat_str = str

# Code point -> one-character text string: the ``unichr`` builtin where it
# exists (Python 2), otherwise ``chr``.
try:
        compat_chr = unichr # Python 2
except NameError:
        compat_chr = chr
  63
# Default HTTP request headers. YoutubeDLHandler.http_request() adds every
# entry of this dict to each request it processes.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  71 def preferredencoding():
  72         """Get preferred encoding.
  73
  74         Returns the best encoding scheme for the system, based on
  75         locale.getpreferredencoding() and some further tweaks.
  76         """
  77         try:
  78                 pref = locale.getpreferredencoding()
  79                 u'TEST'.encode(pref)
  80         except:
  81                 pref = 'UTF-8'
  82
  83         return pref
  84
  85 if sys.version_info < (3,0):
  86         def compat_print(s):
  87                 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
  88 else:
  89         def compat_print(s):
  90                 print(s)
  91
  92 def htmlentity_transform(matchobj):
  93         """Transforms an HTML entity to a character.
  94
  95         This function receives a match object and is intended to be used with
  96         the re.sub() function.
  97         """
  98         entity = matchobj.group(1)
  99
 100         # Known non-numeric HTML entity
 101         if entity in compat_html_entities.name2codepoint:
 102                 return compat_chr(compat_html_entities.name2codepoint[entity])
 103
 104         mobj = re.match(u'(?u)#(x?\\d+)', entity)
 105         if mobj is not None:
 106                 numstr = mobj.group(1)
 107                 if numstr.startswith(u'x'):
 108                         base = 16
 109                         numstr = u'0%s' % numstr
 110                 else:
 111                         base = 10
 112                 return compat_chr(int(numstr, base))
 113
 114         # Unknown entity in name, return its literal representation
 115         return (u'&%s;' % entity)
 116
# Replace the module-level start-tag regex of the HTML parser module with the
# pattern below (done once, at import time of this module).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(compat_html_parser.HTMLParser):
        """Modified HTMLParser that isolates a tag with the specified id

        Usage: construct with the wanted id, call loads(html), then read the
        outcome with get_result().

        While parsing, self.result grows into [tag, start_position,
        end_position], where the positions are the values returned by
        getpos() at the relevant callbacks.
        """
        def __init__(self, id):
                self.id = id                  # value of the id attribute to look for
                self.result = None            # becomes [tag, startpos, endpos] once found
                self.started = False          # True while inside the matching element
                self.depth = {}               # open-tag counter per tag name while started
                self.html = None              # full document text, set by loads()
                self.watch_startpos = False   # True until the content start position is recorded
                self.error_count = 0          # number of parse errors recovered from so far
                compat_html_parser.HTMLParser.__init__(self)

        def error(self, message):
                """Recover from a parse error, or raise HTMLParseError.

                Raises once more than 10 errors were recovered or when the
                matching element has already been entered. Otherwise parsing
                restarts from the document lines after the current one.
                """
                if self.error_count > 10 or self.started:
                        raise compat_html_parser.HTMLParseError(message, self.getpos())
                # getpos()[0] is used as a 0-based index into the list of lines,
                # so the slice starts after the current line
                self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
                self.error_count += 1
                self.goahead(1)

        def loads(self, html):
                """Parse the complete document html in one go."""
                self.html = html
                self.feed(html)
                self.close()

        def handle_starttag(self, tag, attrs):
                attrs = dict(attrs)
                if self.started:
                        # A nested tag is also the first thing after the opening
                        # tag if no start position was recorded yet
                        self.find_startpos(None)
                if 'id' in attrs and attrs['id'] == self.id:
                        self.result = [tag]
                        self.started = True
                        self.watch_startpos = True
                if self.started:
                        # Count open tags per name so that the matching end tag
                        # can be told apart from nested ones of the same name
                        if not tag in self.depth: self.depth[tag] = 0
                        self.depth[tag] += 1

        def handle_endtag(self, tag):
                if self.started:
                        if tag in self.depth: self.depth[tag] -= 1
                        # All tags with the matched element's name are closed:
                        # record the end position and stop tracking
                        if self.depth[self.result[0]] == 0:
                                self.started = False
                                self.result.append(self.getpos())

        def find_startpos(self, x):
                """Needed to put the start position of the result (self.result[1])
                after the opening tag with the requested id"""
                if self.watch_startpos:
                        self.watch_startpos = False
                        self.result.append(self.getpos())
        # Every other parser callback just records the start position (once)
        handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

        def get_result(self):
                """Return the stripped text between the recorded start and end
                positions, or None unless self.result holds exactly
                [tag, startpos, endpos]."""
                if self.result is None:
                        return None
                if len(self.result) != 3:
                        return None
                lines = self.html.split('\n')
                # The first item of a position is used as a 1-based line number
                lines = lines[self.result[1][0]-1:self.result[2][0]]
                # The second item of a position is used as a column offset
                lines[0] = lines[0][self.result[1][1]:]
                if len(lines) == 1:
                        # Start and end are on the same line: the end offset must
                        # be taken relative to the already cut line start
                        lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
                lines[-1] = lines[-1][:self.result[2][1]]
                return '\n'.join(lines).strip()
 182
 183 def get_element_by_id(id, html):
 184         """Return the content of the tag with the specified id in the passed HTML document"""
 185         parser = IDParser(id)
 186         try:
 187                 parser.loads(html)
 188         except compat_html_parser.HTMLParseError:
 189                 pass
 190         return parser.get_result()
 191
 192
 193 def clean_html(html):
 194         """Clean an HTML snippet into a readable string"""
 195         # Newline vs <br />
 196         html = html.replace('\n', ' ')
 197         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 198         # Strip html tags
 199         html = re.sub('<.*?>', '', html)
 200         # Replace html entities
 201         html = unescapeHTML(html)
 202         return html
 203
 204
 205 def sanitize_open(filename, open_mode):
 206         """Try to open the given filename, and slightly tweak it if this fails.
 207
 208         Attempts to open the given filename. If this fails, it tries to change
 209         the filename slightly, step by step, until it's either able to open it
 210         or it fails and raises a final exception, like the standard open()
 211         function.
 212
 213         It returns the tuple (stream, definitive_file_name).
 214         """
 215         try:
 216                 if filename == u'-':
 217                         if sys.platform == 'win32':
 218                                 import msvcrt
 219                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 220                         return (sys.stdout, filename)
 221                 stream = open(encodeFilename(filename), open_mode)
 222                 return (stream, filename)
 223         except (IOError, OSError) as err:
 224                 # In case of error, try to remove win32 forbidden chars
 225                 filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
 226
 227                 # An exception here should be caught in the caller
 228                 stream = open(encodeFilename(filename), open_mode)
 229                 return (stream, filename)
 230
 231
 232 def timeconvert(timestr):
 233         """Convert RFC 2822 defined time string into system timestamp"""
 234         timestamp = None
 235         timetuple = email.utils.parsedate_tz(timestr)
 236         if timetuple is not None:
 237                 timestamp = email.utils.mktime_tz(timetuple)
 238         return timestamp
 239
 240 def sanitize_filename(s, restricted=False):
 241         """Sanitizes a string so it could be used as part of a filename.
 242         If restricted is set, use a stricter subset of allowed characters.
 243         """
 244         def replace_insane(char):
 245                 if char == '?' or ord(char) < 32 or ord(char) == 127:
 246                         return ''
 247                 elif char == '"':
 248                         return '' if restricted else '\''
 249                 elif char == ':':
 250                         return '_-' if restricted else ' -'
 251                 elif char in '\\/|*<>':
 252                         return '_'
 253                 if restricted and (char in '!&\'' or char.isspace()):
 254                         return '_'
 255                 if restricted and ord(char) > 127:
 256                         return '_'
 257                 return char
 258
 259         result = u''.join(map(replace_insane, s))
 260         while '__' in result:
 261                 result = result.replace('__', '_')
 262         result = result.strip('_')
 263         # Common case of "Foreign band name - English song title"
 264         if restricted and result.startswith('-_'):
 265                 result = result[2:]
 266         if not result:
 267                 result = '_'
 268         return result
 269
 270 def orderedSet(iterable):
 271         """ Remove all duplicates from the input iterable """
 272         res = []
 273         for el in iterable:
 274                 if el not in res:
 275                         res.append(el)
 276         return res
 277
 278 def unescapeHTML(s):
 279         """
 280         @param s a string
 281         """
 282         assert type(s) == type(u'')
 283
 284         result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 285         return result
 286
 287 def encodeFilename(s):
 288         """
 289         @param s The name of the file
 290         """
 291
 292         assert type(s) == type(u'')
 293
 294         if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 295                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 296                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 297                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 298                 return s
 299         else:
 300                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 301
 302 class DownloadError(Exception):
 303         """Download Error exception.
 304
 305         This exception may be thrown by FileDownloader objects if they are not
 306         configured to continue on errors. They will contain the appropriate
 307         error message.
 308         """
 309         pass
 310
 311
 312 class SameFileError(Exception):
 313         """Same File exception.
 314
 315         This exception will be thrown by FileDownloader objects if they detect
 316         multiple files would have to be downloaded to the same file on disk.
 317         """
 318         pass
 319
 320
 321 class PostProcessingError(Exception):
 322         """Post Processing exception.
 323
 324         This exception may be raised by PostProcessor's .run() method to
 325         indicate an error in the postprocessing task.
 326         """
 327         pass
 328
 329 class MaxDownloadsReached(Exception):
 330         """ --max-downloads limit has been reached. """
 331         pass
 332
 333
 334 class UnavailableVideoError(Exception):
 335         """Unavailable Format exception.
 336
 337         This exception will be thrown when a video is requested
 338         in a format that is not available for that video.
 339         """
 340         pass
 341
 342
 343 class ContentTooShortError(Exception):
 344         """Content Too Short exception.
 345
 346         This exception may be raised by FileDownloader objects when a file they
 347         download is too small for what the server announced first, indicating
 348         the connection was probably interrupted.
 349         """
 350         # Both in bytes
 351         downloaded = None
 352         expected = None
 353
 354         def __init__(self, downloaded, expected):
 355                 self.downloaded = downloaded
 356                 self.expected = expected
 357
 358
 359 class Trouble(Exception):
 360         """Trouble helper exception
 361
 362         This is an exception to be handled with
 363         FileDownloader.trouble
 364         """
 365
 366 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 367         """Handler for HTTP requests and responses.
 368
 369         This class, when installed with an OpenerDirector, automatically adds
 370         the standard headers to every HTTP request and handles gzipped and
 371         deflated responses from web servers. If compression is to be avoided in
 372         a particular request, the original request in the program code only has
 373         to include the HTTP header "Youtubedl-No-Compression", which will be
 374         removed before making the real request.
 375
 376         Part of this code was copied from:
 377
 378         http://techknack.net/python-urllib2-handlers/
 379
 380         Andrew Rowls, the author of that code, agreed to release it to the
 381         public domain.
 382         """
 383
 384         @staticmethod
 385         def deflate(data):
 386                 try:
 387                         return zlib.decompress(data, -zlib.MAX_WBITS)
 388                 except zlib.error:
 389                         return zlib.decompress(data)
 390
 391         @staticmethod
 392         def addinfourl_wrapper(stream, headers, url, code):
 393                 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 394                         return compat_urllib_request.addinfourl(stream, headers, url, code)
 395                 ret = compat_urllib_request.addinfourl(stream, headers, url)
 396                 ret.code = code
 397                 return ret
 398
 399         def http_request(self, req):
 400                 for h in std_headers:
 401                         if h in req.headers:
 402                                 del req.headers[h]
 403                         req.add_header(h, std_headers[h])
 404                 if 'Youtubedl-no-compression' in req.headers:
 405                         if 'Accept-encoding' in req.headers:
 406                                 del req.headers['Accept-encoding']
 407                         del req.headers['Youtubedl-no-compression']
 408                 return req
 409
 410         def http_response(self, req, resp):
 411                 old_resp = resp
 412                 # gzip
 413                 if resp.headers.get('Content-encoding', '') == 'gzip':
 414                         gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 415                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 416                         resp.msg = old_resp.msg
 417                 # deflate
 418                 if resp.headers.get('Content-encoding', '') == 'deflate':
 419                         gz = io.BytesIO(self.deflate(resp.read()))
 420                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 421                         resp.msg = old_resp.msg
 422                 return resp