_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import io
   6 import locale
   7 import os
   8 import re
   9 import sys
  10 import zlib
  11 import email.utils
  12 import json
  13
  14 try:
  15         import urllib.request as compat_urllib_request
  16 except ImportError: # Python 2
  17         import urllib2 as compat_urllib_request
  18
  19 try:
  20         import urllib.error as compat_urllib_error
  21 except ImportError: # Python 2
  22         import urllib2 as compat_urllib_error
  23
  24 try:
  25         import urllib.parse as compat_urllib_parse
  26 except ImportError: # Python 2
  27         import urllib as compat_urllib_parse
  28
  29 try:
  30         import http.cookiejar as compat_cookiejar
  31 except ImportError: # Python 2
  32         import cookielib as compat_cookiejar
  33
  34 try:
  35         import html.entities as compat_html_entities
  36 except NameError: # Python 2
  37         import htmlentitydefs as compat_html_entities
  38
  39 try:
  40         import html.parser as compat_html_parser
  41 except NameError: # Python 2
  42         import HTMLParser as compat_html_parser
  43
  44 try:
  45         import http.client as compat_html_client
  46 except NameError: # Python 2
  47         import httplib as compat_html_client
  48
  49
# Text string type: use ``unicode`` where that builtin exists; if looking it
# up raises NameError, fall back to the builtin ``str``.
try:
        compat_str = unicode # Python 2
except NameError:
        compat_str = str

# Code point -> character function: use ``unichr`` where that builtin exists;
# if looking it up raises NameError, fall back to the builtin ``chr``.
try:
        compat_chr = unichr # Python 2
except NameError:
        compat_chr = chr
  59
  60
# Standard HTTP request headers.  YoutubeDLHandler.http_request() sets every
# entry of this dict on each request it processes, replacing any header of
# the same name that the request already carries.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  68 def preferredencoding():
  69         """Get preferred encoding.
  70
  71         Returns the best encoding scheme for the system, based on
  72         locale.getpreferredencoding() and some further tweaks.
  73         """
  74         try:
  75                 pref = locale.getpreferredencoding()
  76                 u'TEST'.encode(pref)
  77         except:
  78                 pref = 'UTF-8'
  79
  80         return pref
  81
  82
  83 def htmlentity_transform(matchobj):
  84         """Transforms an HTML entity to a character.
  85
  86         This function receives a match object and is intended to be used with
  87         the re.sub() function.
  88         """
  89         entity = matchobj.group(1)
  90
  91         # Known non-numeric HTML entity
  92         if entity in compat_html_entities.name2codepoint:
  93                 return compat_chr(compat_html_entities.name2codepoint[entity])
  94
  95         mobj = re.match(u'(?u)#(x?\\d+)', entity)
  96         if mobj is not None:
  97                 numstr = mobj.group(1)
  98                 if numstr.startswith(u'x'):
  99                         base = 16
 100                         numstr = u'0%s' % numstr
 101                 else:
 102                         base = 10
 103                 return compat_chr(int(numstr, base))
 104
 105         # Unknown entity in name, return its literal representation
 106         return (u'&%s;' % entity)
 107
# Overwrite the HTML parser module's module-level ``locatestarttagend``
# pattern with the compiled expression below; the original author marks this
# as a backported bugfix.  The pattern describes an opening tag: tag name,
# then optional attributes whose values may be single-quoted, double-quoted
# or unquoted.
# NOTE(review): presumably the stdlib parser reads this module attribute when
# scanning start tags -- confirm for the Python versions being targeted.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 109 class IDParser(compat_html_parser.HTMLParser):
 110         """Modified HTMLParser that isolates a tag with the specified id"""
 111         def __init__(self, id):
 112                 self.id = id
 113                 self.result = None
 114                 self.started = False
 115                 self.depth = {}
 116                 self.html = None
 117                 self.watch_startpos = False
 118                 self.error_count = 0
 119                 compat_html_parser.HTMLParser.__init__(self)
 120
 121         def error(self, message):
 122                 if self.error_count > 10 or self.started:
 123                         raise compat_html_parser.HTMLParseError(message, self.getpos())
 124                 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 125                 self.error_count += 1
 126                 self.goahead(1)
 127
 128         def loads(self, html):
 129                 self.html = html
 130                 self.feed(html)
 131                 self.close()
 132
 133         def handle_starttag(self, tag, attrs):
 134                 attrs = dict(attrs)
 135                 if self.started:
 136                         self.find_startpos(None)
 137                 if 'id' in attrs and attrs['id'] == self.id:
 138                         self.result = [tag]
 139                         self.started = True
 140                         self.watch_startpos = True
 141                 if self.started:
 142                         if not tag in self.depth: self.depth[tag] = 0
 143                         self.depth[tag] += 1
 144
 145         def handle_endtag(self, tag):
 146                 if self.started:
 147                         if tag in self.depth: self.depth[tag] -= 1
 148                         if self.depth[self.result[0]] == 0:
 149                                 self.started = False
 150                                 self.result.append(self.getpos())
 151
 152         def find_startpos(self, x):
 153                 """Needed to put the start position of the result (self.result[1])
 154                 after the opening tag with the requested id"""
 155                 if self.watch_startpos:
 156                         self.watch_startpos = False
 157                         self.result.append(self.getpos())
 158         handle_entityref = handle_charref = handle_data = handle_comment = \
 159         handle_decl = handle_pi = unknown_decl = find_startpos
 160
 161         def get_result(self):
 162                 if self.result is None:
 163                         return None
 164                 if len(self.result) != 3:
 165                         return None
 166                 lines = self.html.split('\n')
 167                 lines = lines[self.result[1][0]-1:self.result[2][0]]
 168                 lines[0] = lines[0][self.result[1][1]:]
 169                 if len(lines) == 1:
 170                         lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 171                 lines[-1] = lines[-1][:self.result[2][1]]
 172                 return '\n'.join(lines).strip()
 173
 174 def get_element_by_id(id, html):
 175         """Return the content of the tag with the specified id in the passed HTML document"""
 176         parser = IDParser(id)
 177         try:
 178                 parser.loads(html)
 179         except compat_html_parser.HTMLParseError:
 180                 pass
 181         return parser.get_result()
 182
 183
 184 def clean_html(html):
 185         """Clean an HTML snippet into a readable string"""
 186         # Newline vs <br />
 187         html = html.replace('\n', ' ')
 188         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 189         # Strip html tags
 190         html = re.sub('<.*?>', '', html)
 191         # Replace html entities
 192         html = unescapeHTML(html)
 193         return html
 194
 195
 196 def sanitize_open(filename, open_mode):
 197         """Try to open the given filename, and slightly tweak it if this fails.
 198
 199         Attempts to open the given filename. If this fails, it tries to change
 200         the filename slightly, step by step, until it's either able to open it
 201         or it fails and raises a final exception, like the standard open()
 202         function.
 203
 204         It returns the tuple (stream, definitive_file_name).
 205         """
 206         try:
 207                 if filename == u'-':
 208                         if sys.platform == 'win32':
 209                                 import msvcrt
 210                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 211                         return (sys.stdout, filename)
 212                 stream = open(encodeFilename(filename), open_mode)
 213                 return (stream, filename)
 214         except (IOError, OSError) as err:
 215                 # In case of error, try to remove win32 forbidden chars
 216                 filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
 217
 218                 # An exception here should be caught in the caller
 219                 stream = open(encodeFilename(filename), open_mode)
 220                 return (stream, filename)
 221
 222
 223 def timeconvert(timestr):
 224         """Convert RFC 2822 defined time string into system timestamp"""
 225         timestamp = None
 226         timetuple = email.utils.parsedate_tz(timestr)
 227         if timetuple is not None:
 228                 timestamp = email.utils.mktime_tz(timetuple)
 229         return timestamp
 230
 231 def sanitize_filename(s, restricted=False):
 232         """Sanitizes a string so it could be used as part of a filename.
 233         If restricted is set, use a stricter subset of allowed characters.
 234         """
 235         def replace_insane(char):
 236                 if char == '?' or ord(char) < 32 or ord(char) == 127:
 237                         return ''
 238                 elif char == '"':
 239                         return '' if restricted else '\''
 240                 elif char == ':':
 241                         return '_-' if restricted else ' -'
 242                 elif char in '\\/|*<>':
 243                         return '_'
 244                 if restricted and (char in '!&\'' or char.isspace()):
 245                         return '_'
 246                 if restricted and ord(char) > 127:
 247                         return '_'
 248                 return char
 249
 250         result = u''.join(map(replace_insane, s))
 251         while '__' in result:
 252                 result = result.replace('__', '_')
 253         result = result.strip('_')
 254         # Common case of "Foreign band name - English song title"
 255         if restricted and result.startswith('-_'):
 256                 result = result[2:]
 257         if not result:
 258                 result = '_'
 259         return result
 260
 261 def orderedSet(iterable):
 262         """ Remove all duplicates from the input iterable """
 263         res = []
 264         for el in iterable:
 265                 if el not in res:
 266                         res.append(el)
 267         return res
 268
 269 def unescapeHTML(s):
 270         """
 271         @param s a string
 272         """
 273         assert type(s) == type(u'')
 274
 275         result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 276         return result
 277
 278 def encodeFilename(s):
 279         """
 280         @param s The name of the file
 281         """
 282
 283         assert type(s) == type(u'')
 284
 285         if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 286                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 287                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 288                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 289                 return s
 290         else:
 291                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 292
 293 class DownloadError(Exception):
 294         """Download Error exception.
 295
 296         This exception may be thrown by FileDownloader objects if they are not
 297         configured to continue on errors. They will contain the appropriate
 298         error message.
 299         """
 300         pass
 301
 302
 303 class SameFileError(Exception):
 304         """Same File exception.
 305
 306         This exception will be thrown by FileDownloader objects if they detect
 307         multiple files would have to be downloaded to the same file on disk.
 308         """
 309         pass
 310
 311
 312 class PostProcessingError(Exception):
 313         """Post Processing exception.
 314
 315         This exception may be raised by PostProcessor's .run() method to
 316         indicate an error in the postprocessing task.
 317         """
 318         pass
 319
 320 class MaxDownloadsReached(Exception):
 321         """ --max-downloads limit has been reached. """
 322         pass
 323
 324
 325 class UnavailableVideoError(Exception):
 326         """Unavailable Format exception.
 327
 328         This exception will be thrown when a video is requested
 329         in a format that is not available for that video.
 330         """
 331         pass
 332
 333
 334 class ContentTooShortError(Exception):
 335         """Content Too Short exception.
 336
 337         This exception may be raised by FileDownloader objects when a file they
 338         download is too small for what the server announced first, indicating
 339         the connection was probably interrupted.
 340         """
 341         # Both in bytes
 342         downloaded = None
 343         expected = None
 344
 345         def __init__(self, downloaded, expected):
 346                 self.downloaded = downloaded
 347                 self.expected = expected
 348
 349
 350 class Trouble(Exception):
 351         """Trouble helper exception
 352
 353         This is an exception to be handled with
 354         FileDownloader.trouble
 355         """
 356
 357 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 358         """Handler for HTTP requests and responses.
 359
 360         This class, when installed with an OpenerDirector, automatically adds
 361         the standard headers to every HTTP request and handles gzipped and
 362         deflated responses from web servers. If compression is to be avoided in
 363         a particular request, the original request in the program code only has
 364         to include the HTTP header "Youtubedl-No-Compression", which will be
 365         removed before making the real request.
 366
 367         Part of this code was copied from:
 368
 369         http://techknack.net/python-urllib2-handlers/
 370
 371         Andrew Rowls, the author of that code, agreed to release it to the
 372         public domain.
 373         """
 374
 375         @staticmethod
 376         def deflate(data):
 377                 try:
 378                         return zlib.decompress(data, -zlib.MAX_WBITS)
 379                 except zlib.error:
 380                         return zlib.decompress(data)
 381
 382         @staticmethod
 383         def addinfourl_wrapper(stream, headers, url, code):
 384                 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 385                         return compat_urllib_request.addinfourl(stream, headers, url, code)
 386                 ret = compat_urllib_request.addinfourl(stream, headers, url)
 387                 ret.code = code
 388                 return ret
 389
 390         def http_request(self, req):
 391                 for h in std_headers:
 392                         if h in req.headers:
 393                                 del req.headers[h]
 394                         req.add_header(h, std_headers[h])
 395                 if 'Youtubedl-no-compression' in req.headers:
 396                         if 'Accept-encoding' in req.headers:
 397                                 del req.headers['Accept-encoding']
 398                         del req.headers['Youtubedl-no-compression']
 399                 return req
 400
 401         def http_response(self, req, resp):
 402                 old_resp = resp
 403                 # gzip
 404                 if resp.headers.get('Content-encoding', '') == 'gzip':
 405                         gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 406                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 407                         resp.msg = old_resp.msg
 408                 # deflate
 409                 if resp.headers.get('Content-encoding', '') == 'deflate':
 410                         gz = io.BytesIO(self.deflate(resp.read()))
 411                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 412                         resp.msg = old_resp.msg
 413                 return resp