_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import io
   6 import locale
   7 import os
   8 import re
   9 import sys
  10 import zlib
  11 import email.utils
  12 import json
  13
  14 try:
  15         import urllib.request as compat_urllib_request
  16 except ImportError: # Python 2
  17         import urllib2 as compat_urllib_request
  18
  19 try:
  20         import urllib.error as compat_urllib_error
  21 except ImportError: # Python 2
  22         import urllib2 as compat_urllib_error
  23
  24 try:
  25         import urllib.parse as compat_urllib_parse
  26 except ImportError: # Python 2
  27         import urllib as compat_urllib_parse
  28
  29 try:
  30         import http.cookiejar as compat_cookiejar
  31 except ImportError: # Python 2
  32         import cookielib as compat_cookiejar
  33
  34 try:
  35         import html.entities as compat_html_entities
  36 except ImportError: # Python 2
  37         import htmlentitydefs as compat_html_entities
  38
  39 try:
  40         import html.parser as compat_html_parser
  41 except ImportError: # Python 2
  42         import HTMLParser as compat_html_parser
  43
  44 try:
  45         import http.client as compat_http_client
  46 except ImportError: # Python 2
  47         import httplib as compat_http_client
  48
  49 try:
  50         from urllib.parse import parse_qs as compat_parse_qs
  51 except ImportError: # Python 2
  52         from urlparse import parse_qs as compat_parse_qs
  53
  54 try:
  55         compat_str = unicode # Python 2
  56 except NameError:
  57         compat_str = str
  58
  59 try:
  60         compat_chr = unichr # Python 2
  61 except NameError:
  62         compat_chr = chr
  63
  64 std_headers = {
  65         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
  66         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  67         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  68         'Accept-Encoding': 'gzip, deflate',
  69         'Accept-Language': 'en-us,en;q=0.5',
  70 }
  71 def preferredencoding():
  72         """Get preferred encoding.
  73
  74         Returns the best encoding scheme for the system, based on
  75         locale.getpreferredencoding() and some further tweaks.
  76         """
  77         try:
  78                 pref = locale.getpreferredencoding()
  79                 u'TEST'.encode(pref)
  80         except:
  81                 pref = 'UTF-8'
  82
  83         return pref
  84
  85 if sys.version_info < (3,0):
  86         def compat_print(s):
  87                 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
  88 else:
  89         def compat_print(s):
  90                 assert type(s) == type(u'')
  91                 print(s)
  92
  93 def htmlentity_transform(matchobj):
  94         """Transforms an HTML entity to a character.
  95
  96         This function receives a match object and is intended to be used with
  97         the re.sub() function.
  98         """
  99         entity = matchobj.group(1)
 100
 101         # Known non-numeric HTML entity
 102         if entity in compat_html_entities.name2codepoint:
 103                 return compat_chr(compat_html_entities.name2codepoint[entity])
 104
 105         mobj = re.match(u'(?u)#(x?\\d+)', entity)
 106         if mobj is not None:
 107                 numstr = mobj.group(1)
 108                 if numstr.startswith(u'x'):
 109                         base = 16
 110                         numstr = u'0%s' % numstr
 111                 else:
 112                         base = 10
 113                 return compat_chr(int(numstr, base))
 114
 115         # Unknown entity in name, return its literal representation
 116         return (u'&%s;' % entity)
 117
 118 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 119 class IDParser(compat_html_parser.HTMLParser):
 120         """Modified HTMLParser that isolates a tag with the specified id"""
 121         def __init__(self, id):
 122                 self.id = id
 123                 self.result = None
 124                 self.started = False
 125                 self.depth = {}
 126                 self.html = None
 127                 self.watch_startpos = False
 128                 self.error_count = 0
 129                 compat_html_parser.HTMLParser.__init__(self)
 130
 131         def error(self, message):
 132                 if self.error_count > 10 or self.started:
 133                         raise compat_html_parser.HTMLParseError(message, self.getpos())
 134                 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 135                 self.error_count += 1
 136                 self.goahead(1)
 137
 138         def loads(self, html):
 139                 self.html = html
 140                 self.feed(html)
 141                 self.close()
 142
 143         def handle_starttag(self, tag, attrs):
 144                 attrs = dict(attrs)
 145                 if self.started:
 146                         self.find_startpos(None)
 147                 if 'id' in attrs and attrs['id'] == self.id:
 148                         self.result = [tag]
 149                         self.started = True
 150                         self.watch_startpos = True
 151                 if self.started:
 152                         if not tag in self.depth: self.depth[tag] = 0
 153                         self.depth[tag] += 1
 154
 155         def handle_endtag(self, tag):
 156                 if self.started:
 157                         if tag in self.depth: self.depth[tag] -= 1
 158                         if self.depth[self.result[0]] == 0:
 159                                 self.started = False
 160                                 self.result.append(self.getpos())
 161
 162         def find_startpos(self, x):
 163                 """Needed to put the start position of the result (self.result[1])
 164                 after the opening tag with the requested id"""
 165                 if self.watch_startpos:
 166                         self.watch_startpos = False
 167                         self.result.append(self.getpos())
 168         handle_entityref = handle_charref = handle_data = handle_comment = \
 169         handle_decl = handle_pi = unknown_decl = find_startpos
 170
 171         def get_result(self):
 172                 if self.result is None:
 173                         return None
 174                 if len(self.result) != 3:
 175                         return None
 176                 lines = self.html.split('\n')
 177                 lines = lines[self.result[1][0]-1:self.result[2][0]]
 178                 lines[0] = lines[0][self.result[1][1]:]
 179                 if len(lines) == 1:
 180                         lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 181                 lines[-1] = lines[-1][:self.result[2][1]]
 182                 return '\n'.join(lines).strip()
 183
 184 def get_element_by_id(id, html):
 185         """Return the content of the tag with the specified id in the passed HTML document"""
 186         parser = IDParser(id)
 187         try:
 188                 parser.loads(html)
 189         except compat_html_parser.HTMLParseError:
 190                 pass
 191         return parser.get_result()
 192
 193
 194 def clean_html(html):
 195         """Clean an HTML snippet into a readable string"""
 196         # Newline vs <br />
 197         html = html.replace('\n', ' ')
 198         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 199         # Strip html tags
 200         html = re.sub('<.*?>', '', html)
 201         # Replace html entities
 202         html = unescapeHTML(html)
 203         return html
 204
 205
 206 def sanitize_open(filename, open_mode):
 207         """Try to open the given filename, and slightly tweak it if this fails.
 208
 209         Attempts to open the given filename. If this fails, it tries to change
 210         the filename slightly, step by step, until it's either able to open it
 211         or it fails and raises a final exception, like the standard open()
 212         function.
 213
 214         It returns the tuple (stream, definitive_file_name).
 215         """
 216         try:
 217                 if filename == u'-':
 218                         if sys.platform == 'win32':
 219                                 import msvcrt
 220                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 221                         return (sys.stdout, filename)
 222                 stream = open(encodeFilename(filename), open_mode)
 223                 return (stream, filename)
 224         except (IOError, OSError) as err:
 225                 # In case of error, try to remove win32 forbidden chars
 226                 filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
 227
 228                 # An exception here should be caught in the caller
 229                 stream = open(encodeFilename(filename), open_mode)
 230                 return (stream, filename)
 231
 232
 233 def timeconvert(timestr):
 234         """Convert RFC 2822 defined time string into system timestamp"""
 235         timestamp = None
 236         timetuple = email.utils.parsedate_tz(timestr)
 237         if timetuple is not None:
 238                 timestamp = email.utils.mktime_tz(timetuple)
 239         return timestamp
 240
 241 def sanitize_filename(s, restricted=False):
 242         """Sanitizes a string so it could be used as part of a filename.
 243         If restricted is set, use a stricter subset of allowed characters.
 244         """
 245         def replace_insane(char):
 246                 if char == '?' or ord(char) < 32 or ord(char) == 127:
 247                         return ''
 248                 elif char == '"':
 249                         return '' if restricted else '\''
 250                 elif char == ':':
 251                         return '_-' if restricted else ' -'
 252                 elif char in '\\/|*<>':
 253                         return '_'
 254                 if restricted and (char in '!&\'' or char.isspace()):
 255                         return '_'
 256                 if restricted and ord(char) > 127:
 257                         return '_'
 258                 return char
 259
 260         result = u''.join(map(replace_insane, s))
 261         while '__' in result:
 262                 result = result.replace('__', '_')
 263         result = result.strip('_')
 264         # Common case of "Foreign band name - English song title"
 265         if restricted and result.startswith('-_'):
 266                 result = result[2:]
 267         if not result:
 268                 result = '_'
 269         return result
 270
 271 def orderedSet(iterable):
 272         """ Remove all duplicates from the input iterable """
 273         res = []
 274         for el in iterable:
 275                 if el not in res:
 276                         res.append(el)
 277         return res
 278
 279 def unescapeHTML(s):
 280         """
 281         @param s a string
 282         """
 283         assert type(s) == type(u'')
 284
 285         result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 286         return result
 287
 288 def encodeFilename(s):
 289         """
 290         @param s The name of the file
 291         """
 292
 293         assert type(s) == type(u'')
 294
 295         if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 296                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 297                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 298                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 299                 return s
 300         else:
 301                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 302
 303 class DownloadError(Exception):
 304         """Download Error exception.
 305
 306         This exception may be thrown by FileDownloader objects if they are not
 307         configured to continue on errors. They will contain the appropriate
 308         error message.
 309         """
 310         pass
 311
 312
 313 class SameFileError(Exception):
 314         """Same File exception.
 315
 316         This exception will be thrown by FileDownloader objects if they detect
 317         multiple files would have to be downloaded to the same file on disk.
 318         """
 319         pass
 320
 321
 322 class PostProcessingError(Exception):
 323         """Post Processing exception.
 324
 325         This exception may be raised by PostProcessor's .run() method to
 326         indicate an error in the postprocessing task.
 327         """
 328         pass
 329
 330 class MaxDownloadsReached(Exception):
 331         """ --max-downloads limit has been reached. """
 332         pass
 333
 334
 335 class UnavailableVideoError(Exception):
 336         """Unavailable Format exception.
 337
 338         This exception will be thrown when a video is requested
 339         in a format that is not available for that video.
 340         """
 341         pass
 342
 343
 344 class ContentTooShortError(Exception):
 345         """Content Too Short exception.
 346
 347         This exception may be raised by FileDownloader objects when a file they
 348         download is too small for what the server announced first, indicating
 349         the connection was probably interrupted.
 350         """
 351         # Both in bytes
 352         downloaded = None
 353         expected = None
 354
 355         def __init__(self, downloaded, expected):
 356                 self.downloaded = downloaded
 357                 self.expected = expected
 358
 359
 360 class Trouble(Exception):
 361         """Trouble helper exception
 362
 363         This is an exception to be handled with
 364         FileDownloader.trouble
 365         """
 366
 367 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 368         """Handler for HTTP requests and responses.
 369
 370         This class, when installed with an OpenerDirector, automatically adds
 371         the standard headers to every HTTP request and handles gzipped and
 372         deflated responses from web servers. If compression is to be avoided in
 373         a particular request, the original request in the program code only has
 374         to include the HTTP header "Youtubedl-No-Compression", which will be
 375         removed before making the real request.
 376
 377         Part of this code was copied from:
 378
 379         http://techknack.net/python-urllib2-handlers/
 380
 381         Andrew Rowls, the author of that code, agreed to release it to the
 382         public domain.
 383         """
 384
 385         @staticmethod
 386         def deflate(data):
 387                 try:
 388                         return zlib.decompress(data, -zlib.MAX_WBITS)
 389                 except zlib.error:
 390                         return zlib.decompress(data)
 391
 392         @staticmethod
 393         def addinfourl_wrapper(stream, headers, url, code):
 394                 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 395                         return compat_urllib_request.addinfourl(stream, headers, url, code)
 396                 ret = compat_urllib_request.addinfourl(stream, headers, url)
 397                 ret.code = code
 398                 return ret
 399
 400         def http_request(self, req):
 401                 for h in std_headers:
 402                         if h in req.headers:
 403                                 del req.headers[h]
 404                         req.add_header(h, std_headers[h])
 405                 if 'Youtubedl-no-compression' in req.headers:
 406                         if 'Accept-encoding' in req.headers:
 407                                 del req.headers['Accept-encoding']
 408                         del req.headers['Youtubedl-no-compression']
 409                 return req
 410
 411         def http_response(self, req, resp):
 412                 old_resp = resp
 413                 # gzip
 414                 if resp.headers.get('Content-encoding', '') == 'gzip':
 415                         gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 416                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 417                         resp.msg = old_resp.msg
 418                 # deflate
 419                 if resp.headers.get('Content-encoding', '') == 'deflate':
 420                         gz = io.BytesIO(self.deflate(resp.read()))
 421                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 422                         resp.msg = old_resp.msg
 423                 return resp