_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import io
   6 import locale
   7 import os
   8 import re
   9 import sys
  10 import zlib
  11 import email.utils
  12 import json
  13
  14 try:
  15         import urllib.request as compat_urllib_request
  16 except ImportError: # Python 2
  17         import urllib2 as compat_urllib_request
  18
  19 try:
  20         import urllib.error as compat_urllib_error
  21 except ImportError: # Python 2
  22         import urllib2 as compat_urllib_error
  23
  24 try:
  25         import urllib.parse as compat_urllib_parse
  26 except ImportError: # Python 2
  27         import urllib as compat_urllib_parse
  28
  29 try:
  30         import http.cookiejar as compat_cookiejar
  31 except ImportError: # Python 2
  32         import cookielib as compat_cookiejar
  33
  34 try:
  35         import html.entities as compat_html_entities
  36 except ImportError: # Python 2
  37         import htmlentitydefs as compat_html_entities
  38
  39 try:
  40         import html.parser as compat_html_parser
  41 except ImportError: # Python 2
  42         import HTMLParser as compat_html_parser
  43
  44 try:
  45         import http.client as compat_http_client
  46 except ImportError: # Python 2
  47         import httplib as compat_http_client
  48
  49 try:
  50         from urllib.parse import parse_qs as compat_parse_qs
  51 except ImportError: # Python 2
  52         from urlparse import parse_qs as compat_parse_qs
  53
  54 try:
  55         compat_str = unicode # Python 2
  56 except NameError:
  57         compat_str = str
  58
  59 try:
  60         compat_chr = unichr # Python 2
  61 except NameError:
  62         compat_chr = chr
  63
  64 std_headers = {
  65         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
  66         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  67         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  68         'Accept-Encoding': 'gzip, deflate',
  69         'Accept-Language': 'en-us,en;q=0.5',
  70 }
  71 def preferredencoding():
  72         """Get preferred encoding.
  73
  74         Returns the best encoding scheme for the system, based on
  75         locale.getpreferredencoding() and some further tweaks.
  76         """
  77         try:
  78                 pref = locale.getpreferredencoding()
  79                 u'TEST'.encode(pref)
  80         except:
  81                 pref = 'UTF-8'
  82
  83         return pref
  84
  85 if sys.version_info < (3,0):
  86         def compat_print(s):
  87                 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
  88 else:
  89         def compat_print(s):
  90                 assert type(s) == type(u'')
  91                 print(s)
  92
  93 def htmlentity_transform(matchobj):
  94         """Transforms an HTML entity to a character.
  95
  96         This function receives a match object and is intended to be used with
  97         the re.sub() function.
  98         """
  99         entity = matchobj.group(1)
 100
 101         # Known non-numeric HTML entity
 102         if entity in compat_html_entities.name2codepoint:
 103                 return compat_chr(compat_html_entities.name2codepoint[entity])
 104
 105         mobj = re.match(u'(?u)#(x?\\d+)', entity)
 106         if mobj is not None:
 107                 numstr = mobj.group(1)
 108                 if numstr.startswith(u'x'):
 109                         base = 16
 110                         numstr = u'0%s' % numstr
 111                 else:
 112                         base = 10
 113                 return compat_chr(int(numstr, base))
 114
 115         # Unknown entity in name, return its literal representation
 116         return (u'&%s;' % entity)
 117
 118 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 119 class IDParser(compat_html_parser.HTMLParser):
 120         """Modified HTMLParser that isolates a tag with the specified id"""
 121         def __init__(self, id):
 122                 self.id = id
 123                 self.result = None
 124                 self.started = False
 125                 self.depth = {}
 126                 self.html = None
 127                 self.watch_startpos = False
 128                 self.error_count = 0
 129                 compat_html_parser.HTMLParser.__init__(self)
 130
 131         def error(self, message):
 132                 if self.error_count > 10 or self.started:
 133                         raise compat_html_parser.HTMLParseError(message, self.getpos())
 134                 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 135                 self.error_count += 1
 136                 self.goahead(1)
 137
 138         def loads(self, html):
 139                 self.html = html
 140                 self.feed(html)
 141                 self.close()
 142
 143         def handle_starttag(self, tag, attrs):
 144                 attrs = dict(attrs)
 145                 if self.started:
 146                         self.find_startpos(None)
 147                 if 'id' in attrs and attrs['id'] == self.id:
 148                         self.result = [tag]
 149                         self.started = True
 150                         self.watch_startpos = True
 151                 if self.started:
 152                         if not tag in self.depth: self.depth[tag] = 0
 153                         self.depth[tag] += 1
 154
 155         def handle_endtag(self, tag):
 156                 if self.started:
 157                         if tag in self.depth: self.depth[tag] -= 1
 158                         if self.depth[self.result[0]] == 0:
 159                                 self.started = False
 160                                 self.result.append(self.getpos())
 161
 162         def find_startpos(self, x):
 163                 """Needed to put the start position of the result (self.result[1])
 164                 after the opening tag with the requested id"""
 165                 if self.watch_startpos:
 166                         self.watch_startpos = False
 167                         self.result.append(self.getpos())
 168         handle_entityref = handle_charref = handle_data = handle_comment = \
 169         handle_decl = handle_pi = unknown_decl = find_startpos
 170
 171         def get_result(self):
 172                 if self.result is None:
 173                         return None
 174                 if len(self.result) != 3:
 175                         return None
 176                 lines = self.html.split('\n')
 177                 lines = lines[self.result[1][0]-1:self.result[2][0]]
 178                 lines[0] = lines[0][self.result[1][1]:]
 179                 if len(lines) == 1:
 180                         lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 181                 lines[-1] = lines[-1][:self.result[2][1]]
 182                 return '\n'.join(lines).strip()
 183
 184 def get_element_by_id(id, html):
 185         """Return the content of the tag with the specified id in the passed HTML document"""
 186         parser = IDParser(id)
 187         try:
 188                 parser.loads(html)
 189         except compat_html_parser.HTMLParseError:
 190                 pass
 191         return parser.get_result()
 192
 193
 194 def clean_html(html):
 195         """Clean an HTML snippet into a readable string"""
 196         # Newline vs <br />
 197         html = html.replace('\n', ' ')
 198         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 199         # Strip html tags
 200         html = re.sub('<.*?>', '', html)
 201         # Replace html entities
 202         html = unescapeHTML(html)
 203         return html
 204
 205
 206 def sanitize_open(filename, open_mode):
 207         """Try to open the given filename, and slightly tweak it if this fails.
 208
 209         Attempts to open the given filename. If this fails, it tries to change
 210         the filename slightly, step by step, until it's either able to open it
 211         or it fails and raises a final exception, like the standard open()
 212         function.
 213
 214         It returns the tuple (stream, definitive_file_name).
 215         """
 216         try:
 217                 if filename == u'-':
 218                         if sys.platform == 'win32':
 219                                 import msvcrt
 220                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 221                         return (sys.stdout, filename)
 222                 stream = open(encodeFilename(filename), open_mode)
 223                 return (stream, filename)
 224         except (IOError, OSError) as err:
 225                 # In case of error, try to remove win32 forbidden chars
 226                 filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
 227
 228                 # An exception here should be caught in the caller
 229                 stream = open(encodeFilename(filename), open_mode)
 230                 return (stream, filename)
 231
 232
 233 def timeconvert(timestr):
 234         """Convert RFC 2822 defined time string into system timestamp"""
 235         timestamp = None
 236         timetuple = email.utils.parsedate_tz(timestr)
 237         if timetuple is not None:
 238                 timestamp = email.utils.mktime_tz(timetuple)
 239         return timestamp
 240
 241 def sanitize_filename(s, restricted=False):
 242         """Sanitizes a string so it could be used as part of a filename.
 243         If restricted is set, use a stricter subset of allowed characters.
 244         """
 245         def replace_insane(char):
 246                 if char == '?' or ord(char) < 32 or ord(char) == 127:
 247                         return ''
 248                 elif char == '"':
 249                         return '' if restricted else '\''
 250                 elif char == ':':
 251                         return '_-' if restricted else ' -'
 252                 elif char in '\\/|*<>':
 253                         return '_'
 254                 if restricted and (char in '!&\'' or char.isspace()):
 255                         return '_'
 256                 if restricted and ord(char) > 127:
 257                         return '_'
 258                 return char
 259
 260         result = u''.join(map(replace_insane, s))
 261         while '__' in result:
 262                 result = result.replace('__', '_')
 263         result = result.strip('_')
 264         # Common case of "Foreign band name - English song title"
 265         if restricted and result.startswith('-_'):
 266                 result = result[2:]
 267         if not result:
 268                 result = '_'
 269         return result
 270
 271 def orderedSet(iterable):
 272         """ Remove all duplicates from the input iterable """
 273         res = []
 274         for el in iterable:
 275                 if el not in res:
 276                         res.append(el)
 277         return res
 278
 279 def unescapeHTML(s):
 280         """
 281         @param s a string
 282         """
 283         assert type(s) == type(u'')
 284
 285         result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 286         return result
 287
 288 def encodeFilename(s):
 289         """
 290         @param s The name of the file
 291         """
 292
 293         assert type(s) == type(u'')
 294
 295         # Python 3 has a Unicode API
 296         if sys.version_info >= (3, 0):
 297                 return s
 298
 299         if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 300                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 301                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 302                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 303                 return s
 304         else:
 305                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 306
 307 class DownloadError(Exception):
 308         """Download Error exception.
 309
 310         This exception may be thrown by FileDownloader objects if they are not
 311         configured to continue on errors. They will contain the appropriate
 312         error message.
 313         """
 314         pass
 315
 316
 317 class SameFileError(Exception):
 318         """Same File exception.
 319
 320         This exception will be thrown by FileDownloader objects if they detect
 321         multiple files would have to be downloaded to the same file on disk.
 322         """
 323         pass
 324
 325
 326 class PostProcessingError(Exception):
 327         """Post Processing exception.
 328
 329         This exception may be raised by PostProcessor's .run() method to
 330         indicate an error in the postprocessing task.
 331         """
 332         pass
 333
 334 class MaxDownloadsReached(Exception):
 335         """ --max-downloads limit has been reached. """
 336         pass
 337
 338
 339 class UnavailableVideoError(Exception):
 340         """Unavailable Format exception.
 341
 342         This exception will be thrown when a video is requested
 343         in a format that is not available for that video.
 344         """
 345         pass
 346
 347
 348 class ContentTooShortError(Exception):
 349         """Content Too Short exception.
 350
 351         This exception may be raised by FileDownloader objects when a file they
 352         download is too small for what the server announced first, indicating
 353         the connection was probably interrupted.
 354         """
 355         # Both in bytes
 356         downloaded = None
 357         expected = None
 358
 359         def __init__(self, downloaded, expected):
 360                 self.downloaded = downloaded
 361                 self.expected = expected
 362
 363
 364 class Trouble(Exception):
 365         """Trouble helper exception
 366
 367         This is an exception to be handled with
 368         FileDownloader.trouble
 369         """
 370
 371 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 372         """Handler for HTTP requests and responses.
 373
 374         This class, when installed with an OpenerDirector, automatically adds
 375         the standard headers to every HTTP request and handles gzipped and
 376         deflated responses from web servers. If compression is to be avoided in
 377         a particular request, the original request in the program code only has
 378         to include the HTTP header "Youtubedl-No-Compression", which will be
 379         removed before making the real request.
 380
 381         Part of this code was copied from:
 382
 383         http://techknack.net/python-urllib2-handlers/
 384
 385         Andrew Rowls, the author of that code, agreed to release it to the
 386         public domain.
 387         """
 388
 389         @staticmethod
 390         def deflate(data):
 391                 try:
 392                         return zlib.decompress(data, -zlib.MAX_WBITS)
 393                 except zlib.error:
 394                         return zlib.decompress(data)
 395
 396         @staticmethod
 397         def addinfourl_wrapper(stream, headers, url, code):
 398                 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 399                         return compat_urllib_request.addinfourl(stream, headers, url, code)
 400                 ret = compat_urllib_request.addinfourl(stream, headers, url)
 401                 ret.code = code
 402                 return ret
 403
 404         def http_request(self, req):
 405                 for h in std_headers:
 406                         if h in req.headers:
 407                                 del req.headers[h]
 408                         req.add_header(h, std_headers[h])
 409                 if 'Youtubedl-no-compression' in req.headers:
 410                         if 'Accept-encoding' in req.headers:
 411                                 del req.headers['Accept-encoding']
 412                         del req.headers['Youtubedl-no-compression']
 413                 return req
 414
 415         def http_response(self, req, resp):
 416                 old_resp = resp
 417                 # gzip
 418                 if resp.headers.get('Content-encoding', '') == 'gzip':
 419                         gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 420                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 421                         resp.msg = old_resp.msg
 422                 # deflate
 423                 if resp.headers.get('Content-encoding', '') == 'deflate':
 424                         gz = io.BytesIO(self.deflate(resp.read()))
 425                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 426                         resp.msg = old_resp.msg
 427                 return resp