_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import htmlentitydefs
   6 import HTMLParser
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import zlib
  12 import urllib2
  13 import email.utils
  14 import json
  15
  16 try:
  17         import cStringIO as StringIO
  18 except ImportError:
  19         import StringIO
  20
# Default HTTP request headers. YoutubeDLHandler.http_request() puts every
# entry of this dict on each request it processes, replacing any value the
# caller had already set under the same name.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # gzip and deflate bodies are decoded again in YoutubeDLHandler.http_response()
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  28
  29 try:
  30     compat_str = unicode # Python 2
  31 except NameError:
  32     compat_str = str
  33
  34 def preferredencoding():
  35         """Get preferred encoding.
  36
  37         Returns the best encoding scheme for the system, based on
  38         locale.getpreferredencoding() and some further tweaks.
  39         """
  40         try:
  41                 pref = locale.getpreferredencoding()
  42                 u'TEST'.encode(pref)
  43         except:
  44                 pref = 'UTF-8'
  45
  46         return pref
  47
  48
  49 def htmlentity_transform(matchobj):
  50         """Transforms an HTML entity to a Unicode character.
  51
  52         This function receives a match object and is intended to be used with
  53         the re.sub() function.
  54         """
  55         entity = matchobj.group(1)
  56
  57         # Known non-numeric HTML entity
  58         if entity in htmlentitydefs.name2codepoint:
  59                 return unichr(htmlentitydefs.name2codepoint[entity])
  60
  61         # Unicode character
  62         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  63         if mobj is not None:
  64                 numstr = mobj.group(1)
  65                 if numstr.startswith(u'x'):
  66                         base = 16
  67                         numstr = u'0%s' % numstr
  68                 else:
  69                         base = 10
  70                 return unichr(long(numstr, base))
  71
  72         # Unknown entity in name, return its literal representation
  73         return (u'&%s;' % entity)
  74
  75 HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
  76 class IDParser(HTMLParser.HTMLParser):
  77         """Modified HTMLParser that isolates a tag with the specified id"""
  78         def __init__(self, id):
  79                 self.id = id
  80                 self.result = None
  81                 self.started = False
  82                 self.depth = {}
  83                 self.html = None
  84                 self.watch_startpos = False
  85                 self.error_count = 0
  86                 HTMLParser.HTMLParser.__init__(self)
  87
  88         def error(self, message):
  89                 if self.error_count > 10 or self.started:
  90                         raise HTMLParser.HTMLParseError(message, self.getpos())
  91                 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
  92                 self.error_count += 1
  93                 self.goahead(1)
  94
  95         def loads(self, html):
  96                 self.html = html
  97                 self.feed(html)
  98                 self.close()
  99
 100         def handle_starttag(self, tag, attrs):
 101                 attrs = dict(attrs)
 102                 if self.started:
 103                         self.find_startpos(None)
 104                 if 'id' in attrs and attrs['id'] == self.id:
 105                         self.result = [tag]
 106                         self.started = True
 107                         self.watch_startpos = True
 108                 if self.started:
 109                         if not tag in self.depth: self.depth[tag] = 0
 110                         self.depth[tag] += 1
 111
 112         def handle_endtag(self, tag):
 113                 if self.started:
 114                         if tag in self.depth: self.depth[tag] -= 1
 115                         if self.depth[self.result[0]] == 0:
 116                                 self.started = False
 117                                 self.result.append(self.getpos())
 118
 119         def find_startpos(self, x):
 120                 """Needed to put the start position of the result (self.result[1])
 121                 after the opening tag with the requested id"""
 122                 if self.watch_startpos:
 123                         self.watch_startpos = False
 124                         self.result.append(self.getpos())
 125         handle_entityref = handle_charref = handle_data = handle_comment = \
 126         handle_decl = handle_pi = unknown_decl = find_startpos
 127
 128         def get_result(self):
 129                 if self.result == None: return None
 130                 if len(self.result) != 3: return None
 131                 lines = self.html.split('\n')
 132                 lines = lines[self.result[1][0]-1:self.result[2][0]]
 133                 lines[0] = lines[0][self.result[1][1]:]
 134                 if len(lines) == 1:
 135                         lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 136                 lines[-1] = lines[-1][:self.result[2][1]]
 137                 return '\n'.join(lines).strip()
 138
 139 def get_element_by_id(id, html):
 140         """Return the content of the tag with the specified id in the passed HTML document"""
 141         parser = IDParser(id)
 142         try:
 143                 parser.loads(html)
 144         except HTMLParser.HTMLParseError:
 145                 pass
 146         return parser.get_result()
 147
 148
 149 def clean_html(html):
 150         """Clean an HTML snippet into a readable string"""
 151         # Newline vs <br />
 152         html = html.replace('\n', ' ')
 153         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 154         # Strip html tags
 155         html = re.sub('<.*?>', '', html)
 156         # Replace html entities
 157         html = unescapeHTML(html)
 158         return html
 159
 160
 161 def sanitize_open(filename, open_mode):
 162         """Try to open the given filename, and slightly tweak it if this fails.
 163
 164         Attempts to open the given filename. If this fails, it tries to change
 165         the filename slightly, step by step, until it's either able to open it
 166         or it fails and raises a final exception, like the standard open()
 167         function.
 168
 169         It returns the tuple (stream, definitive_file_name).
 170         """
 171         try:
 172                 if filename == u'-':
 173                         if sys.platform == 'win32':
 174                                 import msvcrt
 175                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 176                         return (sys.stdout, filename)
 177                 stream = open(encodeFilename(filename), open_mode)
 178                 return (stream, filename)
 179         except (IOError, OSError), err:
 180                 # In case of error, try to remove win32 forbidden chars
 181                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 182
 183                 # An exception here should be caught in the caller
 184                 stream = open(encodeFilename(filename), open_mode)
 185                 return (stream, filename)
 186
 187
 188 def timeconvert(timestr):
 189         """Convert RFC 2822 defined time string into system timestamp"""
 190         timestamp = None
 191         timetuple = email.utils.parsedate_tz(timestr)
 192         if timetuple is not None:
 193                 timestamp = email.utils.mktime_tz(timetuple)
 194         return timestamp
 195
 196 def sanitize_filename(s, restricted=False):
 197         """Sanitizes a string so it could be used as part of a filename.
 198         If restricted is set, use a stricter subset of allowed characters.
 199         """
 200         def replace_insane(char):
 201                 if char == '?' or ord(char) < 32 or ord(char) == 127:
 202                         return ''
 203                 elif char == '"':
 204                         return '' if restricted else '\''
 205                 elif char == ':':
 206                         return '_-' if restricted else ' -'
 207                 elif char in '\\/|*<>':
 208                         return '_'
 209                 if restricted and (char in '!&\'' or char.isspace()):
 210                         return '_'
 211                 if restricted and ord(char) > 127:
 212                         return '_'
 213                 return char
 214
 215         result = u''.join(map(replace_insane, s))
 216         while '__' in result:
 217                 result = result.replace('__', '_')
 218         result = result.strip('_')
 219         # Common case of "Foreign band name - English song title"
 220         if restricted and result.startswith('-_'):
 221                 result = result[2:]
 222         if not result:
 223                 result = '_'
 224         return result
 225
 226 def orderedSet(iterable):
 227         """ Remove all duplicates from the input iterable """
 228         res = []
 229         for el in iterable:
 230                 if el not in res:
 231                         res.append(el)
 232         return res
 233
 234 def unescapeHTML(s):
 235         """
 236         @param s a string (of type unicode)
 237         """
 238         assert type(s) == type(u'')
 239
 240         result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
 241         return result
 242
 243 def encodeFilename(s):
 244         """
 245         @param s The name of the file (of type unicode)
 246         """
 247
 248         assert type(s) == type(u'')
 249
 250         if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 251                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 252                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 253                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 254                 return s
 255         else:
 256                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 257
 258 class DownloadError(Exception):
 259         """Download Error exception.
 260
 261         This exception may be thrown by FileDownloader objects if they are not
 262         configured to continue on errors. They will contain the appropriate
 263         error message.
 264         """
 265         pass
 266
 267
 268 class SameFileError(Exception):
 269         """Same File exception.
 270
 271         This exception will be thrown by FileDownloader objects if they detect
 272         multiple files would have to be downloaded to the same file on disk.
 273         """
 274         pass
 275
 276
 277 class PostProcessingError(Exception):
 278         """Post Processing exception.
 279
 280         This exception may be raised by PostProcessor's .run() method to
 281         indicate an error in the postprocessing task.
 282         """
 283         pass
 284
 285 class MaxDownloadsReached(Exception):
 286         """ --max-downloads limit has been reached. """
 287         pass
 288
 289
 290 class UnavailableVideoError(Exception):
 291         """Unavailable Format exception.
 292
 293         This exception will be thrown when a video is requested
 294         in a format that is not available for that video.
 295         """
 296         pass
 297
 298
 299 class ContentTooShortError(Exception):
 300         """Content Too Short exception.
 301
 302         This exception may be raised by FileDownloader objects when a file they
 303         download is too small for what the server announced first, indicating
 304         the connection was probably interrupted.
 305         """
 306         # Both in bytes
 307         downloaded = None
 308         expected = None
 309
 310         def __init__(self, downloaded, expected):
 311                 self.downloaded = downloaded
 312                 self.expected = expected
 313
 314
 315 class Trouble(Exception):
 316         """Trouble helper exception
 317
 318         This is an exception to be handled with
 319         FileDownloader.trouble
 320         """
 321
 322 class YoutubeDLHandler(urllib2.HTTPHandler):
 323         """Handler for HTTP requests and responses.
 324
 325         This class, when installed with an OpenerDirector, automatically adds
 326         the standard headers to every HTTP request and handles gzipped and
 327         deflated responses from web servers. If compression is to be avoided in
 328         a particular request, the original request in the program code only has
 329         to include the HTTP header "Youtubedl-No-Compression", which will be
 330         removed before making the real request.
 331
 332         Part of this code was copied from:
 333
 334         http://techknack.net/python-urllib2-handlers/
 335
 336         Andrew Rowls, the author of that code, agreed to release it to the
 337         public domain.
 338         """
 339
 340         @staticmethod
 341         def deflate(data):
 342                 try:
 343                         return zlib.decompress(data, -zlib.MAX_WBITS)
 344                 except zlib.error:
 345                         return zlib.decompress(data)
 346
 347         @staticmethod
 348         def addinfourl_wrapper(stream, headers, url, code):
 349                 if hasattr(urllib2.addinfourl, 'getcode'):
 350                         return urllib2.addinfourl(stream, headers, url, code)
 351                 ret = urllib2.addinfourl(stream, headers, url)
 352                 ret.code = code
 353                 return ret
 354
 355         def http_request(self, req):
 356                 for h in std_headers:
 357                         if h in req.headers:
 358                                 del req.headers[h]
 359                         req.add_header(h, std_headers[h])
 360                 if 'Youtubedl-no-compression' in req.headers:
 361                         if 'Accept-encoding' in req.headers:
 362                                 del req.headers['Accept-encoding']
 363                         del req.headers['Youtubedl-no-compression']
 364                 return req
 365
 366         def http_response(self, req, resp):
 367                 old_resp = resp
 368                 # gzip
 369                 if resp.headers.get('Content-encoding', '') == 'gzip':
 370                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 371                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 372                         resp.msg = old_resp.msg
 373                 # deflate
 374                 if resp.headers.get('Content-encoding', '') == 'deflate':
 375                         gz = StringIO.StringIO(self.deflate(resp.read()))
 376                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 377                         resp.msg = old_resp.msg
 378                 return resp