_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import htmlentitydefs
   6 import HTMLParser
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import zlib
  12 import urllib2
  13 import email.utils
  14
  15 try:
  16         import cStringIO as StringIO
  17 except ImportError:
  18         import StringIO
  19
  20 try:
  21         import json
  22 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
  23         import trivialjson as json
  24
# Default HTTP headers. YoutubeDLHandler.http_request() adds every entry of
# this dictionary to each outgoing request.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # Both of these encodings are decoded by YoutubeDLHandler.http_response()
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  32
  33 def preferredencoding():
  34         """Get preferred encoding.
  35
  36         Returns the best encoding scheme for the system, based on
  37         locale.getpreferredencoding() and some further tweaks.
  38         """
  39         def yield_preferredencoding():
  40                 try:
  41                         pref = locale.getpreferredencoding()
  42                         u'TEST'.encode(pref)
  43                 except:
  44                         pref = 'UTF-8'
  45                 while True:
  46                         yield pref
  47         return yield_preferredencoding().next()
  48
  49
  50 def htmlentity_transform(matchobj):
  51         """Transforms an HTML entity to a Unicode character.
  52
  53         This function receives a match object and is intended to be used with
  54         the re.sub() function.
  55         """
  56         entity = matchobj.group(1)
  57
  58         # Known non-numeric HTML entity
  59         if entity in htmlentitydefs.name2codepoint:
  60                 return unichr(htmlentitydefs.name2codepoint[entity])
  61
  62         # Unicode character
  63         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  64         if mobj is not None:
  65                 numstr = mobj.group(1)
  66                 if numstr.startswith(u'x'):
  67                         base = 16
  68                         numstr = u'0%s' % numstr
  69                 else:
  70                         base = 10
  71                 return unichr(long(numstr, base))
  72
  73         # Unknown entity in name, return its literal representation
  74         return (u'&%s;' % entity)
  75
  76
  77 class IDParser(HTMLParser.HTMLParser):
  78         """Modified HTMLParser that isolates a tag with the specified id"""
  79         def __init__(self, id):
  80                 self.id = id
  81                 self.result = None
  82                 self.started = False
  83                 self.depth = {}
  84                 self.html = None
  85                 self.watch_startpos = False
  86                 HTMLParser.HTMLParser.__init__(self)
  87
  88         def loads(self, html):
  89                 self.html = html
  90                 self.feed(html)
  91                 self.close()
  92
  93         def handle_starttag(self, tag, attrs):
  94                 attrs = dict(attrs)
  95                 if self.started:
  96                         self.find_startpos(None)
  97                 if 'id' in attrs and attrs['id'] == self.id:
  98                         self.result = [tag]
  99                         self.started = True
 100                         self.watch_startpos = True
 101                 if self.started:
 102                         if not tag in self.depth: self.depth[tag] = 0
 103                         self.depth[tag] += 1
 104
 105         def handle_endtag(self, tag):
 106                 if self.started:
 107                         if tag in self.depth: self.depth[tag] -= 1
 108                         if self.depth[self.result[0]] == 0:
 109                                 self.started = False
 110                                 self.result.append(self.getpos())
 111
 112         def find_startpos(self, x):
 113                 """Needed to put the start position of the result (self.result[1])
 114                 after the opening tag with the requested id"""
 115                 if self.watch_startpos:
 116                         self.watch_startpos = False
 117                         self.result.append(self.getpos())
 118         handle_entityref = handle_charref = handle_data = handle_comment = \
 119         handle_decl = handle_pi = unknown_decl = find_startpos
 120
 121         def get_result(self):
 122                 if self.result == None: return None
 123                 if len(self.result) != 3: return None
 124                 lines = self.html.split('\n')
 125                 lines = lines[self.result[1][0]-1:self.result[2][0]]
 126                 lines[0] = lines[0][self.result[1][1]:]
 127                 if len(lines) == 1:
 128                         lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 129                 lines[-1] = lines[-1][:self.result[2][1]]
 130                 return '\n'.join(lines).strip()
 131
 132 def get_element_by_id(id, html):
 133         """Return the content of the tag with the specified id in the passed HTML document"""
 134         parser = IDParser(id)
 135         try:
 136                 parser.loads(html)
 137         except HTMLParser.HTMLParseError:
 138                 pass
 139         return parser.get_result()
 140
 141
 142 def clean_html(html):
 143         """Clean an HTML snippet into a readable string"""
 144         # Newline vs <br />
 145         html = html.replace('\n', ' ')
 146         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 147         # Strip html tags
 148         html = re.sub('<.*?>', '', html)
 149         # Replace html entities
 150         html = unescapeHTML(html)
 151         return html
 152
 153
 154 def sanitize_title(utitle):
 155         """Sanitizes a video title so it could be used as part of a filename."""
 156         utitle = unescapeHTML(utitle)
 157         return utitle.replace(unicode(os.sep), u'%')
 158
 159
 160 def sanitize_open(filename, open_mode):
 161         """Try to open the given filename, and slightly tweak it if this fails.
 162
 163         Attempts to open the given filename. If this fails, it tries to change
 164         the filename slightly, step by step, until it's either able to open it
 165         or it fails and raises a final exception, like the standard open()
 166         function.
 167
 168         It returns the tuple (stream, definitive_file_name).
 169         """
 170         try:
 171                 if filename == u'-':
 172                         if sys.platform == 'win32':
 173                                 import msvcrt
 174                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 175                         return (sys.stdout, filename)
 176                 stream = open(encodeFilename(filename), open_mode)
 177                 return (stream, filename)
 178         except (IOError, OSError), err:
 179                 # In case of error, try to remove win32 forbidden chars
 180                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 181
 182                 # An exception here should be caught in the caller
 183                 stream = open(encodeFilename(filename), open_mode)
 184                 return (stream, filename)
 185
 186
 187 def timeconvert(timestr):
 188         """Convert RFC 2822 defined time string into system timestamp"""
 189         timestamp = None
 190         timetuple = email.utils.parsedate_tz(timestr)
 191         if timetuple is not None:
 192                 timestamp = email.utils.mktime_tz(timetuple)
 193         return timestamp
 194
 195 def simplify_title(title):
 196         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
 197         return expr.sub(u'_', title).strip(u'_')
 198
 199 def orderedSet(iterable):
 200         """ Remove all duplicates from the input iterable """
 201         res = []
 202         for el in iterable:
 203                 if el not in res:
 204                         res.append(el)
 205         return res
 206
 207 def unescapeHTML(s):
 208         """
 209         @param s a string (of type unicode)
 210         """
 211         assert type(s) == type(u'')
 212
 213         result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
 214         return result
 215
 216 def encodeFilename(s):
 217         """
 218         @param s The name of the file (of type unicode)
 219         """
 220
 221         assert type(s) == type(u'')
 222
 223         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
 224                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 225                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 226                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 227                 return s
 228         else:
 229                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 230
 231 class DownloadError(Exception):
 232         """Download Error exception.
 233
 234         This exception may be thrown by FileDownloader objects if they are not
 235         configured to continue on errors. They will contain the appropriate
 236         error message.
 237         """
 238         pass
 239
 240
 241 class SameFileError(Exception):
 242         """Same File exception.
 243
 244         This exception will be thrown by FileDownloader objects if they detect
 245         multiple files would have to be downloaded to the same file on disk.
 246         """
 247         pass
 248
 249
 250 class PostProcessingError(Exception):
 251         """Post Processing exception.
 252
 253         This exception may be raised by PostProcessor's .run() method to
 254         indicate an error in the postprocessing task.
 255         """
 256         pass
 257
 258 class MaxDownloadsReached(Exception):
 259         """ --max-downloads limit has been reached. """
 260         pass
 261
 262
 263 class UnavailableVideoError(Exception):
 264         """Unavailable Format exception.
 265
 266         This exception will be thrown when a video is requested
 267         in a format that is not available for that video.
 268         """
 269         pass
 270
 271
 272 class ContentTooShortError(Exception):
 273         """Content Too Short exception.
 274
 275         This exception may be raised by FileDownloader objects when a file they
 276         download is too small for what the server announced first, indicating
 277         the connection was probably interrupted.
 278         """
 279         # Both in bytes
 280         downloaded = None
 281         expected = None
 282
 283         def __init__(self, downloaded, expected):
 284                 self.downloaded = downloaded
 285                 self.expected = expected
 286
 287
 288 class YoutubeDLHandler(urllib2.HTTPHandler):
 289         """Handler for HTTP requests and responses.
 290
 291         This class, when installed with an OpenerDirector, automatically adds
 292         the standard headers to every HTTP request and handles gzipped and
 293         deflated responses from web servers. If compression is to be avoided in
 294         a particular request, the original request in the program code only has
 295         to include the HTTP header "Youtubedl-No-Compression", which will be
 296         removed before making the real request.
 297
 298         Part of this code was copied from:
 299
 300         http://techknack.net/python-urllib2-handlers/
 301
 302         Andrew Rowls, the author of that code, agreed to release it to the
 303         public domain.
 304         """
 305
 306         @staticmethod
 307         def deflate(data):
 308                 try:
 309                         return zlib.decompress(data, -zlib.MAX_WBITS)
 310                 except zlib.error:
 311                         return zlib.decompress(data)
 312
 313         @staticmethod
 314         def addinfourl_wrapper(stream, headers, url, code):
 315                 if hasattr(urllib2.addinfourl, 'getcode'):
 316                         return urllib2.addinfourl(stream, headers, url, code)
 317                 ret = urllib2.addinfourl(stream, headers, url)
 318                 ret.code = code
 319                 return ret
 320
 321         def http_request(self, req):
 322                 for h in std_headers:
 323                         if h in req.headers:
 324                                 del req.headers[h]
 325                         req.add_header(h, std_headers[h])
 326                 if 'Youtubedl-no-compression' in req.headers:
 327                         if 'Accept-encoding' in req.headers:
 328                                 del req.headers['Accept-encoding']
 329                         del req.headers['Youtubedl-no-compression']
 330                 return req
 331
 332         def http_response(self, req, resp):
 333                 old_resp = resp
 334                 # gzip
 335                 if resp.headers.get('Content-encoding', '') == 'gzip':
 336                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 337                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 338                         resp.msg = old_resp.msg
 339                 # deflate
 340                 if resp.headers.get('Content-encoding', '') == 'deflate':
 341                         gz = StringIO.StringIO(self.deflate(resp.read()))
 342                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 343                         resp.msg = old_resp.msg
 344                 return resp