git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import htmlentitydefs
   6 import HTMLParser
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import zlib
  12 import urllib2
  13 import email.utils
  14 import json
  15
  16 try:
  17         import cStringIO as StringIO
  18 except ImportError:
  19         import StringIO
  20
# Default HTTP headers. YoutubeDLHandler.http_request() sets every one of them
# on each request it processes.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # Removed again by YoutubeDLHandler.http_request() when the request
        # carries the "Youtubedl-No-Compression" header.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  28
  29 def preferredencoding():
  30         """Get preferred encoding.
  31
  32         Returns the best encoding scheme for the system, based on
  33         locale.getpreferredencoding() and some further tweaks.
  34         """
  35         def yield_preferredencoding():
  36                 try:
  37                         pref = locale.getpreferredencoding()
  38                         u'TEST'.encode(pref)
  39                 except:
  40                         pref = 'UTF-8'
  41                 while True:
  42                         yield pref
  43         return yield_preferredencoding().next()
  44
  45
  46 def htmlentity_transform(matchobj):
  47         """Transforms an HTML entity to a Unicode character.
  48
  49         This function receives a match object and is intended to be used with
  50         the re.sub() function.
  51         """
  52         entity = matchobj.group(1)
  53
  54         # Known non-numeric HTML entity
  55         if entity in htmlentitydefs.name2codepoint:
  56                 return unichr(htmlentitydefs.name2codepoint[entity])
  57
  58         # Unicode character
  59         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  60         if mobj is not None:
  61                 numstr = mobj.group(1)
  62                 if numstr.startswith(u'x'):
  63                         base = 16
  64                         numstr = u'0%s' % numstr
  65                 else:
  66                         base = 10
  67                 return unichr(long(numstr, base))
  68
  69         # Unknown entity in name, return its literal representation
  70         return (u'&%s;' % entity)
  71
# Replace the HTMLParser module's `locatestarttagend` pattern with the compiled
# regex below. This is a module-level monkeypatch, so it affects every user of
# the HTMLParser module in this process, not only IDParser.
HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
  73 class IDParser(HTMLParser.HTMLParser):
  74         """Modified HTMLParser that isolates a tag with the specified id"""
  75         def __init__(self, id):
  76                 self.id = id
  77                 self.result = None
  78                 self.started = False
  79                 self.depth = {}
  80                 self.html = None
  81                 self.watch_startpos = False
  82                 self.error_count = 0
  83                 HTMLParser.HTMLParser.__init__(self)
  84
  85         def error(self, message):
  86                 print >> sys.stderr, self.getpos()
  87                 if self.error_count > 10 or self.started:
  88                         raise HTMLParser.HTMLParseError(message, self.getpos())
  89                 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
  90                 self.error_count += 1
  91                 self.goahead(1)
  92
  93         def loads(self, html):
  94                 self.html = html
  95                 self.feed(html)
  96                 self.close()
  97
  98         def handle_starttag(self, tag, attrs):
  99                 attrs = dict(attrs)
 100                 if self.started:
 101                         self.find_startpos(None)
 102                 if 'id' in attrs and attrs['id'] == self.id:
 103                         self.result = [tag]
 104                         self.started = True
 105                         self.watch_startpos = True
 106                 if self.started:
 107                         if not tag in self.depth: self.depth[tag] = 0
 108                         self.depth[tag] += 1
 109
 110         def handle_endtag(self, tag):
 111                 if self.started:
 112                         if tag in self.depth: self.depth[tag] -= 1
 113                         if self.depth[self.result[0]] == 0:
 114                                 self.started = False
 115                                 self.result.append(self.getpos())
 116
 117         def find_startpos(self, x):
 118                 """Needed to put the start position of the result (self.result[1])
 119                 after the opening tag with the requested id"""
 120                 if self.watch_startpos:
 121                         self.watch_startpos = False
 122                         self.result.append(self.getpos())
 123         handle_entityref = handle_charref = handle_data = handle_comment = \
 124         handle_decl = handle_pi = unknown_decl = find_startpos
 125
 126         def get_result(self):
 127                 if self.result == None: return None
 128                 if len(self.result) != 3: return None
 129                 lines = self.html.split('\n')
 130                 lines = lines[self.result[1][0]-1:self.result[2][0]]
 131                 lines[0] = lines[0][self.result[1][1]:]
 132                 if len(lines) == 1:
 133                         lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 134                 lines[-1] = lines[-1][:self.result[2][1]]
 135                 return '\n'.join(lines).strip()
 136
 137 def get_element_by_id(id, html):
 138         """Return the content of the tag with the specified id in the passed HTML document"""
 139         parser = IDParser(id)
 140         try:
 141                 parser.loads(html)
 142         except HTMLParser.HTMLParseError:
 143                 pass
 144         return parser.get_result()
 145
 146
 147 def clean_html(html):
 148         """Clean an HTML snippet into a readable string"""
 149         # Newline vs <br />
 150         html = html.replace('\n', ' ')
 151         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 152         # Strip html tags
 153         html = re.sub('<.*?>', '', html)
 154         # Replace html entities
 155         html = unescapeHTML(html)
 156         return html
 157
 158
 159 def sanitize_open(filename, open_mode):
 160         """Try to open the given filename, and slightly tweak it if this fails.
 161
 162         Attempts to open the given filename. If this fails, it tries to change
 163         the filename slightly, step by step, until it's either able to open it
 164         or it fails and raises a final exception, like the standard open()
 165         function.
 166
 167         It returns the tuple (stream, definitive_file_name).
 168         """
 169         try:
 170                 if filename == u'-':
 171                         if sys.platform == 'win32':
 172                                 import msvcrt
 173                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 174                         return (sys.stdout, filename)
 175                 stream = open(encodeFilename(filename), open_mode)
 176                 return (stream, filename)
 177         except (IOError, OSError), err:
 178                 # In case of error, try to remove win32 forbidden chars
 179                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 180
 181                 # An exception here should be caught in the caller
 182                 stream = open(encodeFilename(filename), open_mode)
 183                 return (stream, filename)
 184
 185
 186 def timeconvert(timestr):
 187         """Convert RFC 2822 defined time string into system timestamp"""
 188         timestamp = None
 189         timetuple = email.utils.parsedate_tz(timestr)
 190         if timetuple is not None:
 191                 timestamp = email.utils.mktime_tz(timetuple)
 192         return timestamp
 193
 194 def sanitize_filename(s):
 195         """Sanitizes a string so it could be used as part of a filename."""
 196         def replace_insane(char):
 197                 if char == '?' or ord(char) < 32 or ord(char) == 127:
 198                         return ''
 199                 elif char == '"':
 200                         return '\''
 201                 elif char == ':':
 202                         return ' -'
 203                 elif char in '\\/|*<>':
 204                         return '-'
 205                 return char
 206
 207         result = u''.join(map(replace_insane, s))
 208         while '--' in result:
 209                 result = result.replace('--', '-')
 210         return result.strip('-')
 211
 212 def orderedSet(iterable):
 213         """ Remove all duplicates from the input iterable """
 214         res = []
 215         for el in iterable:
 216                 if el not in res:
 217                         res.append(el)
 218         return res
 219
 220 def unescapeHTML(s):
 221         """
 222         @param s a string (of type unicode)
 223         """
 224         assert type(s) == type(u'')
 225
 226         result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
 227         return result
 228
 229 def encodeFilename(s):
 230         """
 231         @param s The name of the file (of type unicode)
 232         """
 233
 234         assert type(s) == type(u'')
 235
 236         if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 237                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 238                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 239                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 240                 return s
 241         else:
 242                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 243
 244 class DownloadError(Exception):
 245         """Download Error exception.
 246
 247         This exception may be thrown by FileDownloader objects if they are not
 248         configured to continue on errors. They will contain the appropriate
 249         error message.
 250         """
 251         pass
 252
 253
 254 class SameFileError(Exception):
 255         """Same File exception.
 256
 257         This exception will be thrown by FileDownloader objects if they detect
 258         multiple files would have to be downloaded to the same file on disk.
 259         """
 260         pass
 261
 262
 263 class PostProcessingError(Exception):
 264         """Post Processing exception.
 265
 266         This exception may be raised by PostProcessor's .run() method to
 267         indicate an error in the postprocessing task.
 268         """
 269         pass
 270
 271 class MaxDownloadsReached(Exception):
 272         """ --max-downloads limit has been reached. """
 273         pass
 274
 275
 276 class UnavailableVideoError(Exception):
 277         """Unavailable Format exception.
 278
 279         This exception will be thrown when a video is requested
 280         in a format that is not available for that video.
 281         """
 282         pass
 283
 284
 285 class ContentTooShortError(Exception):
 286         """Content Too Short exception.
 287
 288         This exception may be raised by FileDownloader objects when a file they
 289         download is too small for what the server announced first, indicating
 290         the connection was probably interrupted.
 291         """
 292         # Both in bytes
 293         downloaded = None
 294         expected = None
 295
 296         def __init__(self, downloaded, expected):
 297                 self.downloaded = downloaded
 298                 self.expected = expected
 299
 300
 301 class Trouble(Exception):
 302         """Trouble helper exception
 303
 304         This is an exception to be handled with
 305         FileDownloader.trouble
 306         """
 307
 308 class YoutubeDLHandler(urllib2.HTTPHandler):
 309         """Handler for HTTP requests and responses.
 310
 311         This class, when installed with an OpenerDirector, automatically adds
 312         the standard headers to every HTTP request and handles gzipped and
 313         deflated responses from web servers. If compression is to be avoided in
 314         a particular request, the original request in the program code only has
 315         to include the HTTP header "Youtubedl-No-Compression", which will be
 316         removed before making the real request.
 317
 318         Part of this code was copied from:
 319
 320         http://techknack.net/python-urllib2-handlers/
 321
 322         Andrew Rowls, the author of that code, agreed to release it to the
 323         public domain.
 324         """
 325
 326         @staticmethod
 327         def deflate(data):
 328                 try:
 329                         return zlib.decompress(data, -zlib.MAX_WBITS)
 330                 except zlib.error:
 331                         return zlib.decompress(data)
 332
 333         @staticmethod
 334         def addinfourl_wrapper(stream, headers, url, code):
 335                 if hasattr(urllib2.addinfourl, 'getcode'):
 336                         return urllib2.addinfourl(stream, headers, url, code)
 337                 ret = urllib2.addinfourl(stream, headers, url)
 338                 ret.code = code
 339                 return ret
 340
 341         def http_request(self, req):
 342                 for h in std_headers:
 343                         if h in req.headers:
 344                                 del req.headers[h]
 345                         req.add_header(h, std_headers[h])
 346                 if 'Youtubedl-no-compression' in req.headers:
 347                         if 'Accept-encoding' in req.headers:
 348                                 del req.headers['Accept-encoding']
 349                         del req.headers['Youtubedl-no-compression']
 350                 return req
 351
 352         def http_response(self, req, resp):
 353                 old_resp = resp
 354                 # gzip
 355                 if resp.headers.get('Content-encoding', '') == 'gzip':
 356                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 357                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 358                         resp.msg = old_resp.msg
 359                 # deflate
 360                 if resp.headers.get('Content-encoding', '') == 'deflate':
 361                         gz = StringIO.StringIO(self.deflate(resp.read()))
 362                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 363                         resp.msg = old_resp.msg
 364                 return resp