_ Git - youtube-dl/blob - youtube_dl/Utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import htmlentitydefs
   6 import HTMLParser
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import zlib
  12 import urllib2
  13 import email.utils
  14
  15 try:
  16         import cStringIO as StringIO
  17 except ImportError:
  18         import StringIO
  19
  20 std_headers = {
  21         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
  22         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  23         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  24         'Accept-Encoding': 'gzip, deflate',
  25         'Accept-Language': 'en-us,en;q=0.5',
  26 }
  27
  28 def preferredencoding():
  29         """Get preferred encoding.
  30
  31         Returns the best encoding scheme for the system, based on
  32         locale.getpreferredencoding() and some further tweaks.
  33         """
  34         def yield_preferredencoding():
  35                 try:
  36                         pref = locale.getpreferredencoding()
  37                         u'TEST'.encode(pref)
  38                 except:
  39                         pref = 'UTF-8'
  40                 while True:
  41                         yield pref
  42         return yield_preferredencoding().next()
  43
  44
  45 def htmlentity_transform(matchobj):
  46         """Transforms an HTML entity to a Unicode character.
  47
  48         This function receives a match object and is intended to be used with
  49         the re.sub() function.
  50         """
  51         entity = matchobj.group(1)
  52
  53         # Known non-numeric HTML entity
  54         if entity in htmlentitydefs.name2codepoint:
  55                 return unichr(htmlentitydefs.name2codepoint[entity])
  56
  57         # Unicode character
  58         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  59         if mobj is not None:
  60                 numstr = mobj.group(1)
  61                 if numstr.startswith(u'x'):
  62                         base = 16
  63                         numstr = u'0%s' % numstr
  64                 else:
  65                         base = 10
  66                 return unichr(long(numstr, base))
  67
  68         # Unknown entity in name, return its literal representation
  69         return (u'&%s;' % entity)
  70
  71
  72 def sanitize_title(utitle):
  73         """Sanitizes a video title so it could be used as part of a filename."""
  74         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
  75         return utitle.replace(unicode(os.sep), u'%')
  76
  77
  78 def sanitize_open(filename, open_mode):
  79         """Try to open the given filename, and slightly tweak it if this fails.
  80
  81         Attempts to open the given filename. If this fails, it tries to change
  82         the filename slightly, step by step, until it's either able to open it
  83         or it fails and raises a final exception, like the standard open()
  84         function.
  85
  86         It returns the tuple (stream, definitive_file_name).
  87         """
  88         try:
  89                 if filename == u'-':
  90                         if sys.platform == 'win32':
  91                                 import msvcrt
  92                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
  93                         return (sys.stdout, filename)
  94                 stream = open(encodeFilename(filename), open_mode)
  95                 return (stream, filename)
  96         except (IOError, OSError), err:
  97                 # In case of error, try to remove win32 forbidden chars
  98                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
  99
 100                 # An exception here should be caught in the caller
 101                 stream = open(encodeFilename(filename), open_mode)
 102                 return (stream, filename)
 103
 104
 105 def timeconvert(timestr):
 106         """Convert RFC 2822 defined time string into system timestamp"""
 107         timestamp = None
 108         timetuple = email.utils.parsedate_tz(timestr)
 109         if timetuple is not None:
 110                 timestamp = email.utils.mktime_tz(timetuple)
 111         return timestamp
 112
 113 def simplify_title(title):
 114         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
 115         return expr.sub(u'_', title).strip(u'_')
 116
 117 def orderedSet(iterable):
 118         """ Remove all duplicates from the input iterable """
 119         res = []
 120         for el in iterable:
 121                 if el not in res:
 122                         res.append(el)
 123         return res
 124
 125 def unescapeHTML(s):
 126         """
 127         @param s a string (of type unicode)
 128         """
 129         assert type(s) == type(u'')
 130
 131         htmlParser = HTMLParser.HTMLParser()
 132         return htmlParser.unescape(s)
 133
 134 def encodeFilename(s):
 135         """
 136         @param s The name of the file (of type unicode)
 137         """
 138
 139         assert type(s) == type(u'')
 140
 141         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
 142                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 143                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 144                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 145                 return s
 146         else:
 147                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 148
 149 class DownloadError(Exception):
 150         """Download Error exception.
 151
 152         This exception may be thrown by FileDownloader objects if they are not
 153         configured to continue on errors. They will contain the appropriate
 154         error message.
 155         """
 156         pass
 157
 158
 159 class SameFileError(Exception):
 160         """Same File exception.
 161
 162         This exception will be thrown by FileDownloader objects if they detect
 163         multiple files would have to be downloaded to the same file on disk.
 164         """
 165         pass
 166
 167
 168 class PostProcessingError(Exception):
 169         """Post Processing exception.
 170
 171         This exception may be raised by PostProcessor's .run() method to
 172         indicate an error in the postprocessing task.
 173         """
 174         pass
 175
 176 class MaxDownloadsReached(Exception):
 177         """ --max-downloads limit has been reached. """
 178         pass
 179
 180
 181 class UnavailableVideoError(Exception):
 182         """Unavailable Format exception.
 183
 184         This exception will be thrown when a video is requested
 185         in a format that is not available for that video.
 186         """
 187         pass
 188
 189
 190 class ContentTooShortError(Exception):
 191         """Content Too Short exception.
 192
 193         This exception may be raised by FileDownloader objects when a file they
 194         download is too small for what the server announced first, indicating
 195         the connection was probably interrupted.
 196         """
 197         # Both in bytes
 198         downloaded = None
 199         expected = None
 200
 201         def __init__(self, downloaded, expected):
 202                 self.downloaded = downloaded
 203                 self.expected = expected
 204
 205
 206 class YoutubeDLHandler(urllib2.HTTPHandler):
 207         """Handler for HTTP requests and responses.
 208
 209         This class, when installed with an OpenerDirector, automatically adds
 210         the standard headers to every HTTP request and handles gzipped and
 211         deflated responses from web servers. If compression is to be avoided in
 212         a particular request, the original request in the program code only has
 213         to include the HTTP header "Youtubedl-No-Compression", which will be
 214         removed before making the real request.
 215
 216         Part of this code was copied from:
 217
 218         http://techknack.net/python-urllib2-handlers/
 219
 220         Andrew Rowls, the author of that code, agreed to release it to the
 221         public domain.
 222         """
 223
 224         @staticmethod
 225         def deflate(data):
 226                 try:
 227                         return zlib.decompress(data, -zlib.MAX_WBITS)
 228                 except zlib.error:
 229                         return zlib.decompress(data)
 230
 231         @staticmethod
 232         def addinfourl_wrapper(stream, headers, url, code):
 233                 if hasattr(urllib2.addinfourl, 'getcode'):
 234                         return urllib2.addinfourl(stream, headers, url, code)
 235                 ret = urllib2.addinfourl(stream, headers, url)
 236                 ret.code = code
 237                 return ret
 238
 239         def http_request(self, req):
 240                 for h in std_headers:
 241                         if h in req.headers:
 242                                 del req.headers[h]
 243                         req.add_header(h, std_headers[h])
 244                 if 'Youtubedl-no-compression' in req.headers:
 245                         if 'Accept-encoding' in req.headers:
 246                                 del req.headers['Accept-encoding']
 247                         del req.headers['Youtubedl-no-compression']
 248                 return req
 249
 250         def http_response(self, req, resp):
 251                 old_resp = resp
 252                 # gzip
 253                 if resp.headers.get('Content-encoding', '') == 'gzip':
 254                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 255                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 256                         resp.msg = old_resp.msg
 257                 # deflate
 258                 if resp.headers.get('Content-encoding', '') == 'deflate':
 259                         gz = StringIO.StringIO(self.deflate(resp.read()))
 260                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 261                         resp.msg = old_resp.msg
 262                 return resp
 263
# Use the standard library json module when it exists. Otherwise define a
# minimal stand-in class named "json" that offers only loads().
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                @staticmethod
                def loads(s):
                        # Parse the JSON document in the byte string s (decoded
                        # as UTF-8) and return the corresponding Python value.
                        # Raises ValueError on malformed input.
                        #
                        # Every parseXxx(i) helper below starts at index i of s
                        # and returns a tuple (index after the value, value).
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                # Report msg together with the position and the unparsed rest.
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Return the index of the first non-whitespace
                                # character at or after i. With expectMore, hitting
                                # the end of the input is an error.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # re.sub() callback: group 1 is the text following
                                # a backslash inside a string literal.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        # 'uXXXX': a single four-digit hex escape.
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # 'uXXXX\uYYYY': two escapes that are
                                        # combined into one code point above
                                        # U+FFFF using UTF-16 surrogate arithmetic.
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # s[i] is the opening quote; skip it.
                                i += 1
                                e = i
                                # Find the closing quote: a '"' preceded by an even
                                # number of backslashes (an odd count means the
                                # quote itself is escaped).
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # Alternatives, in order: \u escape pair whose first
                                # half is in D800-DBFF, single \u escape, any other
                                # single character, or a backslash at the very end.
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # s[i] is '{'; skip it.
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                # One iteration per "key": value member.
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                res = []
                                # Skip the '[' and any whitespace after it.
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                # One iteration per array element.
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # Match one of the literals true, false and null.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                # Optional minus, integer part, optional fraction,
                                # optional exponent.
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fraction or exponent yields a float, anything else an int.
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; any character
                        # not listed here is handed to parseNumber.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                # Parse one value of any type, skipping whitespace on
                                # both sides. Trailing whitespace may run to the end of
                                # the input, hence expectMore=False.
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        # Nothing but the single top-level value is allowed.
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res