_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import htmlentitydefs
   6 import HTMLParser
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import zlib
  12 import urllib2
  13 import email.utils
  14 import json
  15
  16 try:
  17         import cStringIO as StringIO
  18 except ImportError:
  19         import StringIO
  20
# Default HTTP headers. YoutubeDLHandler.http_request() adds every entry of
# this dict to each outgoing request, replacing any header of the same name
# that the request already carries.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # Compressed replies are decoded in YoutubeDLHandler.http_response().
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  28
  29 def preferredencoding():
  30         """Get preferred encoding.
  31
  32         Returns the best encoding scheme for the system, based on
  33         locale.getpreferredencoding() and some further tweaks.
  34         """
  35         def yield_preferredencoding():
  36                 try:
  37                         pref = locale.getpreferredencoding()
  38                         u'TEST'.encode(pref)
  39                 except:
  40                         pref = 'UTF-8'
  41                 while True:
  42                         yield pref
  43         return yield_preferredencoding().next()
  44
  45
  46 def htmlentity_transform(matchobj):
  47         """Transforms an HTML entity to a Unicode character.
  48
  49         This function receives a match object and is intended to be used with
  50         the re.sub() function.
  51         """
  52         entity = matchobj.group(1)
  53
  54         # Known non-numeric HTML entity
  55         if entity in htmlentitydefs.name2codepoint:
  56                 return unichr(htmlentitydefs.name2codepoint[entity])
  57
  58         # Unicode character
  59         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  60         if mobj is not None:
  61                 numstr = mobj.group(1)
  62                 if numstr.startswith(u'x'):
  63                         base = 16
  64                         numstr = u'0%s' % numstr
  65                 else:
  66                         base = 10
  67                 return unichr(long(numstr, base))
  68
  69         # Unknown entity in name, return its literal representation
  70         return (u'&%s;' % entity)
  71
# Monkey-patch: overwrite the module-level `locatestarttagend` pattern of the
# HTMLParser module with the regex below (per the trailing note, a backport
# of a bugfix).  NOTE(review): presumably this is the pattern HTMLParser uses
# to find the end of a start tag -- confirm against the stdlib version in use.
# The assignment runs at import time and affects every HTMLParser user in the
# process, not only IDParser.
HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
  73 class IDParser(HTMLParser.HTMLParser):
  74         """Modified HTMLParser that isolates a tag with the specified id"""
  75         def __init__(self, id):
  76                 self.id = id
  77                 self.result = None
  78                 self.started = False
  79                 self.depth = {}
  80                 self.html = None
  81                 self.watch_startpos = False
  82                 self.error_count = 0
  83                 HTMLParser.HTMLParser.__init__(self)
  84
  85         def error(self, message):
  86                 print self.getpos()
  87                 if self.error_count > 10 or self.started:
  88                         raise HTMLParser.HTMLParseError(message, self.getpos())
  89                 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
  90                 self.error_count += 1
  91                 self.goahead(1)
  92
  93         def loads(self, html):
  94                 self.html = html
  95                 self.feed(html)
  96                 self.close()
  97
  98         def handle_starttag(self, tag, attrs):
  99                 attrs = dict(attrs)
 100                 if self.started:
 101                         self.find_startpos(None)
 102                 if 'id' in attrs and attrs['id'] == self.id:
 103                         self.result = [tag]
 104                         self.started = True
 105                         self.watch_startpos = True
 106                 if self.started:
 107                         if not tag in self.depth: self.depth[tag] = 0
 108                         self.depth[tag] += 1
 109
 110         def handle_endtag(self, tag):
 111                 if self.started:
 112                         if tag in self.depth: self.depth[tag] -= 1
 113                         if self.depth[self.result[0]] == 0:
 114                                 self.started = False
 115                                 self.result.append(self.getpos())
 116
 117         def find_startpos(self, x):
 118                 """Needed to put the start position of the result (self.result[1])
 119                 after the opening tag with the requested id"""
 120                 if self.watch_startpos:
 121                         self.watch_startpos = False
 122                         self.result.append(self.getpos())
 123         handle_entityref = handle_charref = handle_data = handle_comment = \
 124         handle_decl = handle_pi = unknown_decl = find_startpos
 125
 126         def get_result(self):
 127                 if self.result == None: return None
 128                 if len(self.result) != 3: return None
 129                 lines = self.html.split('\n')
 130                 lines = lines[self.result[1][0]-1:self.result[2][0]]
 131                 lines[0] = lines[0][self.result[1][1]:]
 132                 if len(lines) == 1:
 133                         lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 134                 lines[-1] = lines[-1][:self.result[2][1]]
 135                 return '\n'.join(lines).strip()
 136
 137 def get_element_by_id(id, html):
 138         """Return the content of the tag with the specified id in the passed HTML document"""
 139         parser = IDParser(id)
 140         try:
 141                 parser.loads(html)
 142         except HTMLParser.HTMLParseError:
 143                 pass
 144         return parser.get_result()
 145
 146
 147 def clean_html(html):
 148         """Clean an HTML snippet into a readable string"""
 149         # Newline vs <br />
 150         html = html.replace('\n', ' ')
 151         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 152         # Strip html tags
 153         html = re.sub('<.*?>', '', html)
 154         # Replace html entities
 155         html = unescapeHTML(html)
 156         return html
 157
 158
 159 def sanitize_title(utitle):
 160         """Sanitizes a video title so it could be used as part of a filename."""
 161         utitle = unescapeHTML(utitle)
 162         return utitle.replace(unicode(os.sep), u'%')
 163
 164
 165 def sanitize_open(filename, open_mode):
 166         """Try to open the given filename, and slightly tweak it if this fails.
 167
 168         Attempts to open the given filename. If this fails, it tries to change
 169         the filename slightly, step by step, until it's either able to open it
 170         or it fails and raises a final exception, like the standard open()
 171         function.
 172
 173         It returns the tuple (stream, definitive_file_name).
 174         """
 175         try:
 176                 if filename == u'-':
 177                         if sys.platform == 'win32':
 178                                 import msvcrt
 179                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 180                         return (sys.stdout, filename)
 181                 stream = open(encodeFilename(filename), open_mode)
 182                 return (stream, filename)
 183         except (IOError, OSError), err:
 184                 # In case of error, try to remove win32 forbidden chars
 185                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 186
 187                 # An exception here should be caught in the caller
 188                 stream = open(encodeFilename(filename), open_mode)
 189                 return (stream, filename)
 190
 191
 192 def timeconvert(timestr):
 193         """Convert RFC 2822 defined time string into system timestamp"""
 194         timestamp = None
 195         timetuple = email.utils.parsedate_tz(timestr)
 196         if timetuple is not None:
 197                 timestamp = email.utils.mktime_tz(timetuple)
 198         return timestamp
 199
 200 def simplify_title(title):
 201         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
 202         return expr.sub(u'_', title).strip(u'_')
 203
 204 def orderedSet(iterable):
 205         """ Remove all duplicates from the input iterable """
 206         res = []
 207         for el in iterable:
 208                 if el not in res:
 209                         res.append(el)
 210         return res
 211
 212 def unescapeHTML(s):
 213         """
 214         @param s a string (of type unicode)
 215         """
 216         assert type(s) == type(u'')
 217
 218         result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
 219         return result
 220
 221 def encodeFilename(s):
 222         """
 223         @param s The name of the file (of type unicode)
 224         """
 225
 226         assert type(s) == type(u'')
 227
 228         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
 229                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 230                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 231                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 232                 return s
 233         else:
 234                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 235
 236 class DownloadError(Exception):
 237         """Download Error exception.
 238
 239         This exception may be thrown by FileDownloader objects if they are not
 240         configured to continue on errors. They will contain the appropriate
 241         error message.
 242         """
 243         pass
 244
 245
 246 class SameFileError(Exception):
 247         """Same File exception.
 248
 249         This exception will be thrown by FileDownloader objects if they detect
 250         multiple files would have to be downloaded to the same file on disk.
 251         """
 252         pass
 253
 254
 255 class PostProcessingError(Exception):
 256         """Post Processing exception.
 257
 258         This exception may be raised by PostProcessor's .run() method to
 259         indicate an error in the postprocessing task.
 260         """
 261         pass
 262
 263 class MaxDownloadsReached(Exception):
 264         """ --max-downloads limit has been reached. """
 265         pass
 266
 267
 268 class UnavailableVideoError(Exception):
 269         """Unavailable Format exception.
 270
 271         This exception will be thrown when a video is requested
 272         in a format that is not available for that video.
 273         """
 274         pass
 275
 276
 277 class ContentTooShortError(Exception):
 278         """Content Too Short exception.
 279
 280         This exception may be raised by FileDownloader objects when a file they
 281         download is too small for what the server announced first, indicating
 282         the connection was probably interrupted.
 283         """
 284         # Both in bytes
 285         downloaded = None
 286         expected = None
 287
 288         def __init__(self, downloaded, expected):
 289                 self.downloaded = downloaded
 290                 self.expected = expected
 291
 292
 293 class Trouble(Exception):
 294         """Trouble helper exception
 295
 296         This is an exception to be handled with
 297         FileDownloader.trouble
 298         """
 299
 300 class YoutubeDLHandler(urllib2.HTTPHandler):
 301         """Handler for HTTP requests and responses.
 302
 303         This class, when installed with an OpenerDirector, automatically adds
 304         the standard headers to every HTTP request and handles gzipped and
 305         deflated responses from web servers. If compression is to be avoided in
 306         a particular request, the original request in the program code only has
 307         to include the HTTP header "Youtubedl-No-Compression", which will be
 308         removed before making the real request.
 309
 310         Part of this code was copied from:
 311
 312         http://techknack.net/python-urllib2-handlers/
 313
 314         Andrew Rowls, the author of that code, agreed to release it to the
 315         public domain.
 316         """
 317
 318         @staticmethod
 319         def deflate(data):
 320                 try:
 321                         return zlib.decompress(data, -zlib.MAX_WBITS)
 322                 except zlib.error:
 323                         return zlib.decompress(data)
 324
 325         @staticmethod
 326         def addinfourl_wrapper(stream, headers, url, code):
 327                 if hasattr(urllib2.addinfourl, 'getcode'):
 328                         return urllib2.addinfourl(stream, headers, url, code)
 329                 ret = urllib2.addinfourl(stream, headers, url)
 330                 ret.code = code
 331                 return ret
 332
 333         def http_request(self, req):
 334                 for h in std_headers:
 335                         if h in req.headers:
 336                                 del req.headers[h]
 337                         req.add_header(h, std_headers[h])
 338                 if 'Youtubedl-no-compression' in req.headers:
 339                         if 'Accept-encoding' in req.headers:
 340                                 del req.headers['Accept-encoding']
 341                         del req.headers['Youtubedl-no-compression']
 342                 return req
 343
 344         def http_response(self, req, resp):
 345                 old_resp = resp
 346                 # gzip
 347                 if resp.headers.get('Content-encoding', '') == 'gzip':
 348                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 349                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 350                         resp.msg = old_resp.msg
 351                 # deflate
 352                 if resp.headers.get('Content-encoding', '') == 'deflate':
 353                         gz = StringIO.StringIO(self.deflate(resp.read()))
 354                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 355                         resp.msg = old_resp.msg
 356                 return resp