_ Git - youtube-dl/blob - youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import locale
   6 import os
   7 import re
   8 import sys
   9 import zlib
  10 import email.utils
  11 import json
  12
  13 try:
  14         import cStringIO as StringIO
  15 except ImportError:
  16         import StringIO
  17
  18 try:
  19         import urllib.request as compat_urllib_request
  20 except ImportError: # Python 2
  21         import urllib2 as compat_urllib_request
  22
  23 try:
  24         import urllib.error as compat_urllib_error
  25 except ImportError: # Python 2
  26         import urllib2 as compat_urllib_error
  27
  28 try:
  29         import urllib.parse as compat_urllib_parse
  30 except ImportError: # Python 2
  31         import urllib as compat_urllib_parse
  32
  33 try:
  34         import http.cookiejar as compat_cookiejar
  35 except ImportError: # Python 2
  36         import cookielib as compat_cookiejar
  37
  38 try:
  39         import html.entities as compat_html_entities
  40 except NameError: # Python 2
  41         import htmlentitydefs as compat_html_entities
  42
  43 try:
  44         import html.parser as compat_html_parser
  45 except NameError: # Python 2
  46         import HTMLParser as compat_html_parser
  47
  48 try:
  49         compat_str = unicode # Python 2
  50 except NameError:
  51         compat_str = str
  52
  53 try:
  54         compat_chr = unichr # Python 2
  55 except NameError:
  56         compat_chr = chr
  57
  58
  59 std_headers = {
  60         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
  61         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  62         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  63         'Accept-Encoding': 'gzip, deflate',
  64         'Accept-Language': 'en-us,en;q=0.5',
  65 }
  66 def preferredencoding():
  67         """Get preferred encoding.
  68
  69         Returns the best encoding scheme for the system, based on
  70         locale.getpreferredencoding() and some further tweaks.
  71         """
  72         try:
  73                 pref = locale.getpreferredencoding()
  74                 u'TEST'.encode(pref)
  75         except:
  76                 pref = 'UTF-8'
  77
  78         return pref
  79
  80
  81 def htmlentity_transform(matchobj):
  82         """Transforms an HTML entity to a character.
  83
  84         This function receives a match object and is intended to be used with
  85         the re.sub() function.
  86         """
  87         entity = matchobj.group(1)
  88
  89         # Known non-numeric HTML entity
  90         if entity in compat_html_entities.name2codepoint:
  91                 return compat_chr(compat_html_entities.name2codepoint[entity])
  92
  93         mobj = re.match(u'(?u)#(x?\\d+)', entity)
  94         if mobj is not None:
  95                 numstr = mobj.group(1)
  96                 if numstr.startswith(u'x'):
  97                         base = 16
  98                         numstr = u'0%s' % numstr
  99                 else:
 100                         base = 10
 101                 return compat_chr(int(numstr, base))
 102
 103         # Unknown entity in name, return its literal representation
 104         return (u'&%s;' % entity)
 105
# Replace the module-level ``locatestarttagend`` pattern of the HTML parser
# module with this start-tag regex. The original comment marks it as a
# backported bugfix; NOTE(review): which upstream fix it mirrors is not
# visible here -- confirm before changing.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 107 class IDParser(compat_html_parser.HTMLParser):
 108         """Modified HTMLParser that isolates a tag with the specified id"""
 109         def __init__(self, id):
 110                 self.id = id
 111                 self.result = None
 112                 self.started = False
 113                 self.depth = {}
 114                 self.html = None
 115                 self.watch_startpos = False
 116                 self.error_count = 0
 117                 compat_html_parser.HTMLParser.__init__(self)
 118
 119         def error(self, message):
 120                 if self.error_count > 10 or self.started:
 121                         raise compat_html_parser.HTMLParseError(message, self.getpos())
 122                 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 123                 self.error_count += 1
 124                 self.goahead(1)
 125
 126         def loads(self, html):
 127                 self.html = html
 128                 self.feed(html)
 129                 self.close()
 130
 131         def handle_starttag(self, tag, attrs):
 132                 attrs = dict(attrs)
 133                 if self.started:
 134                         self.find_startpos(None)
 135                 if 'id' in attrs and attrs['id'] == self.id:
 136                         self.result = [tag]
 137                         self.started = True
 138                         self.watch_startpos = True
 139                 if self.started:
 140                         if not tag in self.depth: self.depth[tag] = 0
 141                         self.depth[tag] += 1
 142
 143         def handle_endtag(self, tag):
 144                 if self.started:
 145                         if tag in self.depth: self.depth[tag] -= 1
 146                         if self.depth[self.result[0]] == 0:
 147                                 self.started = False
 148                                 self.result.append(self.getpos())
 149
 150         def find_startpos(self, x):
 151                 """Needed to put the start position of the result (self.result[1])
 152                 after the opening tag with the requested id"""
 153                 if self.watch_startpos:
 154                         self.watch_startpos = False
 155                         self.result.append(self.getpos())
 156         handle_entityref = handle_charref = handle_data = handle_comment = \
 157         handle_decl = handle_pi = unknown_decl = find_startpos
 158
 159         def get_result(self):
 160                 if self.result is None:
 161                         return None
 162                 if len(self.result) != 3:
 163                         return None
 164                 lines = self.html.split('\n')
 165                 lines = lines[self.result[1][0]-1:self.result[2][0]]
 166                 lines[0] = lines[0][self.result[1][1]:]
 167                 if len(lines) == 1:
 168                         lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 169                 lines[-1] = lines[-1][:self.result[2][1]]
 170                 return '\n'.join(lines).strip()
 171
 172 def get_element_by_id(id, html):
 173         """Return the content of the tag with the specified id in the passed HTML document"""
 174         parser = IDParser(id)
 175         try:
 176                 parser.loads(html)
 177         except compat_html_parser.HTMLParseError:
 178                 pass
 179         return parser.get_result()
 180
 181
 182 def clean_html(html):
 183         """Clean an HTML snippet into a readable string"""
 184         # Newline vs <br />
 185         html = html.replace('\n', ' ')
 186         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 187         # Strip html tags
 188         html = re.sub('<.*?>', '', html)
 189         # Replace html entities
 190         html = unescapeHTML(html)
 191         return html
 192
 193
 194 def sanitize_open(filename, open_mode):
 195         """Try to open the given filename, and slightly tweak it if this fails.
 196
 197         Attempts to open the given filename. If this fails, it tries to change
 198         the filename slightly, step by step, until it's either able to open it
 199         or it fails and raises a final exception, like the standard open()
 200         function.
 201
 202         It returns the tuple (stream, definitive_file_name).
 203         """
 204         try:
 205                 if filename == u'-':
 206                         if sys.platform == 'win32':
 207                                 import msvcrt
 208                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 209                         return (sys.stdout, filename)
 210                 stream = open(encodeFilename(filename), open_mode)
 211                 return (stream, filename)
 212         except (IOError, OSError) as err:
 213                 # In case of error, try to remove win32 forbidden chars
 214                 filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
 215
 216                 # An exception here should be caught in the caller
 217                 stream = open(encodeFilename(filename), open_mode)
 218                 return (stream, filename)
 219
 220
 221 def timeconvert(timestr):
 222         """Convert RFC 2822 defined time string into system timestamp"""
 223         timestamp = None
 224         timetuple = email.utils.parsedate_tz(timestr)
 225         if timetuple is not None:
 226                 timestamp = email.utils.mktime_tz(timetuple)
 227         return timestamp
 228
 229 def sanitize_filename(s, restricted=False):
 230         """Sanitizes a string so it could be used as part of a filename.
 231         If restricted is set, use a stricter subset of allowed characters.
 232         """
 233         def replace_insane(char):
 234                 if char == '?' or ord(char) < 32 or ord(char) == 127:
 235                         return ''
 236                 elif char == '"':
 237                         return '' if restricted else '\''
 238                 elif char == ':':
 239                         return '_-' if restricted else ' -'
 240                 elif char in '\\/|*<>':
 241                         return '_'
 242                 if restricted and (char in '!&\'' or char.isspace()):
 243                         return '_'
 244                 if restricted and ord(char) > 127:
 245                         return '_'
 246                 return char
 247
 248         result = u''.join(map(replace_insane, s))
 249         while '__' in result:
 250                 result = result.replace('__', '_')
 251         result = result.strip('_')
 252         # Common case of "Foreign band name - English song title"
 253         if restricted and result.startswith('-_'):
 254                 result = result[2:]
 255         if not result:
 256                 result = '_'
 257         return result
 258
 259 def orderedSet(iterable):
 260         """ Remove all duplicates from the input iterable """
 261         res = []
 262         for el in iterable:
 263                 if el not in res:
 264                         res.append(el)
 265         return res
 266
 267 def unescapeHTML(s):
 268         """
 269         @param s a string
 270         """
 271         assert type(s) == type(u'')
 272
 273         result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 274         return result
 275
 276 def encodeFilename(s):
 277         """
 278         @param s The name of the file
 279         """
 280
 281         assert type(s) == type(u'')
 282
 283         if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 284                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 285                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 286                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 287                 return s
 288         else:
 289                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 290
 291 class DownloadError(Exception):
 292         """Download Error exception.
 293
 294         This exception may be thrown by FileDownloader objects if they are not
 295         configured to continue on errors. They will contain the appropriate
 296         error message.
 297         """
 298         pass
 299
 300
 301 class SameFileError(Exception):
 302         """Same File exception.
 303
 304         This exception will be thrown by FileDownloader objects if they detect
 305         multiple files would have to be downloaded to the same file on disk.
 306         """
 307         pass
 308
 309
 310 class PostProcessingError(Exception):
 311         """Post Processing exception.
 312
 313         This exception may be raised by PostProcessor's .run() method to
 314         indicate an error in the postprocessing task.
 315         """
 316         pass
 317
 318 class MaxDownloadsReached(Exception):
 319         """ --max-downloads limit has been reached. """
 320         pass
 321
 322
 323 class UnavailableVideoError(Exception):
 324         """Unavailable Format exception.
 325
 326         This exception will be thrown when a video is requested
 327         in a format that is not available for that video.
 328         """
 329         pass
 330
 331
 332 class ContentTooShortError(Exception):
 333         """Content Too Short exception.
 334
 335         This exception may be raised by FileDownloader objects when a file they
 336         download is too small for what the server announced first, indicating
 337         the connection was probably interrupted.
 338         """
 339         # Both in bytes
 340         downloaded = None
 341         expected = None
 342
 343         def __init__(self, downloaded, expected):
 344                 self.downloaded = downloaded
 345                 self.expected = expected
 346
 347
 348 class Trouble(Exception):
 349         """Trouble helper exception
 350
 351         This is an exception to be handled with
 352         FileDownloader.trouble
 353         """
 354
 355 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 356         """Handler for HTTP requests and responses.
 357
 358         This class, when installed with an OpenerDirector, automatically adds
 359         the standard headers to every HTTP request and handles gzipped and
 360         deflated responses from web servers. If compression is to be avoided in
 361         a particular request, the original request in the program code only has
 362         to include the HTTP header "Youtubedl-No-Compression", which will be
 363         removed before making the real request.
 364
 365         Part of this code was copied from:
 366
 367         http://techknack.net/python-urllib2-handlers/
 368
 369         Andrew Rowls, the author of that code, agreed to release it to the
 370         public domain.
 371         """
 372
 373         @staticmethod
 374         def deflate(data):
 375                 try:
 376                         return zlib.decompress(data, -zlib.MAX_WBITS)
 377                 except zlib.error:
 378                         return zlib.decompress(data)
 379
 380         @staticmethod
 381         def addinfourl_wrapper(stream, headers, url, code):
 382                 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 383                         return compat_urllib_request.addinfourl(stream, headers, url, code)
 384                 ret = compat_urllib_request.addinfourl(stream, headers, url)
 385                 ret.code = code
 386                 return ret
 387
 388         def http_request(self, req):
 389                 for h in std_headers:
 390                         if h in req.headers:
 391                                 del req.headers[h]
 392                         req.add_header(h, std_headers[h])
 393                 if 'Youtubedl-no-compression' in req.headers:
 394                         if 'Accept-encoding' in req.headers:
 395                                 del req.headers['Accept-encoding']
 396                         del req.headers['Youtubedl-no-compression']
 397                 return req
 398
 399         def http_response(self, req, resp):
 400                 old_resp = resp
 401                 # gzip
 402                 if resp.headers.get('Content-encoding', '') == 'gzip':
 403                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 404                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 405                         resp.msg = old_resp.msg
 406                 # deflate
 407                 if resp.headers.get('Content-encoding', '') == 'deflate':
 408                         gz = StringIO.StringIO(self.deflate(resp.read()))
 409                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 410                         resp.msg = old_resp.msg
 411                 return resp