Split code as a package, compiled into an executable zip
[youtube-dl] / youtube_dl / Utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import gzip
5 import htmlentitydefs
6 import HTMLParser
7 import locale
8 import os
9 import re
10 import sys
11 import zlib
12 import urllib2
13 import email.utils
14
15 try:
16         import cStringIO as StringIO
17 except ImportError:
18         import StringIO
19
# Default HTTP headers. YoutubeDLHandler.http_request() puts every entry of
# this dict on each outgoing request, replacing a same-named header if the
# request already carries one.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # gzip and deflate bodies are decoded again by
        # YoutubeDLHandler.http_response().
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
27
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks: when the locale
        cannot be queried, or it names an encoding that cannot actually be
        used to encode text, 'UTF-8' is returned instead.
        """
        try:
                pref = locale.getpreferredencoding()
                # Check that the reported codec really exists and works.
                u'TEST'.encode(pref)
        except Exception:
                # Deliberately broad (a broken locale can fail in several
                # ways), but unlike a bare "except:" this does not swallow
                # KeyboardInterrupt or SystemExit.
                pref = 'UTF-8'
        return pref
43
44
def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character.

        This function receives a match object and is intended to be used with
        the re.sub() function. Group 1 of the match must hold the entity text
        without the leading '&' and the trailing ';'.

        Named entities ('amp'), decimal references ('#38') and hexadecimal
        references ('#x26' or '#X26') are decoded. Anything else, including a
        numeric reference that unichr() cannot represent, is returned
        unchanged in its literal '&...;' form.
        """
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
                return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference. The whole entity has to match (\Z) and
        # hexadecimal references may contain the digits a-f, so that '#x2F'
        # is read as 0x2F and not as a truncated '#x2'.
        mobj = re.match(u'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))\\Z', entity)
        if mobj is not None:
                hexstr, decstr = mobj.groups()
                if hexstr is not None:
                        codepoint = int(hexstr, 16)
                else:
                        codepoint = int(decstr, 10)
                try:
                        return unichr(codepoint)
                except (ValueError, OverflowError):
                        # Code point outside the supported range: keep the
                        # text as it was instead of aborting the re.sub().
                        pass

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)
70
71
def sanitize_title(utitle):
        """Make a video title usable as part of a filename.

        Every '&...;' sequence in utitle is passed through
        htmlentity_transform(), then each occurrence of the path separator
        (os.sep) is replaced by u'%'.
        """
        entity_pattern = u'(?u)&(.+?);'
        decoded = re.sub(entity_pattern, htmlentity_transform, utitle)
        separator = unicode(os.sep)
        return decoded.replace(separator, u'%')
76
77
def sanitize_open(filename, open_mode):
        """Open filename, retrying once with a tamer name if that fails.

        The special name u'-' selects standard output (switched to binary
        mode first on win32). Any other name is opened with open(); when that
        raises IOError or OSError, the characters / < > : " | ? * are each
        replaced by '#' and the open is attempted a second time. A failure of
        that second attempt is not handled here and reaches the caller, like
        with the standard open() function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename != u'-':
                        return (open(encodeFilename(filename), open_mode), filename)
                # u'-' means "write to standard output"
                if sys.platform == 'win32':
                        import msvcrt
                        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                return (sys.stdout, filename)
        except (IOError, OSError):
                # In case of error, try to remove win32 forbidden chars
                cleaned = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)

                # An exception here should be caught in the caller
                return (open(encodeFilename(cleaned), open_mode), cleaned)
103
104
def timeconvert(timestr):
        """Turn an RFC 2822 date string into a system timestamp.

        Returns None when email.utils.parsedate_tz() cannot parse timestr.
        """
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                return None
        return email.utils.mktime_tz(parsed)
112
def simplify_title(title):
        """Reduce title to word characters, digits, '_' and '-'.

        Each run of any other characters is collapsed into a single
        underscore, and underscores at both ends of the result are removed.
        """
        unsafe_run = re.compile(u'[^\\w\\d_\\-]+', flags=re.UNICODE)
        collapsed = unsafe_run.sub(u'_', title)
        return collapsed.strip(u'_')
116
def orderedSet(iterable):
        """ Return the items of iterable as a list without duplicates.

        The first occurrence of every item is kept, in the original order.
        Items are compared with the list membership test, so they do not
        have to be hashable.
        """
        unique = []
        for item in iterable:
                if item in unique:
                        continue
                unique.append(item)
        return unique
124
def unescapeHTML(s):
        """
        Unescape s with HTMLParser.HTMLParser().unescape().

        @param s a string (of type unicode)
        """
        assert type(s) == type(u'')

        return HTMLParser.HTMLParser().unescape(s)
133
def encodeFilename(s):
        """
        Convert a unicode file name into what open() should be given.

        On Windows 2000 and up the unicode string is returned unchanged; on
        every other platform it is encoded with the file system encoding
        (characters that cannot be encoded are dropped).

        @param s The name of the file (of type unicode)
        """

        assert type(s) == type(u'')

        if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
                # Pass u'' directly to use Unicode APIs on Windows 2000 and up
                # (Detecting Windows NT 4 is tricky because 'major >= 4' would
                # match Windows 9x series as well. Besides, NT 4 is obsolete.)
                return s

        encoding = sys.getfilesystemencoding()
        if encoding is None:
                # Python 2 documents None as a possible result on Unix;
                # passing it on to encode() would raise TypeError.
                encoding = 'utf-8'
        return s.encode(encoding, 'ignore')
148
class DownloadError(Exception):
        """Signals that a download failed.

        FileDownloader objects may throw this exception when they are not
        configured to continue on errors. The exception carries the
        appropriate error message.
        """
157
158
class SameFileError(Exception):
        """Signals a clash between output file names.

        FileDownloader objects throw this exception when they detect that
        multiple files would have to be downloaded to the same file on disk.
        """
166
167
class PostProcessingError(Exception):
        """Signals a failure during postprocessing.

        A PostProcessor's .run() method may raise this exception to indicate
        an error in the postprocessing task.
        """
175
class MaxDownloadsReached(Exception):
        """ Signals that the --max-downloads limit has been reached. """
179
180
class UnavailableVideoError(Exception):
        """Signals that a requested format is unavailable.

        This exception is thrown when a video is requested in a format that
        is not available for that video.
        """
188
189
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.

        The two byte counts are available as the attributes downloaded and
        expected, and are also part of the exception message.
        """
        # Both in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Initialise the base class so that str(err) and tracebacks
                # show the sizes instead of an empty message.
                Exception.__init__(
                        self,
                        'content too short: downloaded %s bytes, expected %s bytes'
                        % (downloaded, expected))
                self.downloaded = downloaded
                self.expected = expected
204
205
class YoutubeDLHandler(urllib2.HTTPHandler):
        """HTTP handler adding the standard headers and undoing compression.

        Once installed in an OpenerDirector, this class puts the headers from
        std_headers on every HTTP request and transparently decodes gzipped
        and deflated responses from web servers. To avoid compression for a
        particular request, the program code only has to add the HTTP header
        "Youtubedl-No-Compression" to it; that header is removed again before
        the real request is made.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                """Decompress deflate-encoded data.

                The data is first treated as a raw deflate stream; when zlib
                rejects that, it is decompressed as a regular zlib stream.
                """
                try:
                        inflated = zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        inflated = zlib.decompress(data)
                return inflated

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                """Create a urllib2.addinfourl whose code attribute is set.

                When addinfourl has no getcode attribute its constructor is
                called without the code, which is then assigned afterwards.
                """
                if not hasattr(urllib2.addinfourl, 'getcode'):
                        wrapped = urllib2.addinfourl(stream, headers, url)
                        wrapped.code = code
                        return wrapped
                return urllib2.addinfourl(stream, headers, url, code)

        def http_request(self, req):
                """Put the standard headers on req and return it."""
                for name, value in std_headers.items():
                        # Drop an existing entry first so the standard value wins.
                        req.headers.pop(name, None)
                        req.add_header(name, value)
                if 'Youtubedl-no-compression' in req.headers:
                        # Internal marker: ask for an uncompressed response and
                        # do not send the marker itself.
                        req.headers.pop('Accept-encoding', None)
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                """Return resp, re-wrapped with a decoded body if it was compressed."""
                encoding = resp.headers.get('Content-encoding', '')
                if encoding == 'gzip':
                        body = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                elif encoding == 'deflate':
                        body = StringIO.StringIO(self.deflate(resp.read()))
                else:
                        # Not a compressed response: hand it back untouched.
                        return resp
                decoded = self.addinfourl_wrapper(body, resp.headers, resp.url, resp.code)
                decoded.msg = resp.msg
                return decoded
263                 
# Use the standard json module when it exists; otherwise define a minimal
# stand-in class named json that only offers loads().
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                @staticmethod
                def loads(s):
                        """Parse the JSON document s and return the resulting Python object.

                        s is decoded as UTF-8 before parsing (presumably it is
                        a byte string -- confirm against the callers).
                        ValueError is raised for malformed input.

                        Every nested parse* helper takes the index i at which
                        its value starts in s and returns the tuple
                        (index just after the value, parsed value).
                        """
                        s = s.decode('UTF-8')
                        # Raise ValueError with msg, the position and the unparsed rest of the input.
                        def raiseError(msg, i):
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        # Return the index of the first character at or after i that is
                        # not a space, tab, CR or LF. With expectMore, reaching the end
                        # of the input is an error.
                        def skipSpace(i, expectMore=True):
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        # re.sub() callback: group 1 is the text that followed a
                        # backslash inside a string; return the character it denotes.
                        def decodeEscape(match):
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        # 'uXXXX': a single escaped code unit
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # 'uXXXX\uYYYY': two escapes combined into one
                                        # code point above 0xFFFF (surrogate pair arithmetic)
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        # i is the index of the opening double quote.
                        def parseString(i):
                                i += 1
                                e = i
                                # Find the closing quote: a quote preceded by an odd number
                                # of backslashes is escaped, so the search goes on after it.
                                # s.index() raises ValueError when no quote is left.
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # After a backslash, match in this order: a \uD8xx-\uDBxx
                                # escape directly followed by another \uXXXX escape (the
                                # second one is not checked any further), a single \uXXXX
                                # escape, any one character, or the end of the string.
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        # i is the index of the opening '{'; the result is a dict.
                        def parseObj(i):
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                # One '"key": value' pair per iteration, then either ',' or '}'.
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        # i is the index of the opening '['; the result is a list.
                        def parseArray(i):
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                # One value per iteration, then either ',' or ']'.
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        # Parse one of the literals true, false and null.
                        def parseDiscrete(i):
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        # Parse a number: the result is a float when the matched text
                        # contains '.', 'e' or 'E', and an int otherwise.
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Parser selection by the first character of a value; any
                        # character not listed here is handed to parseNumber.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        # Parse the value starting at or after i, skipping the whitespace
                        # around it; the input may end right after the value.
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        # Anything left after the top-level value is an error.
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res