Add some parentheses around print for #180
[youtube-dl] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import gzip
5 import htmlentitydefs
6 import HTMLParser
7 import locale
8 import os
9 import re
10 import sys
11 import zlib
12 import urllib2
13 import email.utils
14 import json
15
16 try:
17         import cStringIO as StringIO
18 except ImportError:
19         import StringIO
20
# Default HTTP headers. YoutubeDLHandler.http_request() forces each of these
# onto every request it processes (an existing entry under the same key is
# dropped first). 'Accept-Encoding' advertises gzip/deflate, which
# YoutubeDLHandler.http_response() knows how to decode.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
28
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks: when the locale
        lookup fails, or the reported encoding cannot actually be used to
        encode text, 'UTF-8' is returned instead.
        """
        try:
                pref = locale.getpreferredencoding()
                # Probe the codec: the reported name may be unknown or unusable.
                u'TEST'.encode(pref)
        except Exception:
                # Deliberate best-effort fallback. Catching Exception (rather
                # than using a bare "except:") lets KeyboardInterrupt and
                # SystemExit propagate.
                pref = 'UTF-8'
        return pref
44
45
def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character.

        This function receives a match object and is intended to be used with
        the re.sub() function. Group 1 of the match must hold the entity text
        without the surrounding '&' and ';'.

        Named entities are looked up in htmlentitydefs.name2codepoint. Numeric
        character references may be decimal ('#233') or hexadecimal ('#xE9',
        '#XE9'). Anything that cannot be converted is returned unchanged in its
        literal '&...;' form.
        """
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
                return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference. Hex digits must be accepted after the
        # 'x' (a plain \d+ would stop at the first a-f digit and decode the
        # wrong code point), and the pattern is anchored so that trailing
        # garbage is not silently dropped.
        mobj = re.match(u'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
        if mobj is not None:
                hexstr, decstr = mobj.groups()
                try:
                        if hexstr is not None:
                                return unichr(int(hexstr, 16))
                        return unichr(int(decstr, 10))
                except (ValueError, OverflowError):
                        # Code point outside the range unichr() supports:
                        # fall through and keep the literal text.
                        pass

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)
71
# Backport bugfix: replace the start-tag locator regex of the HTMLParser
# module. The assignment is module-wide, so it is in place before any
# IDParser instance is created further down in this file.
HTMLParser.locatestarttagend = re.compile(
        r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""",
        re.VERBOSE)
class IDParser(HTMLParser.HTMLParser):
        """Modified HTMLParser that isolates a tag with the specified id.

        Feed a document with loads() and read the outcome with get_result().
        While parsing, self.result grows to [tag, start_pos, end_pos], where
        both positions are getpos() values (line, column) delimiting the
        content of an element whose id attribute equals self.id.
        """
        def __init__(self, id):
                self.id = id
                self.result = None          # [tag, start_pos, end_pos] once complete
                self.started = False        # True while inside the wanted element
                self.depth = {}             # open-tag counters, per tag name
                self.html = None            # full document, kept for slicing
                self.watch_startpos = False # True until the start position is stored
                self.error_count = 0
                HTMLParser.HTMLParser.__init__(self)

        def error(self, message):
                """Parse-error hook: skip the offending line and resume parsing.

                Gives up (raises HTMLParseError) after more than 10 errors, or
                as soon as an error occurs inside the wanted element.
                """
                if self.error_count > 10 or self.started:
                        raise HTMLParser.HTMLParseError(message, self.getpos())
                self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
                self.error_count += 1
                self.goahead(1)

        def loads(self, html):
                """Parse the complete document html in one go."""
                self.html = html
                self.feed(html)
                self.close()

        def handle_starttag(self, tag, attrs):
                attrs = dict(attrs)
                if self.started:
                        self.find_startpos(None)
                if 'id' in attrs and attrs['id'] == self.id:
                        self.result = [tag]
                        self.started = True
                        self.watch_startpos = True
                if self.started:
                        if not tag in self.depth: self.depth[tag] = 0
                        self.depth[tag] += 1

        def handle_endtag(self, tag):
                if self.started:
                        # An empty element (<div id="x"></div>) reaches its end
                        # tag before any other event has stored the start
                        # position. Store it now; otherwise a later, unrelated
                        # event would append a bogus position to self.result.
                        self.find_startpos(None)
                        if tag in self.depth: self.depth[tag] -= 1
                        if self.depth[self.result[0]] == 0:
                                self.started = False
                                self.result.append(self.getpos())

        def find_startpos(self, x):
                """Needed to put the start position of the result (self.result[1])
                after the opening tag with the requested id"""
                if self.watch_startpos:
                        self.watch_startpos = False
                        self.result.append(self.getpos())
        # Every other parser event may be the first one after the opening tag,
        # so all of them record the start position.
        handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

        def get_result(self):
                """Return the stripped content of the matched element.

                Returns None when no element matched or when its end was never
                reached.
                """
                if self.result is None: return None
                if len(self.result) != 3: return None
                start, end = self.result[1], self.result[2]
                # getpos() line numbers are 1-based, columns are 0-based.
                lines = self.html.split('\n')[start[0]-1:end[0]]
                lines[0] = lines[0][start[1]:]
                if len(lines) == 1:
                        # Start and end share one line, whose head was just cut
                        # off: shift the end column accordingly.
                        lines[-1] = lines[-1][:end[1]-start[1]]
                else:
                        lines[-1] = lines[-1][:end[1]]
                return '\n'.join(lines).strip()
135
def get_element_by_id(id, html):
        """Return the content of the tag with the specified id in the passed HTML document"""
        id_parser = IDParser(id)
        try:
                id_parser.loads(html)
        except HTMLParser.HTMLParseError:
                # Best effort: report whatever was captured before the error.
                pass
        return id_parser.get_result()
144
145
def clean_html(html):
        """Clean an HTML snippet into a readable string.

        Existing newlines become spaces, every <br> tag (in any letter case,
        with its surrounding whitespace) becomes a newline, all remaining tags
        are removed and HTML entities are decoded via unescapeHTML(), which
        requires a unicode string.
        """
        # Newline vs <br />
        html = html.replace('\n', ' ')
        # Tag names are case-insensitive in HTML, hence (?i) so that <BR>
        # yields a newline as well instead of just being stripped below.
        html = re.sub(r'(?i)\s*<\s*br\s*/?\s*>\s*', '\n', html)
        # Strip html tags
        html = re.sub(r'<.*?>', '', html)
        # Replace html entities
        html = unescapeHTML(html)
        return html
156
157
def sanitize_open(filename, open_mode):
        """Open filename, retrying once with a tweaked name on failure.

        The name u'-' selects sys.stdout (put into binary mode on win32).
        Otherwise the file is opened with open_mode. If that raises IOError or
        OSError, the characters / < > : " | ? * are replaced with '#' and the
        open is attempted again; an error from that second attempt propagates
        to the caller, like the standard open() function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename != u'-':
                        return (open(encodeFilename(filename), open_mode), filename)
                if sys.platform == 'win32':
                        import msvcrt
                        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                return (sys.stdout, filename)
        except (IOError, OSError):
                # In case of error, try to remove win32 forbidden chars
                tweaked = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)

                # An exception here should be caught in the caller
                return (open(encodeFilename(tweaked), open_mode), tweaked)
183
184
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp.

        Returns None when email.utils.parsedate_tz() cannot parse timestr.
        """
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                return None
        return email.utils.mktime_tz(parsed)
192         
def sanitize_filename(s):
        """Sanitizes a string so it could be used as part of a filename.

        '?' and control characters (code below 32, or 127) are dropped, a
        double quote becomes a single quote, ':' becomes ' -' and each of
        \\ / | * < > becomes '-'. Runs of dashes are then collapsed into one
        and dashes at either end are stripped.
        """
        quote_and_colon = {'"': '\'', ':': ' -'}

        def replace_insane(char):
                if char == '?' or ord(char) < 32 or ord(char) == 127:
                        return ''
                if char in '\\/|*<>':
                        return '-'
                return quote_and_colon.get(char, char)

        cleaned = u''.join([replace_insane(char) for char in s])
        while True:
                if '--' not in cleaned:
                        break
                cleaned = cleaned.replace('--', '-')
        return cleaned.strip('-')
210
def orderedSet(iterable):
        """ Remove all duplicates from the input iterable

        Returns a new list holding the first occurrence of every element, in
        the original order. Elements are compared with ==, so they do not need
        to be hashable.
        """
        unique = []
        for item in iterable:
                if item in unique:
                        continue
                unique.append(item)
        return unique
218
def unescapeHTML(s):
        """
        Replace every '&...;' sequence in s using htmlentity_transform().

        @param s a string (of type unicode)
        """
        assert type(s) == type(u'')

        return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
227
def encodeFilename(s):
        """
        Prepare a unicode file name for use with the file system APIs.

        On win32 (Windows 2000 and up) s is returned unchanged; elsewhere it
        is encoded with the file system encoding, silently dropping characters
        that cannot be represented.

        @param s The name of the file (of type unicode)
        """

        assert type(s) == type(u'')

        if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
                # Pass u'' directly to use Unicode APIs on Windows 2000 and up
                # (Detecting Windows NT 4 is tricky because 'major >= 4' would
                # match Windows 9x series as well. Besides, NT 4 is obsolete.)
                return s

        encoding = sys.getfilesystemencoding()
        if encoding is None:
                # Python 2 may report None here ("use the system default");
                # str.encode() rejects None, so pick an explicit encoding.
                encoding = 'utf-8'
        return s.encode(encoding, 'ignore')
242
class DownloadError(Exception):
        """Raised when a download fails.

        FileDownloader objects may throw this exception if they are not
        configured to continue on errors; it then carries the appropriate
        error message.
        """
251
252
class SameFileError(Exception):
        """Raised when several downloads would share one output file.

        FileDownloader objects throw this exception if they detect that
        multiple files would have to be downloaded to the same file on disk.
        """
260
261
class PostProcessingError(Exception):
        """Raised when a postprocessing task fails.

        A PostProcessor's .run() method may raise this exception to indicate
        an error in the postprocessing task.
        """
269
class MaxDownloadsReached(Exception):
        """Raised once the --max-downloads limit has been reached."""
273
274
class UnavailableVideoError(Exception):
        """Raised when a requested format is unavailable.

        Thrown when a video is requested in a format that is not available
        for that video.
        """
282
283
class ContentTooShortError(Exception):
        """Raised when a download ends up shorter than announced.

        FileDownloader objects may raise this exception when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.
        """
        # Byte counts: what was actually received vs. what was announced.
        downloaded = expected = None

        def __init__(self, downloaded, expected):
                self.downloaded, self.expected = downloaded, expected
298
299
class Trouble(Exception):
        """Trouble helper exception.

        Meant to be handled with FileDownloader.trouble.
        """
306
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        Once installed with an OpenerDirector, this handler puts the standard
        headers (std_headers) on every HTTP request and transparently decodes
        gzipped and deflated responses. A request that must not be compressed
        only has to carry the HTTP header "Youtubedl-No-Compression" in the
        program code; that header is removed before the real request is made.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                """Inflate data, trying a raw deflate stream first and a
                zlib-wrapped stream second."""
                try:
                        inflated = zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        inflated = zlib.decompress(data)
                return inflated

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                """Build a urllib2.addinfourl carrying the status code."""
                if not hasattr(urllib2.addinfourl, 'getcode'):
                        # No getcode(): presumably an older urllib2 whose
                        # addinfourl takes no code argument, so set it by hand.
                        response = urllib2.addinfourl(stream, headers, url)
                        response.code = code
                        return response
                return urllib2.addinfourl(stream, headers, url, code)

        def http_request(self, req):
                """Apply std_headers to req and honour the no-compression marker."""
                for name in std_headers:
                        req.headers.pop(name, None)
                        req.add_header(name, std_headers[name])
                if 'Youtubedl-no-compression' in req.headers:
                        req.headers.pop('Accept-encoding', None)
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                """Return resp, re-wrapped around the decoded body when its
                Content-encoding is gzip or deflate."""
                encoding = resp.headers.get('Content-encoding', '')
                if encoding == 'gzip':
                        body = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                elif encoding == 'deflate':
                        body = StringIO.StringIO(self.deflate(resp.read()))
                else:
                        return resp
                decoded = self.addinfourl_wrapper(body, resp.headers, resp.url, resp.code)
                decoded.msg = resp.msg
                return decoded