Merge xnxx.com Support (NSFW). Test URL (SFW): http://video.xnxx.com/video1443330...
[youtube-dl] / youtube_dl / utils.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import gzip
import htmlentitydefs
import HTMLParser
import locale
import os
import re
import sys
import zlib
import urllib2
import email.utils
import json

try:
	import cStringIO as StringIO
except ImportError:
	import StringIO

std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	def yield_preferredencoding():
		try:
			pref = locale.getpreferredencoding()
			u'TEST'.encode(pref)
		except:
			pref = 'UTF-8'
		while True:
			yield pref
	return yield_preferredencoding().next()


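# Illustrative sketch (not part of the original module): preferredencoding()
# is typically used to decode byte strings coming from the command line or
# the environment into unicode.
def _example_preferredencoding():
	return 'youtube-dl'.decode(preferredencoding())

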
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference, decimal (&#169;) or hexadecimal (&#xA9;)
	mobj = re.match(ur'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			numstr = u'0%s' % numstr
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)

HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(HTMLParser.HTMLParser):
	"""Modified HTMLParser that isolates a tag with the specified id"""
	def __init__(self, id):
		self.id = id
		self.result = None
		self.started = False
		self.depth = {}
		self.html = None
		self.watch_startpos = False
		self.error_count = 0
		HTMLParser.HTMLParser.__init__(self)

	def error(self, message):
		# On a parse error outside the target element, skip the offending
		# line and resume; give up after too many errors or if the error
		# occurs inside the element being extracted.
		print >> sys.stderr, self.getpos()
		if self.error_count > 10 or self.started:
			raise HTMLParser.HTMLParseError(message, self.getpos())
		self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
		self.error_count += 1
		self.goahead(1)

	def loads(self, html):
		self.html = html
		self.feed(html)
		self.close()

	def handle_starttag(self, tag, attrs):
		attrs = dict(attrs)
		if self.started:
			self.find_startpos(None)
		if 'id' in attrs and attrs['id'] == self.id:
			self.result = [tag]
			self.started = True
			self.watch_startpos = True
		if self.started:
			if not tag in self.depth: self.depth[tag] = 0
			self.depth[tag] += 1

	def handle_endtag(self, tag):
		if self.started:
			if tag in self.depth: self.depth[tag] -= 1
			if self.depth[self.result[0]] == 0:
				self.started = False
				self.result.append(self.getpos())

	def find_startpos(self, x):
		"""Needed to put the start position of the result (self.result[1])
		after the opening tag with the requested id"""
		if self.watch_startpos:
			self.watch_startpos = False
			self.result.append(self.getpos())
	handle_entityref = handle_charref = handle_data = handle_comment = \
	handle_decl = handle_pi = unknown_decl = find_startpos

	def get_result(self):
		if self.result is None: return None
		if len(self.result) != 3: return None
		lines = self.html.split('\n')
		lines = lines[self.result[1][0]-1:self.result[2][0]]
		lines[0] = lines[0][self.result[1][1]:]
		if len(lines) == 1:
			lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
		lines[-1] = lines[-1][:self.result[2][1]]
		return '\n'.join(lines).strip()

def get_element_by_id(id, html):
	"""Return the content of the tag with the specified id in the passed HTML document"""
	parser = IDParser(id)
	try:
		parser.loads(html)
	except HTMLParser.HTMLParseError:
		pass
	return parser.get_result()


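# Illustrative sketch (not part of the original module): pull the inner HTML
# of the element with a given id out of a page. The markup below is made up.
def _example_get_element_by_id():
	html = u'<html><body><div id="info"><b>Duration:</b> 3:07</div></body></html>'
	return get_element_by_id('info', html)  # -> u'<b>Duration:</b> 3:07'

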
def clean_html(html):
	"""Clean an HTML snippet into a readable string"""
	# Newline vs <br />
	html = html.replace('\n', ' ')
	html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
	# Strip html tags
	html = re.sub('<.*?>', '', html)
	# Replace html entities
	html = unescapeHTML(html)
	return html


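# Illustrative sketch (not part of the original module): turning a description
# snippet into plain text, with <br /> mapped to newlines and entities decoded.
def _example_clean_html():
	return clean_html(u'First line<br />Second &amp; last line')
	# -> u'First line\nSecond & last line'

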
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(encodeFilename(filename), open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(encodeFilename(filename), open_mode)
		return (stream, filename)


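# Illustrative sketch (not part of the original module): if the first open()
# fails, characters that some file systems reject are replaced with '#' and
# the name actually used is returned to the caller. The file name is made up.
def _example_sanitize_open():
	stream, used_name = sanitize_open(u'NASA: "live" feed.flv', 'wb')
	stream.close()
	return used_name

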
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp

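# Illustrative sketch (not part of the original module): converting a
# Last-Modified style date into a Unix timestamp.
def _example_timeconvert():
	return timeconvert('Wed, 01 Jun 2011 00:00:00 +0000')  # -> 1306886400
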
def sanitize_filename(s):
	"""Sanitizes a string so it could be used as part of a filename."""
	def replace_insane(char):
		if char in u' .\\/|?*<>:"' or ord(char) < 32:
			return '_'
		return char
	return u''.join(map(replace_insane, s)).strip('_')

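# Illustrative sketch (not part of the original module): characters that are
# unsafe in file names are collapsed to underscores.
def _example_sanitize_filename():
	return sanitize_filename(u'AC/DC: Back In Black?')  # -> u'AC_DC__Back_In_Black'
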
def orderedSet(iterable):
	""" Remove all duplicates from the input iterable """
	res = []
	for el in iterable:
		if el not in res:
			res.append(el)
	return res

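# Illustrative sketch (not part of the original module): duplicates are
# dropped while the original order of the remaining items is preserved.
def _example_orderedSet():
	return orderedSet([u'22', u'18', u'22', u'34', u'18'])  # -> [u'22', u'18', u'34']
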
def unescapeHTML(s):
	"""
	@param s a string (of type unicode)
	"""
	assert type(s) == type(u'')

	result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
	return result

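# Illustrative sketch (not part of the original module): named, decimal and
# hexadecimal entity references are all mapped back to unicode characters.
def _example_unescapeHTML():
	return unescapeHTML(u'Tom &amp; Jerry &#169; &#xA9;')  # -> u'Tom & Jerry \xa9 \xa9'
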
def encodeFilename(s):
	"""
	@param s The name of the file (of type unicode)
	"""

	assert type(s) == type(u'')

	if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
		# Pass u'' directly to use Unicode APIs on Windows 2000 and up
		# (Detecting Windows NT 4 is tricky because 'major >= 4' would
		# match Windows 9x series as well. Besides, NT 4 is obsolete.)
		return s
	else:
		return s.encode(sys.getfilesystemencoding(), 'ignore')

class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	pass


class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass


class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass

class MaxDownloadsReached(Exception):
	""" --max-downloads limit has been reached. """
	pass


class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass


class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected


class Trouble(Exception):
	"""Trouble helper exception

	This is an exception to be handled with
	FileDownloader.trouble
	"""

class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
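

# Illustrative sketch (not part of the original module): installing the
# handler so that every request carries std_headers and compressed responses
# are decoded transparently. The URL below is hypothetical.
def _example_build_opener():
	opener = urllib2.build_opener(YoutubeDLHandler())
	urllib2.install_opener(opener)
	request = urllib2.Request('http://www.example.com/')
	# This header makes the handler drop Accept-Encoding before the request
	# goes out, so the body is expected to come back uncompressed.
	request.add_header('Youtubedl-No-Compression', '1')
	return urllib2.urlopen(request).read()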