2 # -*- coding: utf-8 -*-
14 import cStringIO as StringIO
19 import urllib.request as compat_urllib_request
20 except ImportError: # Python 2
21 import urllib2 as compat_urllib_request
24 import urllib.error as compat_urllib_error
25 except ImportError: # Python 2
26 import urllib2 as compat_urllib_error
29 import urllib.parse as compat_urllib_parse
30 except ImportError: # Python 2
31 import urllib as compat_urllib_parse
34 import http.cookiejar as compat_cookiejar
35 except ImportError: # Python 2
36 import cookielib as compat_cookiejar
# Try the Python 3 module names first and fall back to the Python 2 ones.
# A failed import raises ImportError (never NameError), so ImportError is
# the exception the fallback has to catch.
try:
    import html.entities as compat_html_entities
except ImportError: # Python 2
    import htmlentitydefs as compat_html_entities

try:
    import html.parser as compat_html_parser
except ImportError: # Python 2
    import HTMLParser as compat_html_parser
49 compat_str = unicode # Python 2
54 compat_chr = unichr # Python 2
60 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
61 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
62 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
63 'Accept-Encoding': 'gzip, deflate',
64 'Accept-Language': 'en-us,en;q=0.5',
66 def preferredencoding():
67 """Get preferred encoding.
69 Returns the best encoding scheme for the system, based on
70 locale.getpreferredencoding() and some further tweaks.
73 pref = locale.getpreferredencoding()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the leading '&' and the trailing ';'.

    Returns the decoded character, or the literal u'&<entity>;' text when
    the entity is neither a known name nor a valid numeric reference.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric reference: decimal (#38) or hexadecimal (#x2F). The hex form
    # must accept the digits a-f, and the whole entity has to match so that
    # trailing garbage is not silently dropped.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)\\Z', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            # int() accepts the '0x' prefix when base is 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the range compat_chr can represent;
            # fall through and return the literal text instead.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
106 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
107 class IDParser(compat_html_parser.HTMLParser):
108 """Modified HTMLParser that isolates a tag with the specified id"""
109 def __init__(self, id):
115 self.watch_startpos = False
117 compat_html_parser.HTMLParser.__init__(self)
119 def error(self, message):
120 if self.error_count > 10 or self.started:
121 raise compat_html_parser.HTMLParseError(message, self.getpos())
122 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
123 self.error_count += 1
126 def loads(self, html):
131 def handle_starttag(self, tag, attrs):
134 self.find_startpos(None)
135 if 'id' in attrs and attrs['id'] == self.id:
138 self.watch_startpos = True
140 if not tag in self.depth: self.depth[tag] = 0
143 def handle_endtag(self, tag):
145 if tag in self.depth: self.depth[tag] -= 1
146 if self.depth[self.result[0]] == 0:
148 self.result.append(self.getpos())
150 def find_startpos(self, x):
151 """Needed to put the start position of the result (self.result[1])
152 after the opening tag with the requested id"""
153 if self.watch_startpos:
154 self.watch_startpos = False
155 self.result.append(self.getpos())
156 handle_entityref = handle_charref = handle_data = handle_comment = \
157 handle_decl = handle_pi = unknown_decl = find_startpos
159 def get_result(self):
160 if self.result is None:
162 if len(self.result) != 3:
164 lines = self.html.split('\n')
165 lines = lines[self.result[1][0]-1:self.result[2][0]]
166 lines[0] = lines[0][self.result[1][1]:]
168 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
169 lines[-1] = lines[-1][:self.result[2][1]]
170 return '\n'.join(lines).strip()
172 def get_element_by_id(id, html):
173 """Return the content of the tag with the specified id in the passed HTML document"""
174 parser = IDParser(id)
177 except compat_html_parser.HTMLParseError:
179 return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />: literal newlines become spaces, and every <br>
    # (with the whitespace around it) becomes the only kind of line break.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
194 def sanitize_open(filename, open_mode):
195 """Try to open the given filename, and slightly tweak it if this fails.
197 Attempts to open the given filename. If this fails, it tries to change
198 the filename slightly, step by step, until it's either able to open it
199 or it fails and raises a final exception, like the standard open()
202 It returns the tuple (stream, definitive_file_name).
206 if sys.platform == 'win32':
208 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
209 return (sys.stdout, filename)
210 stream = open(encodeFilename(filename), open_mode)
211 return (stream, filename)
212 except (IOError, OSError) as err:
213 # In case of error, try to remove win32 forbidden chars
214 filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
216 # An exception here should be caught in the caller
217 stream = open(encodeFilename(filename), open_mode)
218 return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns the value computed by email.utils.mktime_tz(), or None when
    email.utils.parsedate_tz() cannot parse timestr.
    """
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is None:
        # Unparseable date string: no timestamp available
        return None
    return email.utils.mktime_tz(timetuple)
229 def sanitize_filename(s, restricted=False):
230 """Sanitizes a string so it could be used as part of a filename.
231 If restricted is set, use a stricter subset of allowed characters.
233 def replace_insane(char):
234 if char == '?' or ord(char) < 32 or ord(char) == 127:
237 return '' if restricted else '\''
239 return '_-' if restricted else ' -'
240 elif char in '\\/|*<>':
242 if restricted and (char in '!&\'' or char.isspace()):
244 if restricted and ord(char) > 127:
248 result = u''.join(map(replace_insane, s))
249 while '__' in result:
250 result = result.replace('__', '_')
251 result = result.strip('_')
252 # Common case of "Foreign band name - English song title"
253 if restricted and result.startswith('-_'):
259 def orderedSet(iterable):
260 """ Remove all duplicates from the input iterable """
271 assert type(s) == type(u'')
273 result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
def encodeFilename(s):
    """
    @param s The name of the file

    Returns s unchanged on Windows 2000 and up; on every other platform
    returns s encoded with the filesystem encoding, silently dropping the
    characters that encoding cannot represent.
    """

    assert isinstance(s, type(u''))

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    # sys.getfilesystemencoding() may return None (documented for Python 2);
    # encoding with None would raise TypeError, so fall back to UTF-8.
    encoding = sys.getfilesystemencoding() or 'utf-8'
    return s.encode(encoding, 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults; every instance overrides them in __init__.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # The base initializer is intentionally not called here:
        # BaseException.__new__ already records (downloaded, expected)
        # in self.args, and callers may rely on that shape.
        self.downloaded = downloaded
        self.expected = expected
class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble
    """
    pass
355 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
356 """Handler for HTTP requests and responses.
358 This class, when installed with an OpenerDirector, automatically adds
359 the standard headers to every HTTP request and handles gzipped and
360 deflated responses from web servers. If compression is to be avoided in
361 a particular request, the original request in the program code only has
362 to include the HTTP header "Youtubedl-No-Compression", which will be
363 removed before making the real request.
365 Part of this code was copied from:
367 http://techknack.net/python-urllib2-handlers/
369 Andrew Rowls, the author of that code, agreed to release it to the
376 return zlib.decompress(data, -zlib.MAX_WBITS)
378 return zlib.decompress(data)
381 def addinfourl_wrapper(stream, headers, url, code):
382 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
383 return compat_urllib_request.addinfourl(stream, headers, url, code)
384 ret = compat_urllib_request.addinfourl(stream, headers, url)
388 def http_request(self, req):
389 for h in std_headers:
392 req.add_header(h, std_headers[h])
393 if 'Youtubedl-no-compression' in req.headers:
394 if 'Accept-encoding' in req.headers:
395 del req.headers['Accept-encoding']
396 del req.headers['Youtubedl-no-compression']
399 def http_response(self, req, resp):
402 if resp.headers.get('Content-encoding', '') == 'gzip':
403 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
404 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
405 resp.msg = old_resp.msg
407 if resp.headers.get('Content-encoding', '') == 'deflate':
408 gz = StringIO.StringIO(self.deflate(resp.read()))
409 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
410 resp.msg = old_resp.msg