2 # -*- coding: utf-8 -*-
15 import urllib.request as compat_urllib_request
16 except ImportError: # Python 2
17 import urllib2 as compat_urllib_request
20 import urllib.error as compat_urllib_error
21 except ImportError: # Python 2
22 import urllib2 as compat_urllib_error
25 import urllib.parse as compat_urllib_parse
26 except ImportError: # Python 2
27 import urllib as compat_urllib_parse
30 import http.cookiejar as compat_cookiejar
31 except ImportError: # Python 2
32 import cookielib as compat_cookiejar
# Python 2/3 compatibility aliases for the HTML/HTTP helper modules.
# A failed ``import`` raises ImportError (never NameError), so that is the
# exception the Python 2 fallbacks have to catch.
try:
    import html.entities as compat_html_entities
except ImportError: # Python 2
    import htmlentitydefs as compat_html_entities

try:
    import html.parser as compat_html_parser
except ImportError: # Python 2
    import HTMLParser as compat_html_parser

try:
    import http.client as compat_html_client
except ImportError: # Python 2
    import httplib as compat_html_client
51 compat_str = unicode # Python 2
56 compat_chr = unichr # Python 2
62 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
63 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
64 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
65 'Accept-Encoding': 'gzip, deflate',
66 'Accept-Language': 'en-us,en;q=0.5',
# preferredencoding(): takes no arguments; the visible statement stores the
# result of locale.getpreferredencoding() in ``pref``.
# NOTE(review): the "further tweaks" mentioned in the docstring and the return
# statement are not visible in this excerpt -- confirm against the full file.
68 def preferredencoding():
69 """Get preferred encoding.
71 Returns the best encoding scheme for the system, based on
72 locale.getpreferredencoding() and some further tweaks.
75 pref = locale.getpreferredencoding()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities are resolved through compat_html_entities.name2codepoint.
    Numeric references are accepted in decimal ('#945') and hexadecimal
    ('#x3b1' / '#X3B1') form. Everything else -- including numeric references
    that do not denote a valid code point -- is returned in its literal
    '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference. The pattern is anchored at the end so that
    # a partially numeric entity (e.g. '#x3b1' read as '#x3') is never decoded
    # into the wrong character, and the hex branch accepts the digits a-f.
    mobj = re.match(u'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hexstr, decstr = mobj.groups()
        try:
            if hexstr is not None:
                return compat_chr(int(hexstr, 16))
            return compat_chr(int(decstr, 10))
        except (ValueError, OverflowError):
            # Not a representable code point: keep the literal text instead
            # of aborting the whole substitution.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Overwrite the module-level ``locatestarttagend`` pattern of the HTML parser
# module with a custom start-tag regular expression (described by the trailing
# comment as a backported bug fix). IDParser below builds on that module.
# NOTE(review): the pattern contains no literal whitespace or '#', so
# re.VERBOSE presumably has no effect here -- confirm before removing it.
108 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
# IDParser: HTMLParser subclass that looks for the element whose ``id``
# attribute equals the requested id and returns the text between the recorded
# start and end positions.
#
# What the visible statements establish:
#   * error(): raises HTMLParseError once more than 10 errors were counted or
#     the target element has already started; otherwise rawdata is replaced
#     by the document lines following the current line and the error counter
#     is incremented.
#   * handle_starttag(): a matching id sets watch_startpos, so the next
#     callback routed to find_startpos() records the position after the
#     opening tag; self.depth keeps a nesting counter per tag name.
#   * handle_endtag(): decrements the counter and appends the current position
#     once the counter of the tag named in result[0] reaches zero.
#   * get_result(): expects exactly three entries in self.result; entries 1
#     and 2 are getpos() pairs (line, column) used to slice self.html.
# NOTE(review): several statements (attribute initialisation in __init__, the
# body of loads(), guards around the handlers, the early returns of
# get_result()) are not visible in this excerpt -- confirm against the full
# file before relying on the description above.
# NOTE(review): HTMLParseError no longer exists in html.parser on Python 3.5+,
# so the raise in error() presumably fails there -- confirm.
109 class IDParser(compat_html_parser.HTMLParser):
110 """Modified HTMLParser that isolates a tag with the specified id"""
111 def __init__(self, id):
117 self.watch_startpos = False
119 compat_html_parser.HTMLParser.__init__(self)
121 def error(self, message):
122 if self.error_count > 10 or self.started:
123 raise compat_html_parser.HTMLParseError(message, self.getpos())
124 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
125 self.error_count += 1
128 def loads(self, html):
133 def handle_starttag(self, tag, attrs):
136 self.find_startpos(None)
137 if 'id' in attrs and attrs['id'] == self.id:
140 self.watch_startpos = True
142 if not tag in self.depth: self.depth[tag] = 0
145 def handle_endtag(self, tag):
147 if tag in self.depth: self.depth[tag] -= 1
148 if self.depth[self.result[0]] == 0:
150 self.result.append(self.getpos())
152 def find_startpos(self, x):
153 """Needed to put the start position of the result (self.result[1])
154 after the opening tag with the requested id"""
155 if self.watch_startpos:
156 self.watch_startpos = False
157 self.result.append(self.getpos())
# Every other content callback is aliased to find_startpos, so whichever one
# fires first after the opening tag records the start position.
158 handle_entityref = handle_charref = handle_data = handle_comment = \
159 handle_decl = handle_pi = unknown_decl = find_startpos
161 def get_result(self):
162 if self.result is None:
164 if len(self.result) != 3:
166 lines = self.html.split('\n')
167 lines = lines[self.result[1][0]-1:self.result[2][0]]
168 lines[0] = lines[0][self.result[1][1]:]
170 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
171 lines[-1] = lines[-1][:self.result[2][1]]
172 return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document

    @param id   The value of the ``id`` attribute to look for
    @param html The HTML document as a string
    @return     Whatever IDParser.get_result() reports for the document
    """
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: a parse error does not abort the lookup; whatever the
        # parser collected up to that point is still reported below.
        pass
    return id_parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Existing newlines become spaces, <br> tags (with surrounding whitespace)
    become newlines, all remaining tags are removed and HTML entities are
    resolved through unescapeHTML().
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw string literals: '\s' in a normal literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
# sanitize_open(filename, open_mode): open a file and return the tuple
# (stream, name actually used).
# What the visible statements establish:
#   * one branch returns sys.stdout itself, after switching it to binary mode
#     through msvcrt when running on win32. NOTE(review): the condition that
#     selects this branch is not visible in this excerpt -- confirm.
#   * otherwise the name is passed through encodeFilename() and opened with
#     open_mode.
#   * on IOError/OSError each of the characters / < > : " | \ ? * in the name
#     is replaced by '#' and the open is attempted once more; an error in
#     that second attempt propagates to the caller.
# NOTE(review): the replacement set contains '/', so directory separators in
# a path would presumably be rewritten on the retry -- confirm this is intended.
196 def sanitize_open(filename, open_mode):
197 """Try to open the given filename, and slightly tweak it if this fails.
199 Attempts to open the given filename. If this fails, it tries to change
200 the filename slightly, step by step, until it's either able to open it
201 or it fails and raises a final exception, like the standard open()
204 It returns the tuple (stream, definitive_file_name).
208 if sys.platform == 'win32':
210 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
211 return (sys.stdout, filename)
212 stream = open(encodeFilename(filename), open_mode)
213 return (stream, filename)
214 except (IOError, OSError) as err:
215 # In case of error, try to remove win32 forbidden chars
216 filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
218 # An exception here should be caught in the caller
219 stream = open(encodeFilename(filename), open_mode)
220 return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    @param timestr The date/time string to convert
    @return        The timestamp computed by email.utils.mktime_tz(), or
                   None when the string cannot be parsed
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
# sanitize_filename(s, restricted=False): map every character of ``s``
# through the nested replace_insane() helper and post-process the joined
# result.
# What the visible statements establish:
#   * replace_insane() has dedicated branches for '?', control characters
#     (ord < 32) and DEL (127); for the characters \ / | * < > ; and, in
#     restricted mode only, for '!', '&', "'", whitespace and characters with
#     ord > 127.
#   * two branches return a replacement that depends on ``restricted``
#     ('' vs "'" and '_-' vs ' -').
#   * afterwards runs of '__' are collapsed to '_', leading/trailing '_' are
#     stripped, and a restricted result starting with '-_' is special-cased.
# NOTE(review): several branch conditions and return values, and the final
# return, are not visible in this excerpt -- confirm against the full file.
231 def sanitize_filename(s, restricted=False):
232 """Sanitizes a string so it could be used as part of a filename.
233 If restricted is set, use a stricter subset of allowed characters.
235 def replace_insane(char):
236 if char == '?' or ord(char) < 32 or ord(char) == 127:
239 return '' if restricted else '\''
241 return '_-' if restricted else ' -'
242 elif char in '\\/|*<>':
244 if restricted and (char in '!&\'' or char.isspace()):
246 if restricted and ord(char) > 127:
250 result = u''.join(map(replace_insane, s))
251 while '__' in result:
252 result = result.replace('__', '_')
253 result = result.strip('_')
254 # Common case of "Foreign band name - English song title"
255 if restricted and result.startswith('-_'):
# NOTE(review): only the signature and docstring of orderedSet() are visible
# in this excerpt; the name suggests the input order is kept, but the
# implementation is not shown -- confirm against the full file.
261 def orderedSet(iterable):
262 """ Remove all duplicates from the input iterable """
273 assert type(s) == type(u'')
275 result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
def encodeFilename(s):
    """
    @param s The name of the file (must be a unicode string)
    @return  ``s`` unchanged on Windows 2000 and later; otherwise ``s``
             encoded with the filesystem encoding, silently dropping
             characters that cannot be represented
    """

    assert isinstance(s, type(u''))

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        # Python 2 may report None here; str.encode() rejects None, so fall
        # back to a fixed encoding instead of raising.
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. The exception is meant to carry the
    appropriate error information.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to signal an
    error in the postprocessing task.
    """
    pass
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, which indicates that
    the connection was probably interrupted.

    Attributes:
        downloaded: amount actually received (presumably a byte count)
        expected:   amount the server announced (presumably a byte count)
    """
    # Class-level defaults, shadowed per instance by __init__
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
class Trouble(Exception):
    """Trouble helper exception

    An exception that is meant to be handled with FileDownloader.trouble
    """
    pass
# YoutubeDLHandler: HTTPHandler subclass that adds the std_headers to outgoing
# requests and unpacks compressed responses.
# What the visible statements establish:
#   * the deflate helper calls zlib.decompress(data, -zlib.MAX_WBITS) (raw
#     deflate stream without zlib header) and zlib.decompress(data);
#     presumably the second call is a fallback -- the control flow between
#     them is not visible in this excerpt.
#   * addinfourl_wrapper(): passes ``code`` to the addinfourl constructor when
#     that class has a ``getcode`` attribute, otherwise builds the object
#     without it (the remainder of that branch is not visible).
#   * http_request(): calls req.add_header() with entries of std_headers; if
#     the request carries 'Youtubedl-no-compression', the 'Accept-encoding'
#     header (when present) and the marker header itself are deleted.
#   * http_response(): for 'Content-encoding' gzip the body is wrapped in a
#     GzipFile over a BytesIO, for deflate in a BytesIO of the inflated data;
#     in both cases a new response is built from old_resp's headers, url and
#     code and old_resp.msg is copied over.
# NOTE(review): the header keys are spelled with only the first letter in
# upper case; presumably this matches how the request object normalises
# header names -- confirm.
# NOTE(review): the assignment of old_resp and the return statements are not
# visible in this excerpt -- confirm against the full file.
357 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
358 """Handler for HTTP requests and responses.
360 This class, when installed with an OpenerDirector, automatically adds
361 the standard headers to every HTTP request and handles gzipped and
362 deflated responses from web servers. If compression is to be avoided in
363 a particular request, the original request in the program code only has
364 to include the HTTP header "Youtubedl-No-Compression", which will be
365 removed before making the real request.
367 Part of this code was copied from:
369 http://techknack.net/python-urllib2-handlers/
371 Andrew Rowls, the author of that code, agreed to release it to the
378 return zlib.decompress(data, -zlib.MAX_WBITS)
380 return zlib.decompress(data)
383 def addinfourl_wrapper(stream, headers, url, code):
384 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
385 return compat_urllib_request.addinfourl(stream, headers, url, code)
386 ret = compat_urllib_request.addinfourl(stream, headers, url)
390 def http_request(self, req):
391 for h in std_headers:
394 req.add_header(h, std_headers[h])
395 if 'Youtubedl-no-compression' in req.headers:
396 if 'Accept-encoding' in req.headers:
397 del req.headers['Accept-encoding']
398 del req.headers['Youtubedl-no-compression']
401 def http_response(self, req, resp):
404 if resp.headers.get('Content-encoding', '') == 'gzip':
405 gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
406 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
407 resp.msg = old_resp.msg
409 if resp.headers.get('Content-encoding', '') == 'deflate':
410 gz = io.BytesIO(self.deflate(resp.read()))
411 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
412 resp.msg = old_resp.msg