2 # -*- coding: utf-8 -*-
# Python 2/3 compatibility aliases: prefer the Python 3 module names and
# fall back to their Python 2 equivalents when the import fails.
try:
    import urllib.request as compat_urllib_request
except ImportError: # Python 2
    import urllib2 as compat_urllib_request

try:
    import urllib.error as compat_urllib_error
except ImportError: # Python 2
    import urllib2 as compat_urllib_error

try:
    import urllib.parse as compat_urllib_parse
except ImportError: # Python 2
    import urllib as compat_urllib_parse

try:
    import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
    import cookielib as compat_cookiejar

try:
    import html.entities as compat_html_entities
except ImportError: # Python 2
    import htmlentitydefs as compat_html_entities

try:
    import html.parser as compat_html_parser
except ImportError: # Python 2
    import HTMLParser as compat_html_parser

try:
    import http.client as compat_http_client
except ImportError: # Python 2
    import httplib as compat_http_client

try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    from urlparse import parse_qs as compat_parse_qs

# Text type and chr(): Python 2's unicode/unichr when available,
# otherwise the Python 3 built-ins.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
# Default HTTP headers added to every request (mimics a desktop Firefox);
# consumed by YoutubeDLHandler.http_request below.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported name is actually usable before trusting it
        u'TEST'.encode(pref)
    except Exception:
        # Broken or unknown locale: fall back to a sane default
        pref = 'UTF-8'

    return pref
# Version-dependent print helper: Python 2 stdout may not accept unicode
# directly, so encode there; Python 3 accepts (and requires) text.
if sys.version_info < (3,0):
    def compat_print(s):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        # Only text strings are acceptable here
        assert type(s) == type(u'')
        print(s)
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference, decimal (&#65;) or hexadecimal (&#x2F;).
    # NOTE: the old pattern u'(?u)#(x?\\d+)' only matched decimal digits, so
    # a hex reference containing a-f (e.g. &#x2F;) was truncated and decoded
    # to the wrong character (chr(2) instead of '/').
    mobj = re.match(u'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Monkey-patch the start-tag regex with the fixed version from newer Python
# releases so old interpreters parse attributes the same way (upstream
# HTMLParser bugfix backport).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix

class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id"""
    def __init__(self, id):
        self.id = id
        self.result = None          # becomes [tag, startpos, endpos]
        self.started = False        # True while inside the requested element
        self.depth = {}             # per-tag nesting counters
        self.html = None            # full document, kept for slicing
        self.watch_startpos = False # record the next event's position as start
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Tolerate a handful of parse errors outside the target element by
        # skipping the offending line and resuming; give up inside it.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        # Parse the whole document in one shot
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # Closing tag of the element we are isolating: record its position
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any event right after the opening tag fixes the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line: the end column must be rebased
            # against the column we just sliced off
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    parser = IDParser(id)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: return whatever was isolated before the error
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw strings here: '\s' in a plain literal is an invalid escape
    # (SyntaxWarning on modern Python)
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means stdout; on Windows put it into binary mode so the
            # downloaded bytes are not newline-translated
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the POSIX timestamp as an int/float, or None if *timestr*
    cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    """
    def replace_insane(char):
        # Control characters and '?' are never allowed
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    # Collapse runs of underscores produced by the replacements above
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if not result:
        result = '_'
    return result
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen
    order. Uses list membership (not a set) so unhashable elements work. """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def unescapeHTML(s):
    """Replace every HTML entity in *s* with the character it names.

    @param s a unicode string
    """
    assert type(s) == type(u'')

    # htmlentity_transform handles named, decimal and hex references
    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """Encode a unicode filename for the current platform's file APIs.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble
    """
    pass
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Servers send either raw deflate streams or zlib-wrapped ones;
        # try raw first, then fall back to the zlib container.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl has no 'code' constructor argument; set it by hand
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force our standard headers, replacing any caller-supplied ones
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # Opt-out marker: strip compression negotiation and the marker itself
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp