2 # -*- coding: utf-8 -*-
15 import urllib.request as compat_urllib_request
16 except ImportError: # Python 2
17 import urllib2 as compat_urllib_request
20 import urllib.error as compat_urllib_error
21 except ImportError: # Python 2
22 import urllib2 as compat_urllib_error
25 import urllib.parse as compat_urllib_parse
26 except ImportError: # Python 2
27 import urllib as compat_urllib_parse
30 from urllib.parse import urlparse as compat_urllib_parse_urlparse
31 except ImportError: # Python 2
32 from urlparse import urlparse as compat_urllib_parse_urlparse
35 import http.cookiejar as compat_cookiejar
36 except ImportError: # Python 2
37 import cookielib as compat_cookiejar
40 import html.entities as compat_html_entities
41 except ImportError: # Python 2
42 import htmlentitydefs as compat_html_entities
45 import html.parser as compat_html_parser
46 except ImportError: # Python 2
47 import HTMLParser as compat_html_parser
50 import http.client as compat_http_client
51 except ImportError: # Python 2
52 import httplib as compat_http_client
try:
    from subprocess import DEVNULL
    # Python 3.3+: subprocess exposes a reusable null-device handle.
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    # Older Pythons: open os.devnull ourselves. Note the caller owns the
    # returned file object and should close it when done.
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _unquote(string, encoding='utf-8', errors='replace'):
        """Percent-decode *string*, treating '%xx' runs as bytes in *encoding*."""
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        # in one go so multi-byte UTF-8 code points survive.
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into an ordered list of (name, value) pairs."""
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """parse_qs replacement: map each name to the list of its values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
try:
    compat_str = unicode  # Python 2
except NameError:
    compat_str = str
try:
    compat_chr = unichr  # Python 2
except NameError:
    compat_chr = chr
# Default headers sent with every HTTP request, mimicking a desktop Firefox
# so servers deliver the same content they serve to a regular browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually exists and can encode text;
        # some platforms report bogus values.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
if sys.version_info < (3, 0):
    def compat_print(s):
        """Print a unicode string, encoding it for the terminal first (Py2)."""
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        """Print a text string (Py3: stdout handles encoding itself)."""
        assert type(s) == type(u'')
        print(s)
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference, decimal (&#233;) or hex (&#xe9;).
    # BUGFIX: the previous pattern 'x?\d+' could never match hex digits
    # a-f, so hexadecimal entities were returned literally.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            # int() accepts the '0x...' prefix form for base 16.
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Backported bugfix for HTMLParser's start-tag-end locator so attribute
# values containing '<' / quotes are handled the way newer CPython does.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE)  # backport bugfix
class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id.

    Feed a document via loads(); get_result() then returns the text between
    the opening tag carrying the requested id and its matching close tag.
    """
    def __init__(self, id):
        self.id = id                    # the id attribute value to look for
        self.result = None              # [tag, startpos, endpos] once found
        self.started = False            # inside the wanted element?
        self.depth = {}                 # per-tag nesting counters while started
        self.html = None                # raw document, kept for get_result()
        self.watch_startpos = False     # next event records the content start
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Tolerate up to 10 parse errors outside the wanted element by
        # restarting the parse on the next line.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the whole document (kept in self.html for slicing later)."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is closed when its own tag's nesting returns to 0.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any event right after the opening tag marks where the content begins.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the element's inner text, or None if the id was not found."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if self.result[1][0] == self.result[2][0]:
            # Content starts and ends on the same line: the end column must
            # be re-based after the slice above.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    parser = IDParser(id)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: return whatever was isolated before the parse failed.
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw strings: '\s' in a plain literal is an invalid escape in modern Python.
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means standard output (binary mode on Windows).
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Control chars, DEL and '?' are never allowed.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores introduced by the replacements.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order. """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def unescapeHTML(s):
    """Replace all HTML entities in *s* with their characters.

    @param s a string
    """
    assert type(s) == type(u'')

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """Encode a unicode filename for the current platform's filesystem API.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both measured in bytes.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble
    """
    pass
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try raw deflate first (negative wbits); some servers omit the
        # zlib header. Fall back to standard zlib decompression.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older urllib2 addinfourl has no 'code' constructor argument.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force our standard headers, replacing any the caller set.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response