2 # -*- coding: utf-8 -*-
16 import urllib.request as compat_urllib_request
17 except ImportError: # Python 2
18 import urllib2 as compat_urllib_request
21 import urllib.error as compat_urllib_error
22 except ImportError: # Python 2
23 import urllib2 as compat_urllib_error
26 import urllib.parse as compat_urllib_parse
27 except ImportError: # Python 2
28 import urllib as compat_urllib_parse
31 from urllib.parse import urlparse as compat_urllib_parse_urlparse
32 except ImportError: # Python 2
33 from urlparse import urlparse as compat_urllib_parse_urlparse
36 import http.cookiejar as compat_cookiejar
37 except ImportError: # Python 2
38 import cookielib as compat_cookiejar
41 import html.entities as compat_html_entities
42 except ImportError: # Python 2
43 import htmlentitydefs as compat_html_entities
46 import html.parser as compat_html_parser
47 except ImportError: # Python 2
48 import HTMLParser as compat_html_parser
51 import http.client as compat_http_client
52 except ImportError: # Python 2
53 import httplib as compat_http_client
56 from subprocess import DEVNULL
57 compat_subprocess_get_DEVNULL = lambda: DEVNULL
59 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
62 from urllib.parse import parse_qs as compat_parse_qs
63 except ImportError: # Python 2
64 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
65 # Python 2's version is apparently totally broken
# Percent-decoding helper backported from CPython 3's urllib.parse.unquote
# for the Python 2 fallback path (see the HACK note above).
# NOTE(review): several original lines are missing from this view (guard
# clauses, the loop over `res`, the return) — only visible statements are
# annotated; do not infer control flow from the gaps.
66 def _unquote(string, encoding='utf-8', errors='replace'):
# Split on '%': every item after the first should begin with a two-hex-digit
# percent escape when the input is well formed.
69 res = string.split('%')
76 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
# Decode the two hex digits of this escape into a raw byte ('hex' codec is
# the Python 2 spelling of bytes.fromhex).
83 pct_sequence += item[:2].decode('hex')
86 # This segment was just a single percent-encoded character.
87 # May be part of a sequence of code units, so delay decoding.
88 # (Stored in pct_sequence).
92 # Encountered non-percent-encoded characters. Flush the current
# Decode the accumulated percent bytes with the caller's encoding/errors
# policy before appending the literal remainder of this segment.
94 string += pct_sequence.decode(encoding, errors) + rest
97 # Flush the final pct_sequence
98 string += pct_sequence.decode(encoding, errors)
# Query-string pair parser backported from CPython 3's urllib.parse.parse_qsl
# for the Python 2 fallback path.
# NOTE(review): lines are missing in this view (the `r = []` init, `continue`,
# the blank-value handling under keep_blank_values, and the return of `r`) —
# comments below cover only the visible statements.
101 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
102 encoding='utf-8', errors='replace'):
# On Python 2 coerce results to unicode text (parse_qsl's _coerce_result).
103 qs, _coerce_result = qs, unicode
# Both '&' and ';' are accepted as pair separators, per the original parse_qsl.
104 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
106 for name_value in pairs:
# Empty fields are silently skipped unless strict parsing was requested.
107 if not name_value and not strict_parsing:
# Split on the first '=' only; values may themselves contain '='.
109 nv = name_value.split('=', 1)
112 raise ValueError("bad query field: %r" % (name_value,))
113 # Handle case of a control-name with no equal sign
114 if keep_blank_values:
118 if len(nv[1]) or keep_blank_values:
# '+' encodes a space in application/x-www-form-urlencoded data.
119 name = nv[0].replace('+', ' ')
120 name = _unquote(name, encoding=encoding, errors=errors)
121 name = _coerce_result(name)
122 value = nv[1].replace('+', ' ')
123 value = _unquote(value, encoding=encoding, errors=errors)
124 value = _coerce_result(value)
125 r.append((name, value))
# Dict-of-lists wrapper over _parse_qsl, mirroring urllib.parse.parse_qs.
# NOTE(review): the `parsed_result = {}` initialization, the `else:` branch
# line and the final `return parsed_result` are missing from this view.
128 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
129 encoding='utf-8', errors='replace'):
131 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
132 encoding=encoding, errors=errors)
# Group values by name: repeated query keys accumulate into a list.
133 for name, value in pairs:
134 if name in parsed_result:
135 parsed_result[name].append(value)
137 parsed_result[name] = [value]
141 compat_str = unicode # Python 2
146 compat_chr = unichr # Python 2
151 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
152 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
153 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
154 'Accept-Encoding': 'gzip, deflate',
155 'Accept-Language': 'en-us,en;q=0.5',
157 def preferredencoding():
158 """Get preferred encoding.
160 Returns the best encoding scheme for the system, based on
161 locale.getpreferredencoding() and some further tweaks.
# NOTE(review): the surrounding try/except (which presumably falls back to
# UTF-8 when the locale encoding is unusable) and the return are missing
# from this view; only the locale query itself is visible.
164 pref = locale.getpreferredencoding()
# Version-dependent printing shim: on Python 2 text must be encoded to the
# preferred encoding before printing; on Python 3 unicode prints natively.
# NOTE(review): the `def` lines and the `else:` branch are missing from this
# view — only the version guard and two body statements are visible.
171 if sys.version_info < (3,0):
# Python 2 branch: encode, replacing unencodable chars with XML char refs.
173 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
# Python 3 branch body: callers must pass text (unicode), never bytes.
176 assert type(s) == type(u'')
179 # In Python 2.x, json.dump expects a bytestream.
180 # In Python 3.x, it writes to a character stream
# NOTE(review): the json.dump(...) calls and the `else:` line are missing
# from this view; only the two open() variants are visible.
181 if sys.version_info < (3,0):
182 def write_json_file(obj, fn):
# Binary mode so Python 2's json.dump can write encoded bytes.
183 with open(fn, 'wb') as f:
186 def write_json_file(obj, fn):
# Text mode with explicit UTF-8 for Python 3's character-stream json.dump.
187 with open(fn, 'w', encoding='utf-8') as f:
191 def htmlentity_transform(matchobj):
192 """Transforms an HTML entity to a character.
194 This function receives a match object and is intended to be used with
195 the re.sub() function.
# group(1) is the entity name/number without the surrounding '&' and ';'.
197 entity = matchobj.group(1)
199 # Known non-numeric HTML entity
200 if entity in compat_html_entities.name2codepoint:
201 return compat_chr(compat_html_entities.name2codepoint[entity])
# Numeric reference: optional 'x' prefix marks hexadecimal.
# NOTE(review): the `if mobj is not None:` guard and the base-10/base-16
# selection lines are missing from this view.
203 mobj = re.match(u'(?u)#(x?\\d+)', entity)
205 numstr = mobj.group(1)
206 if numstr.startswith(u'x'):
# '0x...' form so int(..., 16) accepts the prefix.
208 numstr = u'0%s' % numstr
211 return compat_chr(int(numstr, base))
213 # Unknown entity in name, return its literal representation
214 return (u'&%s;' % entity)
# Monkey-patch HTMLParser's internal start-tag-end regex with the corrected
# upstream (CPython) pattern — presumably so attribute parsing works on older
# Python releases whose stdlib still shipped the buggy regex; confirm against
# the cpython changelog if this compat shim is revisited.
216 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
217 class AttrParser(compat_html_parser.HTMLParser):
218 """Modified HTMLParser that isolates a tag with the specified attribute"""
# NOTE(review): many original lines are missing from this view (most of
# __init__'s state init, loads()'s body, parts of handle_starttag/endtag and
# get_result). Comments below describe only the visible statements.
219 def __init__(self, attribute, value):
# Target attribute name/value the parser searches for.
220 self.attribute = attribute
226 self.watch_startpos = False
228 compat_html_parser.HTMLParser.__init__(self)
230 def error(self, message):
# Tolerate up to 10 parse errors before the match starts; afterwards re-raise.
231 if self.error_count > 10 or self.started:
232 raise compat_html_parser.HTMLParseError(message, self.getpos())
# Resume parsing from the line after the error position.
233 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
234 self.error_count += 1
237 def loads(self, html):
242 def handle_starttag(self, tag, attrs):
245 self.find_startpos(None)
# A start tag carrying the wanted attribute/value begins the result region.
246 if self.attribute in attrs and attrs[self.attribute] == self.value:
# Defer recording the start position until the next parser event.
249 self.watch_startpos = True
# Track per-tag nesting depth so the matching end tag can be identified.
251 if not tag in self.depth: self.depth[tag] = 0
254 def handle_endtag(self, tag):
256 if tag in self.depth: self.depth[tag] -= 1
# Depth of the matched tag back to zero: this end tag closes the result.
257 if self.depth[self.result[0]] == 0:
259 self.result.append(self.getpos())
261 def find_startpos(self, x):
262 """Needed to put the start position of the result (self.result[1])
263 after the opening tag with the requested id"""
264 if self.watch_startpos:
265 self.watch_startpos = False
266 self.result.append(self.getpos())
# Any event after the opening tag triggers start-position capture.
267 handle_entityref = handle_charref = handle_data = handle_comment = \
268 handle_decl = handle_pi = unknown_decl = find_startpos
270 def get_result(self):
# result == [tag, (startline, startcol), (endline, endcol)] when complete.
271 if self.result is None:
273 if len(self.result) != 3:
# Slice the captured region back out of the raw HTML by line/column.
275 lines = self.html.split('\n')
276 lines = lines[self.result[1][0]-1:self.result[2][0]]
277 lines[0] = lines[0][self.result[1][1]:]
# Single-line match: end column must be shifted by the removed start offset.
279 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
280 lines[-1] = lines[-1][:self.result[2][1]]
281 return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is simply the attribute-based search specialised to "id".
    return get_element_by_attribute("id", id, html)
287 def get_element_by_attribute(attribute, value, html):
288 """Return the content of the tag with the specified attribute in the passed HTML document"""
# NOTE(review): the try:/parser.loads(html) lines and the except body are
# missing from this view; parse errors appear to be swallowed so that a
# best-effort result can still be returned below.
289 parser = AttrParser(attribute, value)
292 except compat_html_parser.HTMLParseError:
294 return parser.get_result()
297 def clean_html(html):
298 """Clean an HTML snippet into a readable string"""
# Collapse newlines first so <br> alone controls line breaks in the output.
300 html = html.replace('\n', ' ')
# Turn <br>, <br/>, < br > etc. (with surrounding whitespace) into newlines.
301 html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
# Strip all remaining tags (non-greedy, so adjacent tags are removed singly).
303 html = re.sub('<.*?>', '', html)
304 # Replace html entities
305 html = unescapeHTML(html)
# NOTE(review): a final strip()/return is not visible in this view.
309 def sanitize_open(filename, open_mode):
310 """Try to open the given filename, and slightly tweak it if this fails.
312 Attempts to open the given filename. If this fails, it tries to change
313 the filename slightly, step by step, until it's either able to open it
314 or it fails and raises a final exception, like the standard open()
317 It returns the tuple (stream, definitive_file_name).
# NOTE(review): the try:, the '-' (stdout) special-case check and the import
# of msvcrt are among the lines missing from this view.
321 if sys.platform == 'win32':
# Switch stdout to binary mode on Windows so downloads aren't corrupted
# by CRLF translation.
323 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
324 return (sys.stdout, filename)
# First attempt: open the name exactly as requested.
325 stream = open(encodeFilename(filename), open_mode)
326 return (stream, filename)
327 except (IOError, OSError) as err:
328 # In case of error, try to remove win32 forbidden chars
329 filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
331 # An exception here should be caught in the caller
332 stream = open(encodeFilename(filename), open_mode)
333 return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns the POSIX timestamp (seconds since the epoch) as an integer,
    or None when *timestr* cannot be parsed as an RFC 2822 date.
    """
    # Default to None so unparseable input yields an explicit "no timestamp"
    # instead of raising; parsedate_tz returns None on failure.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        # mktime_tz honours the timezone offset captured by parsedate_tz.
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
344 def sanitize_filename(s, restricted=False, is_id=False):
345 """Sanitizes a string so it could be used as part of a filename.
346 If restricted is set, use a stricter subset of allowed characters.
347 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
# Per-character policy; applied over the whole string via map() below.
# NOTE(review): several branches (the '"' case, the ':' case, the '_'
# replacements and the final is_id handling/return) are missing from this view.
349 def replace_insane(char):
# Control characters, DEL and '?' are never allowed in filenames.
350 if char == '?' or ord(char) < 32 or ord(char) == 127:
353 return '' if restricted else '\''
355 return '_-' if restricted else ' -'
356 elif char in '\\/|*<>':
# Restricted mode also rejects shell-special punctuation and whitespace.
358 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
# Restricted mode keeps the result pure ASCII.
360 if restricted and ord(char) > 127:
364 result = u''.join(map(replace_insane, s))
# Collapse runs of underscores left behind by the replacements above.
366 while '__' in result:
367 result = result.replace('__', '_')
368 result = result.strip('_')
369 # Common case of "Foreign band name - English song title"
370 if restricted and result.startswith('-_'):
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Keep the first occurrence of each element, preserving input order.
    # Membership test is O(n) per element; inputs here are short lists of
    # URLs/ids, so the quadratic worst case is acceptable.
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
# NOTE(review): fragment — the enclosing `def` (presumably unescapeHTML(s))
# and its return statement are missing from this view.
# Callers must pass text (unicode), never bytes.
388 assert type(s) == type(u'')
# Replace every '&...;' entity via htmlentity_transform (non-greedy so
# adjacent entities are handled separately).
390 result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
393 def encodeFilename(s):
395 @param s The name of the file
# Input must already be text (unicode), never bytes.
398 assert type(s) == type(u'')
400 # Python 3 has a Unicode API
# On Python 3 the name is returned/used as-is (the branch body is missing
# from this view).
401 if sys.version_info >= (3, 0):
404 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
405 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
406 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
407 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
# Non-Windows Python 2: encode with the filesystem encoding, dropping
# unencodable characters rather than failing.
410 return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Byte counts kept for the caller's diagnostics / retry logic.
        self.downloaded = downloaded
        self.expected = expected
class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble
    """
    pass
476 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
477 """Handler for HTTP requests and responses.
479 This class, when installed with an OpenerDirector, automatically adds
480 the standard headers to every HTTP request and handles gzipped and
481 deflated responses from web servers. If compression is to be avoided in
482 a particular request, the original request in the program code only has
483 to include the HTTP header "Youtubedl-No-Compression", which will be
484 removed before making the real request.
486 Part of this code was copied from:
488 http://techknack.net/python-urllib2-handlers/
490 Andrew Rowls, the author of that code, agreed to release it to the
# NOTE(review): several lines are missing from this view (the deflate()
# def/try lines, the getcode/info assignments in addinfourl_wrapper, and
# the header-presence check in http_request).
# Raw-deflate first (negative window bits = no zlib header), falling back
# to standard zlib-wrapped data.
497 return zlib.decompress(data, -zlib.MAX_WBITS)
499 return zlib.decompress(data)
# Build an addinfourl response object across Python versions: newer
# addinfourl accepts the status code directly, older ones need it set
# after construction.
502 def addinfourl_wrapper(stream, headers, url, code):
503 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
504 return compat_urllib_request.addinfourl(stream, headers, url, code)
505 ret = compat_urllib_request.addinfourl(stream, headers, url)
509 def http_request(self, req):
# Add each std_headers entry (presumably only when the request doesn't
# already set it — the guard line is not visible here).
510 for h in std_headers:
513 req.add_header(h, std_headers[h])
# Opt-out marker: strip our Accept-encoding and the marker itself so the
# real request goes out uncompressed.
514 if 'Youtubedl-no-compression' in req.headers:
515 if 'Accept-encoding' in req.headers:
516 del req.headers['Accept-encoding']
517 del req.headers['Youtubedl-no-compression']
520 def http_response(self, req, resp):
# Transparently decompress gzip bodies, preserving the original headers,
# URL, status code and reason message.
523 if resp.headers.get('Content-encoding', '') == 'gzip':
524 gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
525 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
526 resp.msg = old_resp.msg
# Same for deflate bodies, via the raw/zlib fallback above.
528 if resp.headers.get('Content-encoding', '') == 'deflate':
529 gz = io.BytesIO(self.deflate(resp.read()))
530 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
531 resp.msg = old_resp.msg
# HTTPS traffic gets identical treatment.
534 https_request = http_request
535 https_response = http_response