# -*- coding: utf-8 -*-

import email.utils
import gzip
import io
import json
import locale
import os
import re
import sys
import traceback
import zlib

# Compatibility shims: expose a uniform compat_* namespace over stdlib
# modules that were renamed between Python 2 and Python 3.

try:
    import urllib.request as compat_urllib_request
except ImportError: # Python 2
    import urllib2 as compat_urllib_request

try:
    import urllib.error as compat_urllib_error
except ImportError: # Python 2
    import urllib2 as compat_urllib_error

try:
    import urllib.parse as compat_urllib_parse
except ImportError: # Python 2
    import urllib as compat_urllib_parse

try:
    from urllib.parse import urlparse as compat_urllib_parse_urlparse
except ImportError: # Python 2
    from urlparse import urlparse as compat_urllib_parse_urlparse

try:
    import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
    import cookielib as compat_cookiejar

try:
    import html.entities as compat_html_entities
except ImportError: # Python 2
    import htmlentitydefs as compat_html_entities

try:
    import html.parser as compat_html_parser
except ImportError: # Python 2
    import HTMLParser as compat_html_parser

try:
    import http.client as compat_http_client
except ImportError: # Python 2
    import httplib as compat_http_client

try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _unquote(string, encoding='utf-8', errors='replace'):
        """Percent-decode *string*, turning %xx sequences into characters.

        Contiguous %xx runs are collected and decoded together so that
        multi-byte UTF-8 code points survive intact.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 str.decode('hex') turns the two hex digits into a byte.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into a list of (name, value) pairs."""
        qs, _coerce_result = qs, unicode
        # Both '&' and ';' are accepted as pair separators.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping names to value lists."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
# Text type and chr() that work on both Python 2 and Python 3.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
# Default HTTP headers added to every request (mimics a desktop Firefox).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    Falls back to UTF-8 when the locale encoding is unusable.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding can actually encode text.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
# Printing a unicode string differs between Python 2 and 3: on 2 it must be
# encoded to the terminal's preferred encoding first.
if sys.version_info < (3, 0):
    def compat_print(s):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        assert type(s) == type(u'')
        print(s)
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj to the file named fn as JSON (bytes on Py2)."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize obj to the file named fn as UTF-8 JSON text."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: &#NNN; or &#xHHH;
    mobj = re.match(u'(?u)#(x?\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            # int() understands the '0x...' prefix for base-16 input.
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Replace HTMLParser's start-tag regex with a fixed version backported from a
# later Python release (handles attributes glued to a preceding quoted value).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Attribute name/value pair that identifies the wanted tag.
        self.attribute = attribute
        self.value = value
        # result holds [tagname, startpos, endpos] once the tag is found.
        self.result = None
        self.started = False
        self.depth = {}
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Tolerate a few malformed-HTML errors by skipping a line and
        # resuming; give up after 10 errors or once inside the target tag.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Feed the whole document to the parser."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The target tag is closed when its nesting depth returns to 0.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any event right after the opening tag records the content start pos.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the opening and closing tag, or None."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same original line, so the end column
            # must be shifted by the amount already sliced off the front.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Old HTMLParser versions choke on the "</scr'+'ipt>" pattern inside
# JavaScript; treat it as an opaque token instead of an end tag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute("id", id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: whatever was isolated before the error is still usable.
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Paragraph boundaries become newlines.
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip remaining html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means stdout; on Windows switch it to binary mode first.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Control characters, DEL and '?' are never allowed.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            # Non-ASCII is disallowed in restricted mode.
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores introduced by the substitutions.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def unescapeHTML(s):
    """Replace HTML entities in s with the characters they represent.

    @param s a unicode string
    """
    assert type(s) == type(u'')

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """Encode a unicode filename for the current platform.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        encoding = sys.getfilesystemencoding()
        if encoding is None:
            encoding = 'utf-8'
        return s.encode(encoding, 'ignore')
def decodeOption(optval):
    """Decode a command-line option value to text (None passes through)."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None):
        """ tb, if given, is the original traceback (so that it can be printed out). """
        super(ExtractorError, self).__init__(msg)
        # Traceback of the root cause, if the caller captured one.
        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception

    def format_traceback(self):
        """Return the formatted original traceback, or None if none was given."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Keep the message accessible as an attribute for callers.
        self.msg = msg
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Servers disagree on whether "deflate" means raw deflate or zlib;
        # try raw first, then fall back to the zlib-wrapped variant.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl takes no code argument; set it afterwards.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Add the standard headers, letting explicit request headers win.
        for h, v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response