2 # -*- coding: utf-8 -*-
19 import urllib.request as compat_urllib_request
20 except ImportError: # Python 2
21 import urllib2 as compat_urllib_request
24 import urllib.error as compat_urllib_error
25 except ImportError: # Python 2
26 import urllib2 as compat_urllib_error
29 import urllib.parse as compat_urllib_parse
30 except ImportError: # Python 2
31 import urllib as compat_urllib_parse
34 from urllib.parse import urlparse as compat_urllib_parse_urlparse
35 except ImportError: # Python 2
36 from urlparse import urlparse as compat_urllib_parse_urlparse
39 import http.cookiejar as compat_cookiejar
40 except ImportError: # Python 2
41 import cookielib as compat_cookiejar
44 import html.entities as compat_html_entities
45 except ImportError: # Python 2
46 import htmlentitydefs as compat_html_entities
49 import html.parser as compat_html_parser
50 except ImportError: # Python 2
51 import HTMLParser as compat_html_parser
54 import http.client as compat_http_client
55 except ImportError: # Python 2
56 import httplib as compat_http_client
try:
    from subprocess import DEVNULL
    # Python 3.3+: reuse the stdlib's cached null-device file descriptor.
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    # Older Pythons: open the OS null device ourselves on every call.
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Percent-decode *string*, decoding escaped byte runs with *encoding*."""
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        # in one go when a non-encoded character (or the end) is reached.
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into an ordered list of (name, value) pairs."""
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
try:
    compat_str = unicode  # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr  # Python 2
except NameError:
    compat_chr = chr
def compat_ord(c):
    """Return the integer code point of *c*; ints (Python 3 bytes items) pass through."""
    if type(c) is int:
        return c
    else:
        return ord(c)
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
# Default headers sent with every HTTP request (see YoutubeDLHandler).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable before trusting it.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
if sys.version_info < (3, 0):
    def compat_print(s):
        # Python 2: print an explicitly-encoded byte string to avoid
        # UnicodeEncodeError on non-ASCII terminals.
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        """Print a text string; asserts the caller passed text, not bytes."""
        assert type(s) == type(u'')
        print(s)
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3, 0):
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into file *fn* (binary stream on Python 2)."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into file *fn* (UTF-8 text stream on Python 3)."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (#160) or hexadecimal (#x2F).
    # BUGFIX: the former pattern x?\d+ could not match hex digits a-f,
    # so entities such as &#x2F; were left unescaped.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Attribute name/value pair identifying the tag to isolate.
        self.attribute = attribute
        self.value = value
        # result: [tagname, startpos, endpos] once the tag is found.
        self.result = None
        self.started = False
        self.depth = {}
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Tolerate up to 10 parse errors before the target tag is found by
        # skipping the offending line and resuming.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Feed the whole document; keep the text around for get_result()."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # Matching close tag of the isolated element reached.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any event following the opening tag marks where its content starts.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the isolated tag's inner text, or None if it was not found."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if self.result[1][0] == self.result[2][0]:
            # Start and end on the same line: end offset is relative to start.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        else:
            lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Pre-2.7.3 HTMLParser chokes on the literal "</scr'+'ipt>" inside scripts;
# treat it as an opaque token instead of parsing it as an end tag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper over the generic attribute-based lookup.
    return get_element_by_attribute("id", id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Errors after the target tag has been captured are irrelevant;
        # return whatever result the parser accumulated.
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Paragraph breaks become newlines.
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip remaining html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors will not be fixed by renaming; re-raise.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        # BUGFIX: os.path.join takes *args, not an iterable — unpack it.
        alt_filename = os.path.join(*(
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            # BUGFIX: open the sanitized alt_filename, not the original name.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Control characters and '?' are never allowed in filenames.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores introduced by the substitutions above.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, preserving first-seen order. """
    res = []
    for el in iterable:
        # Linear membership test keeps support for unhashable elements.
        if el not in res:
            res.append(el)
    return res
def unescapeHTML(s):
    """Replace all HTML entities in the text string *s* with their characters."""
    assert type(s) == type(u'')

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """Encode a filename for the current platform/Python version.

    @param s The name of the file
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        encoding = sys.getfilesystemencoding()
        if encoding is None:
            encoding = 'utf-8'
        return s.encode(encoding, 'ignore')
def decodeOption(optval):
    """Decode a command-line option value to text if it arrived as bytes."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or plain seconds.

    BUGFIX: use >= on the boundaries so exactly 3600s renders as '1:00:00'
    (previously '60:00') and exactly 60s as '1:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(opts):
    """Build an HTTPSHandler honouring opts.no_check_certificate where possible."""
    if sys.version_info < (3, 2):
        # Python's 2.x handler is very simplistic: no certificate verification.
        return compat_urllib_request.HTTPSHandler()
    else:
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        context.verify_mode = (ssl.CERT_NONE
                               if opts.no_check_certificate
                               else ssl.CERT_REQUIRED)
        return compat_urllib_request.HTTPSHandler(context=context)
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None):
        """ tb, if given, is the original traceback (so that it can be printed out). """
        msg = msg + u'; please report this issue on GitHub.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception

    def format_traceback(self):
        """Return the stored traceback rendered as text, or None if absent."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Stored so callers can access the raw message directly.
        self.msg = msg
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Some servers send raw deflate streams without the zlib header;
        # try headerless first, then fall back to standard zlib.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl implementations lack the 'code' constructor arg.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the standard headers, replacing any already present.
        for h, v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD, or None if unparsable."""
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Not this format; try the next one.
            pass
    return upload_date
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # Approximate months/years as fixed numbers of days, since
        # timedelta has no month/year units.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            # Open-ended on the left: earliest representable date.
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            # Open-ended on the right: latest representable date.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())