2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
42 compat_socket_create_connection,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
53 # This is not clearly defined otherwise
54 compiled_regex_type = type(re.compile(''))
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
65 ENGLISH_MONTH_NAMES = [
66 'January', 'February', 'March', 'April', 'May', 'June',
67 'July', 'August', 'September', 'October', 'November', 'December']
70 def preferredencoding():
71 """Get preferred encoding.
73 Returns the best encoding scheme for the system, based on
74 locale.getpreferredencoding() and some further tweaks.
77 pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """
    # NOTE(review): several interior lines of this function (the else branch
    # header, the tempfile argument dict opening, and the try/except cleanup)
    # are elided in this view; only the surviving lines are annotated.
    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        # (else branch elided) Python 3 / win32 path: plain helpers suffice.
        path_basename = os.path.basename
        path_dirname = os.path.dirname
        # Entries of the NamedTemporaryFile argument dict: the temp file is
        # created next to the target so the final os.rename stays on the
        # same filesystem (keeping the replacement atomic where possible).
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    tf = tempfile.NamedTemporaryFile(**args)
    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
    os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Only simple attribute names and values are accepted; anything more
        # exotic would need escaping inside the XPath predicate built below.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + "[@%s='%s']" % (key, val)
        return node.find(expr)
    # NOTE(review): the `else:` introducing this Python 2.6 fallback and its
    # trailing return lines are elided in this view.
    def find_xpath_attr(node, xpath, key, val):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')
        for f in node.findall(xpath):
            # Manual scan replacing the [@key='val'] predicate, which 2.6's
            # ElementTree does not support.
            if f.attrib.get(key) == val:
158 # On python2.6 the xml.etree.ElementTree.Element methods don't support
159 # the namespace parameter
162 def xpath_with_ns(path, ns_map):
163 components = [c.split(':') for c in path.split('/')]
167 replaced.append(c[0])
170 replaced.append('{%s}%s' % (ns_map[ns], tag))
171 return '/'.join(replaced)
174 def xpath_text(node, xpath, name=None, fatal=False):
175 if sys.version_info < (2, 7): # Crazy 2.6
176 xpath = xpath.encode('ascii')
179 if n is None or n.text is None:
181 name = xpath if name is None else name
182 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper: an element ID is simply its "id" attribute.
    attribute_name = "id"
    return get_element_by_attribute(attribute_name, id, html)
193 def get_element_by_attribute(attribute, value, html):
194 """Return the content of the tag with the specified attribute in the passed HTML document"""
196 m = re.search(r'''(?xs)
198 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
200 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
204 ''' % (re.escape(attribute), re.escape(value)), html)
208 res = m.group('content')
210 if res.startswith('"') or res.startswith("'"):
213 return unescapeHTML(res)
216 def clean_html(html):
217 """Clean an HTML snippet into a readable string"""
219 if html is None: # Convenience for sanitizing descriptions etc.
223 html = html.replace('\n', ' ')
224 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
225 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
227 html = re.sub('<.*?>', '', html)
228 # Replace html entities
229 html = unescapeHTML(html)
233 def sanitize_open(filename, open_mode):
234 """Try to open the given filename, and slightly tweak it if this fails.
236 Attempts to open the given filename. If this fails, it tries to change
237 the filename slightly, step by step, until it's either able to open it
238 or it fails and raises a final exception, like the standard open()
241 It returns the tuple (stream, definitive_file_name).
245 if sys.platform == 'win32':
247 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
248 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
249 stream = open(encodeFilename(filename), open_mode)
250 return (stream, filename)
251 except (IOError, OSError) as err:
252 if err.errno in (errno.EACCES,):
255 # In case of error, try to remove win32 forbidden chars
256 alt_filename = os.path.join(
257 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
258 for path_part in os.path.split(filename)
260 if alt_filename == filename:
263 # An exception here should be caught in the caller
264 stream = open(encodeFilename(filename), open_mode)
265 return (stream, alt_filename)
268 def timeconvert(timestr):
269 """Convert RFC 2822 defined time string into system timestamp"""
271 timetuple = email.utils.parsedate_tz(timestr)
272 if timetuple is not None:
273 timestamp = email.utils.mktime_tz(timetuple)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    # NOTE(review): the docstring terminator, several replace_insane branches
    # and the tail of the function (is_id handling, final return) are elided
    # in this view.
    def replace_insane(char):
        # Per-character policy mapping forbidden characters to substitutes.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:
    # Keep timestamps like 12:34:56 readable by turning ':' into '_' first.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # Collapse runs of underscores produced by the substitutions above.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
312 def orderedSet(iterable):
313 """ Remove all duplicates from the input iterable """
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # NOTE(review): the guard checking that `mobj` matched and the lines
    # selecting base 10 vs 16 are elided in this view.
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])
    # Numeric character reference: &#NNN; (decimal) or &#xNNN; (hex).
    mobj = re.match(r'#(x?[0-9]+)', entity)
    numstr = mobj.group(1)
    if numstr.startswith('x'):
        # Prefix with '0' so int() accepts the '0x...' hexadecimal form.
        numstr = '0%s' % numstr
    return compat_chr(int(numstr, base))
    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
344 assert type(s) == compat_str
347 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
350 def encodeFilename(s, for_subprocess=False):
352 @param s The name of the file
355 assert type(s) == compat_str
357 # Python 3 has a Unicode API
358 if sys.version_info >= (3, 0):
361 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
362 # Pass '' directly to use Unicode APIs on Windows 2000 and up
363 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
364 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
365 if not for_subprocess:
368 # For subprocess calls, encode with locale encoding
369 # Refer to http://stackoverflow.com/a/9951851/35070
370 encoding = preferredencoding()
372 encoding = sys.getfilesystemencoding()
375 return s.encode(encoding, 'ignore')
378 def encodeArgument(s):
379 if not isinstance(s, compat_str):
380 # Legacy code that uses byte strings
381 # Uncomment the following line after fixing all post processors
382 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
383 s = s.decode('ascii')
384 return encodeFilename(s, True)
387 def decodeOption(optval):
390 if isinstance(optval, bytes):
391 optval = optval.decode(preferredencoding())
393 assert isinstance(optval, compat_str)
397 def formatSeconds(secs):
399 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
401 return '%d:%02d' % (secs // 60, secs % 60)
406 def make_HTTPS_handler(params, **kwargs):
407 opts_no_check_certificate = params.get('nocheckcertificate', False)
408 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
409 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
410 if opts_no_check_certificate:
411 context.check_hostname = False
412 context.verify_mode = ssl.CERT_NONE
414 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
417 # (create_default_context present but HTTPSHandler has no context=)
420 if sys.version_info < (3, 2):
421 return YoutubeDLHTTPSHandler(params, **kwargs)
423 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
424 context.verify_mode = (ssl.CERT_NONE
425 if opts_no_check_certificate
426 else ssl.CERT_REQUIRED)
427 context.set_default_verify_paths()
428 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        # NOTE(review): the docstring terminator and several lines (the
        # `expected`/`cause` guards, else branches, and the assignments of
        # self.traceback/self.expected/self.cause) are elided in this view.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += ' (caused by %r)' % cause
        if ytdl_is_updateable():
            update_cmd = 'type youtube-dl -U to update'
            update_cmd = 'see https://yt-dl.org/update on how to update'
        msg += '; please report this issue on https://yt-dl.org/bug .'
        msg += ' Make sure you are using the latest version; %s.' % update_cmd
        msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)
        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback as a printable string.
        # NOTE(review): a line between the guard and the join (presumably an
        # early `return None`) is elided, which is why the visible structure
        # looks inverted.
        if self.traceback is None:
            return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL."""

    def __init__(self, url):
        # An unsupported URL is a normal user-facing condition, not a bug,
        # so the error is flagged as "expected".
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
473 class RegexNotFoundError(ExtractorError):
474 """Error when a regex didn't match"""
478 class DownloadError(Exception):
479 """Download Error exception.
481 This exception may be thrown by FileDownloader objects if they are not
482 configured to continue on errors. They will contain the appropriate
486 def __init__(self, msg, exc_info=None):
487 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
488 super(DownloadError, self).__init__(msg)
489 self.exc_info = exc_info
492 class SameFileError(Exception):
493 """Same File exception.
495 This exception will be thrown by FileDownloader objects if they detect
496 multiple files would have to be downloaded to the same file on disk.
501 class PostProcessingError(Exception):
502 """Post Processing exception.
504 This exception may be raised by PostProcessor's .run() method to
505 indicate an error in the postprocessing task.
508 def __init__(self, msg):
512 class MaxDownloadsReached(Exception):
513 """ --max-downloads limit has been reached. """
517 class UnavailableVideoError(Exception):
518 """Unavailable Format exception.
520 This exception will be thrown when a video is requested
521 in a format that is not available for that video.
526 class ContentTooShortError(Exception):
527 """Content Too Short exception.
529 This exception may be raised by FileDownloader objects when a file they
530 download is too small for what the server announced first, indicating
531 the connection was probably interrupted.
537 def __init__(self, downloaded, expected):
538 self.downloaded = downloaded
539 self.expected = expected
542 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
543 hc = http_class(*args, **kwargs)
544 source_address = ydl_handler._params.get('source_address')
545 if source_address is not None:
546 sa = (source_address, 0)
547 if hasattr(hc, 'source_address'): # Python 2.7+
548 hc.source_address = sa
550 def _hc_connect(self, *args, **kwargs):
551 sock = compat_socket_create_connection(
552 (self.host, self.port), self.timeout, sa)
554 self.sock = ssl.wrap_socket(
555 sock, self.key_file, self.cert_file,
556 ssl_version=ssl.PROTOCOL_TLSv1)
559 hc.connect = functools.partial(_hc_connect, hc)
564 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
565 """Handler for HTTP requests and responses.
567 This class, when installed with an OpenerDirector, automatically adds
568 the standard headers to every HTTP request and handles gzipped and
569 deflated responses from web servers. If compression is to be avoided in
570 a particular request, the original request in the program code only has
571 to include the HTTP header "Youtubedl-No-Compression", which will be
572 removed before making the real request.
574 Part of this code was copied from:
576 http://techknack.net/python-urllib2-handlers/
578 Andrew Rowls, the author of that code, agreed to release it to the
582 def __init__(self, params, *args, **kwargs):
583 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
584 self._params = params
586 def http_open(self, req):
587 return self.do_open(functools.partial(
588 _create_http_connection, self, compat_http_client.HTTPConnection, False),
594 return zlib.decompress(data, -zlib.MAX_WBITS)
596 return zlib.decompress(data)
599 def addinfourl_wrapper(stream, headers, url, code):
600 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
601 return compat_urllib_request.addinfourl(stream, headers, url, code)
602 ret = compat_urllib_request.addinfourl(stream, headers, url)
606 def http_request(self, req):
607 for h, v in std_headers.items():
608 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
609 # The dict keys are capitalized because of this bug by urllib
610 if h.capitalize() not in req.headers:
612 if 'Youtubedl-no-compression' in req.headers:
613 if 'Accept-encoding' in req.headers:
614 del req.headers['Accept-encoding']
615 del req.headers['Youtubedl-no-compression']
617 if sys.version_info < (2, 7) and '#' in req.get_full_url():
618 # Python 2.6 is brain-dead when it comes to fragments
619 req._Request__original = req._Request__original.partition('#')[0]
620 req._Request__r_type = req._Request__r_type.partition('#')[0]
624 def http_response(self, req, resp):
627 if resp.headers.get('Content-encoding', '') == 'gzip':
628 content = resp.read()
629 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
631 uncompressed = io.BytesIO(gz.read())
632 except IOError as original_ioerror:
633 # There may be junk add the end of the file
634 # See http://stackoverflow.com/q/4928560/35070 for details
635 for i in range(1, 1024):
637 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
638 uncompressed = io.BytesIO(gz.read())
643 raise original_ioerror
644 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
645 resp.msg = old_resp.msg
647 if resp.headers.get('Content-encoding', '') == 'deflate':
648 gz = io.BytesIO(self.deflate(resp.read()))
649 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
650 resp.msg = old_resp.msg
653 https_request = http_request
654 https_response = http_response
657 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
658 def __init__(self, params, https_conn_class=None, *args, **kwargs):
659 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
660 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
661 self._params = params
663 def https_open(self, req):
665 if hasattr(self, '_context'): # python > 2.6
666 kwargs['context'] = self._context
667 if hasattr(self, '_check_hostname'): # python 3.x
668 kwargs['check_hostname'] = self._check_hostname
669 return self.do_open(functools.partial(
670 _create_http_connection, self, self._https_conn_class, True),
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    # NOTE(review): the None-guard on date_str and the re.search(...) call
    # that binds `m`, plus the surrounding if/else headers, are elided in
    # this view; only the surviving lines are shown.
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        timezone = datetime.timedelta()
        date_str = date_str[:-len(m.group(0))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
            # A numeric offset like +05:30 becomes a timedelta which is
            # subtracted below to normalise the timestamp to UTC.
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
700 def unified_strdate(date_str, day_first=True):
701 """Return a string with the date in the format YYYYMMDD"""
707 date_str = date_str.replace(',', ' ')
708 # %z (UTC offset) is only supported in python>=3.2
709 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
710 # Remove AM/PM + timezone
711 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
713 format_expressions = [
718 '%b %dst %Y %I:%M%p',
719 '%b %dnd %Y %I:%M%p',
720 '%b %dth %Y %I:%M%p',
726 '%Y-%m-%d %H:%M:%S.%f',
729 '%Y-%m-%dT%H:%M:%SZ',
730 '%Y-%m-%dT%H:%M:%S.%fZ',
731 '%Y-%m-%dT%H:%M:%S.%f0Z',
733 '%Y-%m-%dT%H:%M:%S.%f',
737 format_expressions.extend([
744 format_expressions.extend([
750 for expression in format_expressions:
752 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
755 if upload_date is None:
756 timetuple = email.utils.parsedate_tz(date_str)
758 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
762 def determine_ext(url, default_ext='unknown_video'):
765 guess = url.partition('?')[0].rpartition('.')[2]
766 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name for *filename*: the media extension is
    replaced by "<language>.<format>" (e.g. video.mp4 -> video.en.vtt)."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    # NOTE(review): several lines are elided in this view: the body of the
    # 'now'/'today' branch, the sign negation, the month/year-to-days
    # conversion, and the `today + delta` return for relative dates.
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad aproximation?
        delta = datetime.timedelta(**{unit: time})
    # Absolute form: plain YYYYMMDD.
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Inputs that do not look like YYYYMMDD are returned unchanged; the
    previous code fell through and implicitly returned None, losing the
    original string.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    # Not an 8-digit date: hand the input back untouched.
    return date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # NOTE(review): the else/if headers between the assignments below,
        # the `def day(...)` classmethod line, and the `def __str__` line are
        # elided in this view.
        if start is not None:
            self.start = date_from_str(start)
            # Open-ended start: earliest representable date.
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            # Open-ended end: latest representable date.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

        """Returns a range that only contains the given day"""

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Accept either a datetime.date or any string date_from_str() parses.
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
847 """ Returns the platform name as a compat_str """
848 res = platform.platform()
849 if isinstance(res, bytes):
850 res = res.decode(preferredencoding())
852 assert isinstance(res, compat_str)
856 def _windows_write_string(s, out):
857 """ Returns True if the string was written using special methods,
858 False if it has yet to be written out."""
859 # Adapted from http://stackoverflow.com/a/3259271/35070
862 import ctypes.wintypes
870 fileno = out.fileno()
871 except AttributeError:
872 # If the output stream doesn't have a fileno, it's virtual
874 except io.UnsupportedOperation:
875 # Some strange Windows pseudo files?
877 if fileno not in WIN_OUTPUT_IDS:
880 GetStdHandle = ctypes.WINFUNCTYPE(
881 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
882 (b"GetStdHandle", ctypes.windll.kernel32))
883 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
885 WriteConsoleW = ctypes.WINFUNCTYPE(
886 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
887 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
888 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
889 written = ctypes.wintypes.DWORD(0)
891 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
892 FILE_TYPE_CHAR = 0x0002
893 FILE_TYPE_REMOTE = 0x8000
894 GetConsoleMode = ctypes.WINFUNCTYPE(
895 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
896 ctypes.POINTER(ctypes.wintypes.DWORD))(
897 (b"GetConsoleMode", ctypes.windll.kernel32))
898 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
900 def not_a_console(handle):
901 if handle == INVALID_HANDLE_VALUE or handle is None:
903 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
904 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
909 def next_nonbmp_pos(s):
911 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
912 except StopIteration:
916 count = min(next_nonbmp_pos(s), 1024)
919 h, s, count if count else 2, ctypes.byref(written), None)
921 raise OSError('Failed to write string')
922 if not count: # We just wrote a non-BMP character
923 assert written.value == 2
926 assert written.value > 0
927 s = s[written.value:]
931 def write_string(s, out=None, encoding=None):
934 assert type(s) == compat_str
936 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
937 if _windows_write_string(s, out):
940 if ('b' in getattr(out, 'mode', '') or
941 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
942 byt = s.encode(encoding or preferredencoding(), 'ignore')
944 elif hasattr(out, 'buffer'):
945 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
946 byt = s.encode(enc, 'ignore')
947 out.buffer.write(byt)
953 def bytes_to_intlist(bs):
956 if isinstance(bs[0], int): # Python 3
959 return [ord(c) for c in bs]
962 def intlist_to_bytes(xs):
965 return struct_pack('%dB' % len(xs), *xs)
968 # Cross-platform file locking
969 if sys.platform == 'win32':
970 import ctypes.wintypes
973 class OVERLAPPED(ctypes.Structure):
975 ('Internal', ctypes.wintypes.LPVOID),
976 ('InternalHigh', ctypes.wintypes.LPVOID),
977 ('Offset', ctypes.wintypes.DWORD),
978 ('OffsetHigh', ctypes.wintypes.DWORD),
979 ('hEvent', ctypes.wintypes.HANDLE),
982 kernel32 = ctypes.windll.kernel32
983 LockFileEx = kernel32.LockFileEx
984 LockFileEx.argtypes = [
985 ctypes.wintypes.HANDLE, # hFile
986 ctypes.wintypes.DWORD, # dwFlags
987 ctypes.wintypes.DWORD, # dwReserved
988 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
989 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
990 ctypes.POINTER(OVERLAPPED) # Overlapped
992 LockFileEx.restype = ctypes.wintypes.BOOL
993 UnlockFileEx = kernel32.UnlockFileEx
994 UnlockFileEx.argtypes = [
995 ctypes.wintypes.HANDLE, # hFile
996 ctypes.wintypes.DWORD, # dwReserved
997 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
998 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
999 ctypes.POINTER(OVERLAPPED) # Overlapped
1001 UnlockFileEx.restype = ctypes.wintypes.BOOL
1002 whole_low = 0xffffffff
1003 whole_high = 0x7fffffff
1005 def _lock_file(f, exclusive):
1006 overlapped = OVERLAPPED()
1007 overlapped.Offset = 0
1008 overlapped.OffsetHigh = 0
1009 overlapped.hEvent = 0
1010 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1011 handle = msvcrt.get_osfhandle(f.fileno())
1012 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1013 whole_low, whole_high, f._lock_file_overlapped_p):
1014 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1016 def _unlock_file(f):
1017 assert f._lock_file_overlapped_p
1018 handle = msvcrt.get_osfhandle(f.fileno())
1019 if not UnlockFileEx(handle, 0,
1020 whole_low, whole_high, f._lock_file_overlapped_p):
1021 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
    def _lock_file(f, exclusive):
        # POSIX branch: flock() advisory locking -- exclusive for writers,
        # shared for readers.
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        # Release whatever advisory lock is currently held on f.
        fcntl.flock(f, fcntl.LOCK_UN)
1033 class locked_file(object):
1034 def __init__(self, filename, mode, encoding=None):
1035 assert mode in ['r', 'a', 'w']
1036 self.f = io.open(filename, mode, encoding=encoding)
1039 def __enter__(self):
1040 exclusive = self.mode != 'r'
1042 _lock_file(self.f, exclusive)
1048 def __exit__(self, etype, value, traceback):
1050 _unlock_file(self.f)
1057 def write(self, *args):
1058 return self.f.write(*args)
1060 def read(self, *args):
1061 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when the
    interpreter reports no filesystem encoding."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1069 def shell_quote(args):
1071 encoding = get_filesystem_encoding()
1073 if isinstance(a, bytes):
1074 # We may get a filename encoded with 'encodeFilename'
1075 a = a.decode(encoding)
1076 quoted_args.append(pipes.quote(a))
1077 return ' '.join(quoted_args)
1080 def takewhile_inclusive(pred, seq):
1081 """ Like itertools.takewhile, but include the latest evaluated element
1082 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Serialize the payload to JSON and tuck it into the URL fragment, which
    # servers ignore; unsmuggle_url() is the matching decoder.
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return '#'.join((url, fragment))
1097 def unsmuggle_url(smug_url, default=None):
1098 if '#__youtubedl_smuggle' not in smug_url:
1099 return smug_url, default
1100 url, _, sdata = smug_url.rpartition('#')
1101 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1102 data = json.loads(jsond)
def format_bytes(bytes):
    # Human-readable byte count using binary (1024-based) units.
    # NOTE(review): the None guard and the zero-bytes special case (with its
    # else: header) are elided in this view.
    if type(bytes) is str:
        bytes = float(bytes)
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1120 def parse_filesize(s):
1124 # The lower-case forms are of course incorrect and inofficial,
1125 # but we support those too
1163 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1165 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1169 num_str = m.group('num').replace(',', '.')
1170 mult = _UNIT_TABLE[m.group('unit')]
1171 return int(float(num_str) * mult)
1174 def get_term_width():
1175 columns = compat_getenv('COLUMNS', None)
1180 sp = subprocess.Popen(
1182 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1183 out, err = sp.communicate()
1184 return int(out.split()[1])
1190 def month_by_name(name):
1191 """ Return the number of a month by (locale-independently) English name """
1194 return ENGLISH_MONTH_NAMES.index(name) + 1
1199 def month_by_abbreviation(abbrev):
1200 """ Return the number of a month by (locale-independently) English
1204 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1209 def fix_xml_ampersands(xml_str):
1210 """Replace all the '&' by '&' in XML"""
1212 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1217 def setproctitle(title):
1218 assert isinstance(title, compat_str)
1220 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1223 title_bytes = title.encode('utf-8')
1224 buf = ctypes.create_string_buffer(len(title_bytes))
1225 buf.value = title_bytes
1227 libc.prctl(15, buf, 0, 0, 0)
1228 except AttributeError:
1229 return # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* with the prefix *start* removed, or *s* unchanged when it
    does not begin with that prefix.

    The visible code fell through after the if and implicitly returned None
    for non-matching input; the explicit fallback restores the string.
    """
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return *s* with the suffix *end* removed, or *s* unchanged when it
    does not end with that suffix.

    Two defects of the bare slice are fixed: strings lacking the suffix were
    truncated anyway, and an empty *end* made s[:-0] evaluate to s[:0], i.e.
    the empty string.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path component of *url*, ignoring query and fragment:
    'http://x/a/b/?q=1' -> 'b'."""
    parsed_path = compat_urlparse.urlparse(url).path
    trimmed = parsed_path.strip('/')
    components = trimmed.split('/')
    return components[-1]
class HEADRequest(compat_urllib_request.Request):
    """A Request subclass that issues an HTTP HEAD instead of the default verb.

    urllib picks the HTTP method by calling get_method(); the visible body
    was empty (implicitly returning None), which would have produced an
    invalid request line.
    """

    def get_method(self):
        # Override urllib's GET/POST auto-selection.
        return 'HEAD'
1254 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1257 v = getattr(v, get_attr, None)
1260 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, passing *default* through when v is None."""
    if v is None:
        return default
    return compat_str(v)
1267 def str_to_int(int_str):
1268 """ A more relaxed version of int_or_none """
1271 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale; *default* for None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1279 def parse_duration(s):
1280 if not isinstance(s, compat_basestring):
1288 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1289 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1293 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1294 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1296 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1298 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1303 if m.group('only_mins'):
1304 return float_or_none(m.group('only_mins'), invscale=60)
1305 if m.group('only_hours'):
1306 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1308 res += int(m.group('secs'))
1310 res += int(m.group('mins')) * 60
1311 if m.group('hours'):
1312 res += int(m.group('hours')) * 60 * 60
1314 res += int(m.group('days')) * 24 * 60 * 60
1316 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: ('a.mp4', 'tmp') -> 'a.tmp.mp4'."""
    stem, real_ext = os.path.splitext(filename)
    return '%s.%s%s' % (stem, ext, real_ext)
1325 def check_executable(exe, args=[]):
1326 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1327 args can be a list of arguments for a short output (like -version) """
1329 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1335 def get_exe_version(exe, args=['--version'],
1336 version_re=None, unrecognized='present'):
1337 """ Returns the version of the specified executable,
1338 or False if the executable is not present """
1340 out, _ = subprocess.Popen(
1342 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1345 if isinstance(out, bytes): # Python 2.x
1346 out = out.decode('ascii', 'ignore')
1347 return detect_exe_version(out, version_re, unrecognized)
1350 def detect_exe_version(output, version_re=None, unrecognized='present'):
1351 assert isinstance(output, compat_str)
1352 if version_re is None:
1353 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1354 m = re.search(version_re, output)
1361 class PagedList(object):
1363 # This is only useful for tests
1364 return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily via *pagefunc*.

    pagefunc(pagenum) must return an iterable with the entries of page
    *pagenum* (0-based), each page holding *pagesize* entries except
    possibly the last one.
    """
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known in advance.

    pagefunc(pagenum) returns the entries of page *pagenum* (0-based);
    there are *pagecount* pages of *pagesize* entries each (the last
    page may be shorter).
    """
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Drop the leading entries of the first fetched page
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page contains the last wanted entry
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Expand literal '\\UXXXXXXXX' escape sequences in *s* into the
    corresponding characters, leaving everything else untouched."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() cannot handle unicode input, so pre-encode to UTF-8.
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each URL component separately and reassemble the result
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
try:
    # Probe whether struct accepts a (unicode) str format argument
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that tolerates unicode format strings."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that tolerates unicode format strings."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file object and return its list of URLs.

    Decodes bytes lines as UTF-8, strips a leading BOM and surrounding
    whitespace, and skips blank lines and comments ('#', ';' or ']').
    Closes *batch_fd* when done.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            # Comment line -> filtered out below
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter was added in Python 2.7; emulate it with findall on 2.6
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse the XML document in string *s*, ignoring any doctype
    declaration, and return the root Element."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """Parse an age limit like '18' or '16+' into an int.

    Falls back to the US_RATINGS table for rating strings; returns None
    for None input or unknown values.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper ('cb({...});') and return the
    bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert simple JavaScript object notation to valid JSON:
    single-quoted strings, unquoted identifiers as keys/values and
    trailing commas before ']'."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            # Already a JSON string
            return v
        if v.startswith("'"):
            # Re-quote single-quoted strings, fixing up escapes
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before a closing bracket
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            # Position in the list == quality rank
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality ranks below everything known
            return -1
    return q
# Default output filename template; %(...)s fields are filled from the
# video's info dict.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the result (including the ellipses) fits in *length*
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
def is_outdated_version(version, limit, assume_new=True):
    """Return True if *version* is strictly older than *limit*.

    When *version* is empty or unparseable, fall back to *assume_new*
    (assume_new=True means "treat unknown versions as new").
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    frozen_build = hasattr(sys, 'frozen')
    return running_from_zip or frozen_build
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of an HTTP response handle, preferring
    the Content-Disposition filename and falling back to the
    Content-Type subtype."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return getheader('Content-Type').split("/")[1]
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Recognized byte-order marks and their encodings
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM found; assume UTF-8
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Work out the download protocol for *info_dict*: an explicit
    'protocol' field, a well-known URL prefix, a playlist/manifest
    extension, or finally the URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    all_rows = [header_row] + data
    # Width of each column = widest cell in that column
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*all_rows)]
    # Left-align every column but the last, with one space of padding
    left_aligned = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    format_str = ' '.join(left_aligned) + '%s'
    return '\n'.join(format_str % tuple(row) for row in all_rows)
1697 def _match_one(filter_part, dct):
1698 COMPARISON_OPERATORS = {
1706 operator_rex = re.compile(r'''(?x)\s*
1708 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1710 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1711 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1714 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1715 m = operator_rex.search(filter_part)
1717 op = COMPARISON_OPERATORS[m.group('op')]
1718 if m.group('strval') is not None:
1719 if m.group('op') not in ('=', '!='):
1721 'Operator %s does not support string values!' % m.group('op'))
1722 comparison_value = m.group('strval')
1725 comparison_value = int(m.group('intval'))
1727 comparison_value = parse_filesize(m.group('intval'))
1728 if comparison_value is None:
1729 comparison_value = parse_filesize(m.group('intval') + 'B')
1730 if comparison_value is None:
1732 'Invalid integer value %r in filter part %r' % (
1733 m.group('intval'), filter_part))
1734 actual_value = dct.get(m.group('key'))
1735 if actual_value is None:
1736 return m.group('none_inclusive')
1737 return op(actual_value, comparison_value)
1740 '': lambda v: v is not None,
1741 '!': lambda v: v is None,
1743 operator_rex = re.compile(r'''(?x)\s*
1744 (?P<op>%s)\s*(?P<key>[a-z_]+)
1746 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1747 m = operator_rex.search(filter_part)
1749 op = UNARY_OPERATORS[m.group('op')]
1750 actual_value = dct.get(m.group('key'))
1751 return op(actual_value)
1753 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # '&' separates conditions; all of them must hold
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
1763 def match_filter_func(filter_str):
1764 def _match_func(info_dict):
1765 if match_str(filter_str, info_dict):
1768 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1769 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)