2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
31 import xml.etree.ElementTree
40 compat_socket_create_connection,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
# Compiled regex pattern objects have no public type name in older Pythons,
# so derive the type from a throwaway pattern (used for isinstance checks).
compiled_regex_type = type(re.compile(''))
55 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
56 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
57 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
58 'Accept-Encoding': 'gzip, deflate',
59 'Accept-Language': 'en-us,en;q=0.5',
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        # Python 3 / Windows: bytes handling above is unnecessary
        path_basename = os.path.basename
        path_dirname = os.path.dirname
        # Temporary file is created next to the target so os.rename is atomic
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    tf = tempfile.NamedTemporaryFile(**args)
    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
    os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + "[@%s='%s']" % (key, val)
        return node.find(expr)
    # Python 2.6 fallback: attribute predicates are not supported in xpath,
    # so filter matching nodes manually.
    def find_xpath_attr(node, xpath, key, val):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'ns:tag' steps in an xpath to '{uri}tag' using ns_map."""
    components = [c.split(':') for c in path.split('/')]
    # Plain step without a namespace prefix
    replaced.append(c[0])
    # Namespaced step: substitute the full URI from ns_map
    replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_text(node, xpath, name=None, fatal=False):
    """Get the text of the node matched by xpath; raise if fatal and absent."""
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    if n is None or n.text is None:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is just the generic attribute lookup with attribute "id".
    return get_element_by_attribute("id", id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
    ''' % (re.escape(attribute), re.escape(value)), html)

    res = m.group('content')

    # Strip the surrounding quotes of an unquoted-style attribute capture
    if res.startswith('"') or res.startswith("'"):
    return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.
    # Collapse newlines; <br> and paragraph boundaries become real newlines
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip any remaining tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if sys.platform == 'win32':
        # '-' means stdout here; put it into binary mode for raw media data
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
        # In case of error, try to remove win32 forbidden chars
        alt_filename = os.path.join(
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)
        if alt_filename == filename:
        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    # Visible code neither initialized nor returned the result; restore both
    # so unparseable input yields None instead of a NameError/implicit None.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible.
    """
    def replace_insane(char):
        # Map one character to its filesystem-safe replacement
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:

    # Keep timestamps like 12:34:56 readable by turning ':' into '_'
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # Collapse runs of underscores produced by the substitutions above
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # NOTE(review): presumably preserves first-occurrence order (body not
    # visible in this view) — confirm against callers.
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (#160) or hexadecimal (#x3C)
    mobj = re.match(r'#(x?[0-9]+)', entity)
    numstr = mobj.group(1)
    if numstr.startswith('x'):
        # Prefix with '0' so int() accepts the '0x...' form
        numstr = '0%s' % numstr
    return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
337 assert type(s) == compat_str
340 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def encodeFilename(s, for_subprocess=False):
    """Encode a filename for the current platform.

    @param s The name of the file
    """
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        encoding = sys.getfilesystemencoding()
    return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a value for use as a subprocess argument."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeOption(optval):
    """Decode a command-line option value to text, if it arrived as bytes."""
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or plain seconds."""
    # Visible code had two unconditional returns; restore the branch
    # conditions so each magnitude gets the appropriate format.
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honoring the 'nocheckcertificate' param."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable all certificate verification when requested
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

    # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                # Prefer TLSv1; fall back to SSLv23 negotiation
                self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        return YoutubeDLHTTPSHandler(params, https_conn_class=HTTPSConnectionV3, **kwargs)
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are always "expected" (not a youtube-dl bug)
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += ' (caused by %r)' % cause
        # Unexpected errors get bug-report boilerplate appended
        if ytdl_is_updateable():
            update_cmd = 'type  youtube-dl -U  to update'
            update_cmd = 'see  https://yt-dl.org/update  on how to update'
        msg += '; please report this issue on https://yt-dl.org/bug .'
        msg += ' Make sure you are using the latest version; %s.' % update_cmd
        msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # presumably both are byte counts — confirm against callers
        self.downloaded = downloaded
        self.expected = expected
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, binding it to the configured source address."""
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        # Older Pythons: replace connect() so the socket binds to sa manually

        def _hc_connect(self, *args, **kwargs):
            sock = compat_socket_create_connection(
                (self.host, self.port), self.timeout, sa)
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file)
        hc.connect = functools.partial(_hc_connect, hc)
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Route connection creation through the source-address-aware helper
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
        # Raw deflate stream (no zlib header), then zlib-wrapped fallback
        return zlib.decompress(data, -zlib.MAX_WBITS)
        return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Older urllib addinfourl lacks the code argument / getcode()
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            uncompressed = io.BytesIO(gz.read())
        except IOError as original_ioerror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

    https_request = http_request
    https_response = http_response
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that routes connections through _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """
    # Trailing fraction-of-second plus 'Z' or a numeric +HH:MM / -HHMM offset
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
    timezone = datetime.timedelta()
    # Strip the matched timezone suffix before strptime
    date_str = date_str[:-len(m.group(0))]
    if not m.group('sign'):
        timezone = datetime.timedelta()
        sign = 1 if m.group('sign') == '+' else -1
        timezone = datetime.timedelta(
            hours=sign * int(m.group('hours')),
            minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)

    format_expressions = [
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
    # day_first toggles between DD.MM and MM.DD interpretations
    format_expressions.extend([
    format_expressions.extend([
    for expression in format_expressions:
        upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Last resort: RFC 2822 parsing
        timetuple = email.utils.parsedate_tz(date_str)
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, ignoring the query string."""
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<sub_lang>.<sub_format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad aproximation?
        delta = datetime.timedelta(**{unit: time})
    # Plain YYYYMMDD
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings not matching that format are returned unchanged.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    # Visible code fell off the end and returned None for non-matching
    # input; return the input unchanged instead.
    return date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            # Open-ended start: earliest representable date
            self.start = datetime.datetime.min.date()
        self.end = date_from_str(end)
        # Open-ended end: latest representable date
        self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    """Returns a range that only contains the given day"""

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
853 """ Returns the platform name as a compat_str """
854 res = platform.platform()
855 if isinstance(res, bytes):
856 res = res.decode(preferredencoding())
858 assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    if fileno not in WIN_OUTPUT_IDS:

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle is only a real console if it is a character device
        # and GetConsoleMode succeeds on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    def next_nonbmp_pos(s):
        # WriteConsoleW counts UTF-16 code units, so split before any
        # character outside the Basic Multilingual Plane
        return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
    except StopIteration:

    count = min(next_nonbmp_pos(s), 1024)
        h, s, count if count else 2, ctypes.byref(written), None)
        raise OSError('Failed to write string')
    if not count:  # We just wrote a non-BMP character
        assert written.value == 2
    assert written.value > 0
    s = s[written.value:]
def write_string(s, out=None, encoding=None):
    """Write a text string to out (default stderr), handling console quirks."""
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode explicitly
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if isinstance(bs[0], int):  # Python 3
    return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a byte string."""
    return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
        # Matches the Win32 OVERLAPPED struct layout for LockFileEx
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: low/high 32-bit halves of the byte range
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # POSIX: flock does both shared and exclusive whole-file locks
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
class locked_file(object):
    """File wrapper holding an OS-level lock for the duration of a with-block."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Reading takes a shared lock; writing/appending an exclusive one
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Name of the filesystem encoding, defaulting to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
def shell_quote(args):
    """Quote a list of arguments for display as a shell command line."""
    encoding = get_filesystem_encoding()
    if isinstance(a, bytes):
        # We may get a filename encoded with 'encodeFilename'
        a = a.decode(encoding)
    quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    payload = json.dumps(data)
    sdata = compat_urllib_parse.urlencode({'__youtubedl_smuggle': payload})
    return '#'.join((url, sdata))
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data); (smug_url, default) if unsmuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    # Visible code was missing this return, yielding None for smuggled URLs
    return url, data
def format_bytes(bytes):
    """Format a byte count with a binary suffix, e.g. 1.50MiB."""
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human-readable file size ('1.2MiB') into a byte count."""
    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)

    # Accept ',' as decimal separator as well
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def get_term_width():
    """Try to determine the terminal width, from $COLUMNS or stty."""
    columns = compat_getenv('COLUMNS', None)

    sp = subprocess.Popen(
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = sp.communicate()
    # 'stty size' prints "rows cols"
    return int(out.split()[1])
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
    # index() is 0-based; months are 1-based
    return ENGLISH_NAMES.index(name) + 1
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave existing entities and character references untouched
    r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    """Set the process title shown by ps/top (Linux glibc only)."""
    assert isinstance(title, compat_str)
    libc = ctypes.cdll.LoadLibrary("libc.so.6")
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 = PR_SET_NAME
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if the prefix is absent."""
    if s.startswith(start):
        return s[len(start):]
    # Visible code fell off the end and returned None here
    return s
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if the suffix is absent."""
    # Guard on a non-empty suffix: s[:-0] would wrongly yield '' for end == ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path component of the URL (empty path gives '')."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is HEAD instead of GET."""

    def get_method(self):
        # Visible code was missing this return, making the method yield None
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int (scaled by invscale/scale), or default when v is None.

    When get_attr is given, the named attribute of v is converted instead.
    """
    # Visible code applied getattr unconditionally; restore the guards so a
    # plain value and a None input are handled correctly.
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert v to compat_str, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Strip thousands separators and stray '+' signs before converting
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float (scaled by invscale/scale), or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration like '1:23:45' or '2 hours' into seconds."""
    # basestring only exists on Python 2
    if not isinstance(s, basestring if sys.version_info < (3, 0) else compat_str):
        (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
        (?P<only_hours>[0-9.]+)\s*(?:hours?)|
        (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
        (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    res += int(m.group('secs'))
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    # Fractional seconds
    res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext before the real extension: 'a.mp4' -> 'a.<ext>.mp4'."""
    name, real_ext = os.path.splitext(filename)
    return '%s.%s%s' % (name, ext, real_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE(review): mutable default 'args=[]' is never mutated here, but a
    # tuple default would be safer.
    subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    out, _ = subprocess.Popen(
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
class PagedList(object):
    """Base class for lazily-fetched, page-backed sequences."""
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand until the slice is satisfied."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:

            page_results = list(self._pagefunc(pagenum))
            # Clip the first and last pages to the requested [start, end)
                start % self._pagesize
                if firstid <= start < nextfirstid
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """PagedList whose total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc    # callable: pagenum -> iterable of entries
        self._pagecount = pagecount  # total number of pages
        self._pagesize = pagesize    # entries per page

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Leading entries of the first page that fall before `start`
        skip_elems = start - start_page * self._pagesize
        # How many entries are still wanted (None = all remaining)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested range
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Expand literal \\UXXXXXXXX escape sequences in *s* into the
    characters they denote; everything else passes through untouched."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
def escape_rfc3986(s):
    """Percent-encode any character not allowed verbatim by RFC 3986."""
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        # Python 2: text strings must become UTF-8 bytes before quoting
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component separately so that the structural separators
    # ('/', '?', '#', ...) are left intact.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
try:
    # Probe whether this interpreter's struct accepts a text format string
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # struct handles text format strings natively; use it directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line, skipping blank
    lines and comments.  The file object is closed afterwards."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Drop a UTF-8 BOM that survived decoding (shows up as these three
        # characters when the file was read with a different codec)
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with '#', ';' or ']' are treated as comments
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as ASCII bytes, ready to
    be used directly as an HTTP POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
try:
    # Element.iter exists from Python 2.7 onwards
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    # findall('.//*') walks all descendants, like iter() minus the root
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse the XML document in string *s*, ignoring any doctype and
    normalizing text nodes to unicode on Python 2."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The custom parser keyword is only supported from Python 2.7 on
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """Parse an age limit like '18' or '16+' into an integer number of
    years; falls back to the US_RATINGS table, and None stays None."""
    if s is None:
        return None
    match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if match:
        return int(match.group('age'))
    return US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper — callback(...) optionally followed by a
    trailing // comment — leaving only the JSON payload."""
    wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(wrapper, r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            v = v[1:-1]
            # Re-escape for JSON: keep double backslashes, unescape \' and
            # escape any embedded double quote
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        # Bare identifiers and single-quoted strings become double-quoted
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before a closing bracket
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def rank(quality_id):
        # Position in the list is the quality rank (worst first);
        # unknown ids rank below everything else.
        try:
            return quality_ids.index(quality_id)
        except ValueError:
            return -1
    return rank
1572 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """Truncate *s* to at most *length* characters, ending in '...';
    None passes through unchanged."""
    if s is None:
        return None
    ellipses = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ellipses)] + ellipses
def version_tuple(v):
    """Split a version string like '2014.12.06-1' on dots and dashes and
    return its numeric components as a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* sorts before *limit*; *assume_new*
    decides the answer when either string is missing or unparseable."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        # Non-numeric component — cannot compare reliably
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Self-update only works when running from the zip bundle or a
    # frozen (py2exe-style) executable.
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    """Render a subprocess argument list as a single shell-quoted string
    (for display purposes only)."""
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a urllib response, preferring the
    Content-Disposition filename over the Content-Type subtype."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    # Fall back to the MIME subtype, e.g. 'video/mp4' -> 'mp4'
    return getheader('Content-Type').split("/")[1]
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # Either no viewer limit is configured, or the content itself
        # carries no age restriction — available to everyone.
        return False
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Strip a leading byte-order mark, decoding with the matching codec;
    # without a BOM, assume UTF-8.  Order matters: the UTF-32 BOMs must be
    # tested before their UTF-16 prefixes.
    boms = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, encoding in boms:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(encoding, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    # HTML starts (after optional whitespace) with a '<'
    return re.match(r'^\s*<', decoded)