2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
41 compat_socket_create_connection,
45 compat_urllib_parse_urlparse,
46 compat_urllib_request,
52 # This is not clearly defined otherwise
53 compiled_regex_type = type(re.compile(''))
56 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
57 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
58 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59 'Accept-Encoding': 'gzip, deflate',
60 'Accept-Language': 'en-us,en;q=0.5',
# Locale-independent English month names; index + 1 is the month number.
# Used by month_by_name() and month_by_abbreviation().
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
69 def preferredencoding():
70 """Get preferred encoding.
72 Returns the best encoding scheme for the system, based on
73 locale.getpreferredencoding() and some further tweaks.
76 pref = locale.getpreferredencoding()
84 def write_json_file(obj, fn):
85 """ Encode obj as JSON and write it to fn, atomically if possible """
87 fn = encodeFilename(fn)
88 if sys.version_info < (3, 0) and sys.platform != 'win32':
89 encoding = get_filesystem_encoding()
90 # os.path.basename returns a bytes object, but NamedTemporaryFile
91 # will fail if the filename contains non ascii characters unless we
92 # use a unicode object
93 path_basename = lambda f: os.path.basename(fn).decode(encoding)
94 # the same for os.path.dirname
95 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
97 path_basename = os.path.basename
98 path_dirname = os.path.dirname
102 'prefix': path_basename(fn) + '.',
103 'dir': path_dirname(fn),
107 # In Python 2.x, json.dump expects a bytestream.
108 # In Python 3.x, it writes to a character stream
109 if sys.version_info < (3, 0):
117 tf = tempfile.NamedTemporaryFile(**args)
122 if sys.platform == 'win32':
123 # Need to remove existing file on Windows, else os.rename raises
124 # WindowsError or FileExistsError.
129 os.rename(tf.name, fn)
138 if sys.version_info >= (2, 7):
139 def find_xpath_attr(node, xpath, key, val):
140 """ Find the xpath xpath[@key=val] """
141 assert re.match(r'^[a-zA-Z-]+$', key)
142 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
143 expr = xpath + "[@%s='%s']" % (key, val)
144 return node.find(expr)
146 def find_xpath_attr(node, xpath, key, val):
147 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
148 # .//node does not match if a node is a direct child of . !
149 if isinstance(xpath, compat_str):
150 xpath = xpath.encode('ascii')
152 for f in node.findall(xpath):
153 if f.attrib.get(key) == val:
157 # On python2.6 the xml.etree.ElementTree.Element methods don't support
158 # the namespace parameter
161 def xpath_with_ns(path, ns_map):
162 components = [c.split(':') for c in path.split('/')]
166 replaced.append(c[0])
169 replaced.append('{%s}%s' % (ns_map[ns], tag))
170 return '/'.join(replaced)
173 def xpath_text(node, xpath, name=None, fatal=False):
174 if sys.version_info < (2, 7): # Crazy 2.6
175 xpath = xpath.encode('ascii')
178 if n is None or n.text is None:
180 name = xpath if name is None else name
181 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper: an id lookup is just an attribute lookup.
    return get_element_by_attribute('id', id, html)
192 def get_element_by_attribute(attribute, value, html):
193 """Return the content of the tag with the specified attribute in the passed HTML document"""
195 m = re.search(r'''(?xs)
197 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
199 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
203 ''' % (re.escape(attribute), re.escape(value)), html)
207 res = m.group('content')
209 if res.startswith('"') or res.startswith("'"):
212 return unescapeHTML(res)
215 def clean_html(html):
216 """Clean an HTML snippet into a readable string"""
218 if html is None: # Convenience for sanitizing descriptions etc.
222 html = html.replace('\n', ' ')
223 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
224 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
226 html = re.sub('<.*?>', '', html)
227 # Replace html entities
228 html = unescapeHTML(html)
232 def sanitize_open(filename, open_mode):
233 """Try to open the given filename, and slightly tweak it if this fails.
235 Attempts to open the given filename. If this fails, it tries to change
236 the filename slightly, step by step, until it's either able to open it
237 or it fails and raises a final exception, like the standard open()
240 It returns the tuple (stream, definitive_file_name).
244 if sys.platform == 'win32':
246 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
247 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
248 stream = open(encodeFilename(filename), open_mode)
249 return (stream, filename)
250 except (IOError, OSError) as err:
251 if err.errno in (errno.EACCES,):
254 # In case of error, try to remove win32 forbidden chars
255 alt_filename = os.path.join(
256 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
257 for path_part in os.path.split(filename)
259 if alt_filename == filename:
262 # An exception here should be caught in the caller
263 stream = open(encodeFilename(filename), open_mode)
264 return (stream, alt_filename)
267 def timeconvert(timestr):
268 """Convert RFC 2822 defined time string into system timestamp"""
270 timetuple = email.utils.parsedate_tz(timestr)
271 if timetuple is not None:
272 timestamp = email.utils.mktime_tz(timetuple)
276 def sanitize_filename(s, restricted=False, is_id=False):
277 """Sanitizes a string so it could be used as part of a filename.
278 If restricted is set, use a stricter subset of allowed characters.
279 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
281 def replace_insane(char):
282 if char == '?' or ord(char) < 32 or ord(char) == 127:
285 return '' if restricted else '\''
287 return '_-' if restricted else ' -'
288 elif char in '\\/|*<>':
290 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
292 if restricted and ord(char) > 127:
297 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
298 result = ''.join(map(replace_insane, s))
300 while '__' in result:
301 result = result.replace('__', '_')
302 result = result.strip('_')
303 # Common case of "Foreign band name - English song title"
304 if restricted and result.startswith('-_'):
306 if result.startswith('-'):
307 result = '_' + result[len('-'):]
313 def orderedSet(iterable):
314 """ Remove all duplicates from the input iterable """
322 def _htmlentity_transform(entity):
323 """Transforms an HTML entity to a character."""
324 # Known non-numeric HTML entity
325 if entity in compat_html_entities.name2codepoint:
326 return compat_chr(compat_html_entities.name2codepoint[entity])
328 mobj = re.match(r'#(x?[0-9]+)', entity)
330 numstr = mobj.group(1)
331 if numstr.startswith('x'):
333 numstr = '0%s' % numstr
336 return compat_chr(int(numstr, base))
338 # Unknown entity in name, return its literal representation
339 return ('&%s;' % entity)
345 assert type(s) == compat_str
348 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
351 def encodeFilename(s, for_subprocess=False):
353 @param s The name of the file
356 assert type(s) == compat_str
358 # Python 3 has a Unicode API
359 if sys.version_info >= (3, 0):
362 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
363 # Pass '' directly to use Unicode APIs on Windows 2000 and up
364 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
365 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
366 if not for_subprocess:
369 # For subprocess calls, encode with locale encoding
370 # Refer to http://stackoverflow.com/a/9951851/35070
371 encoding = preferredencoding()
373 encoding = sys.getfilesystemencoding()
376 return s.encode(encoding, 'ignore')
379 def encodeArgument(s):
380 if not isinstance(s, compat_str):
381 # Legacy code that uses byte strings
382 # Uncomment the following line after fixing all post processors
383 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
384 s = s.decode('ascii')
385 return encodeFilename(s, True)
388 def decodeOption(optval):
391 if isinstance(optval, bytes):
392 optval = optval.decode(preferredencoding())
394 assert isinstance(optval, compat_str)
398 def formatSeconds(secs):
400 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
402 return '%d:%02d' % (secs // 60, secs % 60)
407 def make_HTTPS_handler(params, **kwargs):
408 opts_no_check_certificate = params.get('nocheckcertificate', False)
409 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
410 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
411 if opts_no_check_certificate:
412 context.check_hostname = False
413 context.verify_mode = ssl.CERT_NONE
415 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
418 # (create_default_context present but HTTPSHandler has no context=)
421 if sys.version_info < (3, 2):
422 return YoutubeDLHTTPSHandler(params, **kwargs)
424 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
425 context.verify_mode = (ssl.CERT_NONE
426 if opts_no_check_certificate
427 else ssl.CERT_REQUIRED)
428 context.set_default_verify_paths()
429 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
432 class ExtractorError(Exception):
433 """Error during info extraction."""
435 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
436 """ tb, if given, is the original traceback (so that it can be printed out).
437 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
440 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
442 if video_id is not None:
443 msg = video_id + ': ' + msg
445 msg += ' (caused by %r)' % cause
447 if ytdl_is_updateable():
448 update_cmd = 'type youtube-dl -U to update'
450 update_cmd = 'see https://yt-dl.org/update on how to update'
451 msg += '; please report this issue on https://yt-dl.org/bug .'
452 msg += ' Make sure you are using the latest version; %s.' % update_cmd
453 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
454 super(ExtractorError, self).__init__(msg)
457 self.exc_info = sys.exc_info() # preserve original exception
459 self.video_id = video_id
461 def format_traceback(self):
462 if self.traceback is None:
464 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal condition, not a youtube-dl bug.
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression fails to match."""
class DownloadError(Exception):
    """Download Error exception.

    Thrown by FileDownloader objects that are not configured to continue
    on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info())."""
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Raised by FileDownloader objects when multiple downloads would
    target the same file on disk."""
    pass
502 class PostProcessingError(Exception):
503 """Post Processing exception.
505 This exception may be raised by PostProcessor's .run() method to
506 indicate an error in the postprocessing task.
509 def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """Raised once the --max-downloads limit has been reached."""
    pass
class UnavailableVideoError(Exception):
    """Raised when a video is requested in a format that is not
    available for that video."""
    pass
class ContentTooShortError(Exception):
    """Raised by FileDownloader objects when a downloaded file is smaller
    than the size the server announced, indicating the connection was
    probably interrupted."""

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
543 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
544 hc = http_class(*args, **kwargs)
545 source_address = ydl_handler._params.get('source_address')
546 if source_address is not None:
547 sa = (source_address, 0)
548 if hasattr(hc, 'source_address'): # Python 2.7+
549 hc.source_address = sa
551 def _hc_connect(self, *args, **kwargs):
552 sock = compat_socket_create_connection(
553 (self.host, self.port), self.timeout, sa)
555 self.sock = ssl.wrap_socket(
556 sock, self.key_file, self.cert_file,
557 ssl_version=ssl.PROTOCOL_TLSv1)
560 hc.connect = functools.partial(_hc_connect, hc)
565 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
566 """Handler for HTTP requests and responses.
568 This class, when installed with an OpenerDirector, automatically adds
569 the standard headers to every HTTP request and handles gzipped and
570 deflated responses from web servers. If compression is to be avoided in
571 a particular request, the original request in the program code only has
572 to include the HTTP header "Youtubedl-No-Compression", which will be
573 removed before making the real request.
575 Part of this code was copied from:
577 http://techknack.net/python-urllib2-handlers/
579 Andrew Rowls, the author of that code, agreed to release it to the
    def __init__(self, params, *args, **kwargs):
        # Direct base-class call rather than super() — presumably because
        # urllib2 handler classes are old-style on Python 2; verify before changing.
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # youtube-dl options dict; read later when opening connections
        # (e.g. 'source_address' in _create_http_connection).
        self._params = params
587 def http_open(self, req):
588 return self.do_open(functools.partial(
589 _create_http_connection, self, compat_http_client.HTTPConnection, False),
595 return zlib.decompress(data, -zlib.MAX_WBITS)
597 return zlib.decompress(data)
600 def addinfourl_wrapper(stream, headers, url, code):
601 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
602 return compat_urllib_request.addinfourl(stream, headers, url, code)
603 ret = compat_urllib_request.addinfourl(stream, headers, url)
607 def http_request(self, req):
608 for h, v in std_headers.items():
609 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
610 # The dict keys are capitalized because of this bug by urllib
611 if h.capitalize() not in req.headers:
613 if 'Youtubedl-no-compression' in req.headers:
614 if 'Accept-encoding' in req.headers:
615 del req.headers['Accept-encoding']
616 del req.headers['Youtubedl-no-compression']
618 if sys.version_info < (2, 7) and '#' in req.get_full_url():
619 # Python 2.6 is brain-dead when it comes to fragments
620 req._Request__original = req._Request__original.partition('#')[0]
621 req._Request__r_type = req._Request__r_type.partition('#')[0]
625 def http_response(self, req, resp):
628 if resp.headers.get('Content-encoding', '') == 'gzip':
629 content = resp.read()
630 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
632 uncompressed = io.BytesIO(gz.read())
633 except IOError as original_ioerror:
634 # There may be junk add the end of the file
635 # See http://stackoverflow.com/q/4928560/35070 for details
636 for i in range(1, 1024):
638 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
639 uncompressed = io.BytesIO(gz.read())
644 raise original_ioerror
645 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
646 resp.msg = old_resp.msg
648 if resp.headers.get('Content-encoding', '') == 'deflate':
649 gz = io.BytesIO(self.deflate(resp.read()))
650 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
651 resp.msg = old_resp.msg
654 https_request = http_request
655 https_response = http_response
658 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        # Direct base-class call rather than super() — presumably because
        # urllib2 handler classes are old-style on Python 2; verify before changing.
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        # Connection class used by https_open(); defaults to the compat HTTPSConnection.
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        # youtube-dl options dict, consulted when creating connections.
        self._params = params
664 def https_open(self, req):
666 if hasattr(self, '_context'): # python > 2.6
667 kwargs['context'] = self._context
668 if hasattr(self, '_check_hostname'): # python 3.x
669 kwargs['check_hostname'] = self._check_hostname
670 return self.do_open(functools.partial(
671 _create_http_connection, self, self._https_conn_class, True),
675 def parse_iso8601(date_str, delimiter='T', timezone=None):
676 """ Return a UNIX timestamp from the given date """
683 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
686 timezone = datetime.timedelta()
688 date_str = date_str[:-len(m.group(0))]
689 if not m.group('sign'):
690 timezone = datetime.timedelta()
692 sign = 1 if m.group('sign') == '+' else -1
693 timezone = datetime.timedelta(
694 hours=sign * int(m.group('hours')),
695 minutes=sign * int(m.group('minutes')))
696 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
697 dt = datetime.datetime.strptime(date_str, date_format) - timezone
698 return calendar.timegm(dt.timetuple())
701 def unified_strdate(date_str, day_first=True):
702 """Return a string with the date in the format YYYYMMDD"""
708 date_str = date_str.replace(',', ' ')
709 # %z (UTC offset) is only supported in python>=3.2
710 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
711 # Remove AM/PM + timezone
712 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
714 format_expressions = [
719 '%b %dst %Y %I:%M%p',
720 '%b %dnd %Y %I:%M%p',
721 '%b %dth %Y %I:%M%p',
727 '%Y-%m-%d %H:%M:%S.%f',
730 '%Y-%m-%dT%H:%M:%SZ',
731 '%Y-%m-%dT%H:%M:%S.%fZ',
732 '%Y-%m-%dT%H:%M:%S.%f0Z',
734 '%Y-%m-%dT%H:%M:%S.%f',
738 format_expressions.extend([
745 format_expressions.extend([
751 for expression in format_expressions:
753 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
756 if upload_date is None:
757 timetuple = email.utils.parsedate_tz(date_str)
759 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
763 def determine_ext(url, default_ext='unknown_video'):
766 guess = url.partition('?')[0].rpartition('.')[2]
767 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name '<base>.<lang>.<format>' for *filename*."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
777 def date_from_str(date_str):
779 Return a datetime object from a string in the format YYYYMMDD or
780 (now|today)[+-][0-9](day|week|month|year)(s)?"""
781 today = datetime.date.today()
782 if date_str in ('now', 'today'):
784 if date_str == 'yesterday':
785 return today - datetime.timedelta(days=1)
786 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
787 if match is not None:
788 sign = match.group('sign')
789 time = int(match.group('time'))
792 unit = match.group('unit')
793 # A bad aproximation?
801 delta = datetime.timedelta(**{unit: time})
803 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings not matching 'YYYYMMDD' are returned unchanged (the visible
    code fell through and returned None for them).
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    return date_str
816 class DateRange(object):
817 """Represents a time interval between two dates"""
819 def __init__(self, start=None, end=None):
820 """start and end must be strings in the format accepted by date"""
821 if start is not None:
822 self.start = date_from_str(start)
824 self.start = datetime.datetime.min.date()
826 self.end = date_from_str(end)
828 self.end = datetime.datetime.max.date()
829 if self.start > self.end:
830 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
834 """Returns a range that only contains the given day"""
837 def __contains__(self, date):
838 """Check if the date is in the range"""
839 if not isinstance(date, datetime.date):
840 date = date_from_str(date)
841 return self.start <= date <= self.end
844 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
848 """ Returns the platform name as a compat_str """
849 res = platform.platform()
850 if isinstance(res, bytes):
851 res = res.decode(preferredencoding())
853 assert isinstance(res, compat_str)
857 def _windows_write_string(s, out):
858 """ Returns True if the string was written using special methods,
859 False if it has yet to be written out."""
860 # Adapted from http://stackoverflow.com/a/3259271/35070
863 import ctypes.wintypes
871 fileno = out.fileno()
872 except AttributeError:
873 # If the output stream doesn't have a fileno, it's virtual
875 except io.UnsupportedOperation:
876 # Some strange Windows pseudo files?
878 if fileno not in WIN_OUTPUT_IDS:
881 GetStdHandle = ctypes.WINFUNCTYPE(
882 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
883 (b"GetStdHandle", ctypes.windll.kernel32))
884 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
886 WriteConsoleW = ctypes.WINFUNCTYPE(
887 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
888 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
889 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
890 written = ctypes.wintypes.DWORD(0)
892 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
893 FILE_TYPE_CHAR = 0x0002
894 FILE_TYPE_REMOTE = 0x8000
895 GetConsoleMode = ctypes.WINFUNCTYPE(
896 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
897 ctypes.POINTER(ctypes.wintypes.DWORD))(
898 (b"GetConsoleMode", ctypes.windll.kernel32))
899 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
901 def not_a_console(handle):
902 if handle == INVALID_HANDLE_VALUE or handle is None:
904 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
905 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
910 def next_nonbmp_pos(s):
912 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
913 except StopIteration:
917 count = min(next_nonbmp_pos(s), 1024)
920 h, s, count if count else 2, ctypes.byref(written), None)
922 raise OSError('Failed to write string')
923 if not count: # We just wrote a non-BMP character
924 assert written.value == 2
927 assert written.value > 0
928 s = s[written.value:]
932 def write_string(s, out=None, encoding=None):
935 assert type(s) == compat_str
937 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
938 if _windows_write_string(s, out):
941 if ('b' in getattr(out, 'mode', '') or
942 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
943 byt = s.encode(encoding or preferredencoding(), 'ignore')
945 elif hasattr(out, 'buffer'):
946 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
947 byt = s.encode(enc, 'ignore')
948 out.buffer.write(byt)
954 def bytes_to_intlist(bs):
957 if isinstance(bs[0], int): # Python 3
960 return [ord(c) for c in bs]
963 def intlist_to_bytes(xs):
966 return struct_pack('%dB' % len(xs), *xs)
969 # Cross-platform file locking
970 if sys.platform == 'win32':
971 import ctypes.wintypes
974 class OVERLAPPED(ctypes.Structure):
976 ('Internal', ctypes.wintypes.LPVOID),
977 ('InternalHigh', ctypes.wintypes.LPVOID),
978 ('Offset', ctypes.wintypes.DWORD),
979 ('OffsetHigh', ctypes.wintypes.DWORD),
980 ('hEvent', ctypes.wintypes.HANDLE),
983 kernel32 = ctypes.windll.kernel32
984 LockFileEx = kernel32.LockFileEx
985 LockFileEx.argtypes = [
986 ctypes.wintypes.HANDLE, # hFile
987 ctypes.wintypes.DWORD, # dwFlags
988 ctypes.wintypes.DWORD, # dwReserved
989 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
990 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
991 ctypes.POINTER(OVERLAPPED) # Overlapped
993 LockFileEx.restype = ctypes.wintypes.BOOL
994 UnlockFileEx = kernel32.UnlockFileEx
995 UnlockFileEx.argtypes = [
996 ctypes.wintypes.HANDLE, # hFile
997 ctypes.wintypes.DWORD, # dwReserved
998 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
999 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1000 ctypes.POINTER(OVERLAPPED) # Overlapped
1002 UnlockFileEx.restype = ctypes.wintypes.BOOL
1003 whole_low = 0xffffffff
1004 whole_high = 0x7fffffff
1006 def _lock_file(f, exclusive):
1007 overlapped = OVERLAPPED()
1008 overlapped.Offset = 0
1009 overlapped.OffsetHigh = 0
1010 overlapped.hEvent = 0
1011 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1012 handle = msvcrt.get_osfhandle(f.fileno())
1013 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1014 whole_low, whole_high, f._lock_file_overlapped_p):
1015 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1017 def _unlock_file(f):
1018 assert f._lock_file_overlapped_p
1019 handle = msvcrt.get_osfhandle(f.fileno())
1020 if not UnlockFileEx(handle, 0,
1021 whole_low, whole_high, f._lock_file_overlapped_p):
1022 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1027 def _lock_file(f, exclusive):
1028 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1030 def _unlock_file(f):
1031 fcntl.flock(f, fcntl.LOCK_UN)
1034 class locked_file(object):
1035 def __init__(self, filename, mode, encoding=None):
1036 assert mode in ['r', 'a', 'w']
1037 self.f = io.open(filename, mode, encoding=encoding)
1040 def __enter__(self):
1041 exclusive = self.mode != 'r'
1043 _lock_file(self.f, exclusive)
1049 def __exit__(self, etype, value, traceback):
1051 _unlock_file(self.f)
1058 def write(self, *args):
1059 return self.f.write(*args)
1061 def read(self, *args):
1062 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1070 def shell_quote(args):
1072 encoding = get_filesystem_encoding()
1074 if isinstance(a, bytes):
1075 # We may get a filename encoded with 'encodeFilename'
1076 a = a.decode(encoding)
1077 quoted_args.append(pipes.quote(a))
1078 return ' '.join(quoted_args)
1081 def takewhile_inclusive(pred, seq):
1082 """ Like itertools.takewhile, but include the latest evaluated element
1083 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Attach JSON-encoded *data* to the URL fragment for internal use."""
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return '%s#%s' % (url, compat_urllib_parse.urlencode(payload))
1098 def unsmuggle_url(smug_url, default=None):
1099 if '#__youtubedl_smuggle' not in smug_url:
1100 return smug_url, default
1101 url, _, sdata = smug_url.rpartition('#')
1102 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1103 data = json.loads(jsond)
1107 def format_bytes(bytes):
1110 if type(bytes) is str:
1111 bytes = float(bytes)
1115 exponent = int(math.log(bytes, 1024.0))
1116 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1117 converted = float(bytes) / float(1024 ** exponent)
1118 return '%.2f%s' % (converted, suffix)
1121 def parse_filesize(s):
1125 # The lower-case forms are of course incorrect and inofficial,
1126 # but we support those too
1164 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1166 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1170 num_str = m.group('num').replace(',', '.')
1171 mult = _UNIT_TABLE[m.group('unit')]
1172 return int(float(num_str) * mult)
1175 def month_by_name(name):
1176 """ Return the number of a month by (locale-independently) English name """
1179 return ENGLISH_MONTH_NAMES.index(name) + 1
1184 def month_by_abbreviation(abbrev):
1185 """ Return the number of a month by (locale-independently) English
1189 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1194 def fix_xml_ampersands(xml_str):
1195 """Replace all the '&' by '&' in XML"""
1197 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1202 def setproctitle(title):
1203 assert isinstance(title, compat_str)
1205 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1208 title_bytes = title.encode('utf-8')
1209 buf = ctypes.create_string_buffer(len(title_bytes))
1210 buf.value = title_bytes
1212 libc.prctl(15, buf, 0, 0, 0)
1213 except AttributeError:
1214 return # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* with the prefix *start* stripped; unchanged if absent.

    The visible code fell through and returned None when the prefix did
    not match — restore the pass-through return.
    """
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return *s* with the suffix *end* stripped; unchanged if absent.

    Guard against the empty suffix: an unconditional s[:-len(end)] with
    len(end) == 0 would wrongly return ''.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last component of *url*'s path ('' when the path is empty)."""
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request subclass that issues an HTTP HEAD request."""

    def get_method(self):
        # Fix: the visible body had no return, so get_method yielded None
        # instead of selecting the HEAD verb.
        return 'HEAD'
1239 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1242 v = getattr(v, get_attr, None)
1245 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert *v* to a string, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1252 def str_to_int(int_str):
1253 """ A more relaxed version of int_or_none """
1256 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or *default* when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1264 def parse_duration(s):
1265 if not isinstance(s, compat_basestring):
1273 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1274 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1276 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
1279 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1280 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1282 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1284 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1289 if m.group('only_mins'):
1290 return float_or_none(m.group('only_mins'), invscale=60)
1291 if m.group('only_hours'):
1292 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1294 res += int(m.group('secs'))
1295 if m.group('mins_reversed'):
1296 res += int(m.group('mins_reversed')) * 60
1298 res += int(m.group('mins')) * 60
1299 if m.group('hours'):
1300 res += int(m.group('hours')) * 60 * 60
1301 if m.group('hours_reversed'):
1302 res += int(m.group('hours_reversed')) * 60 * 60
1304 res += int(m.group('days')) * 24 * 60 * 60
1306 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    name, real_ext = os.path.splitext(filename)
    return '%s.%s%s' % (name, ext, real_ext)
1315 def check_executable(exe, args=[]):
1316 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1317 args can be a list of arguments for a short output (like -version) """
1319 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1325 def get_exe_version(exe, args=['--version'],
1326 version_re=None, unrecognized='present'):
1327 """ Returns the version of the specified executable,
1328 or False if the executable is not present """
1330 out, _ = subprocess.Popen(
1332 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1335 if isinstance(out, bytes): # Python 2.x
1336 out = out.decode('ascii', 'ignore')
1337 return detect_exe_version(out, version_re, unrecognized)
1340 def detect_exe_version(output, version_re=None, unrecognized='present'):
1341 assert isinstance(output, compat_str)
1342 if version_re is None:
1343 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1344 m = re.search(version_re, output)
1351 class PagedList(object):
1353 # This is only useful for tests
1354 return len(self.getslice())
1357 class OnDemandPagedList(PagedList):
    def __init__(self, pagefunc, pagesize):
        # pagefunc: callable(pagenum) -> iterable of that page's results
        # (invoked lazily by getslice()).
        self._pagefunc = pagefunc
        # Number of items per (full) page; used by getslice() to map item
        # indices to page numbers.
        self._pagesize = pagesize
1362 def getslice(self, start=0, end=None):
1364 for pagenum in itertools.count(start // self._pagesize):
1365 firstid = pagenum * self._pagesize
1366 nextfirstid = pagenum * self._pagesize + self._pagesize
1367 if start >= nextfirstid:
1370 page_results = list(self._pagefunc(pagenum))
1373 start % self._pagesize
1374 if firstid <= start < nextfirstid
1378 ((end - 1) % self._pagesize) + 1
1379 if (end is not None and firstid <= end <= nextfirstid)
1382 if startv != 0 or endv is not None:
1383 page_results = page_results[startv:endv]
1384 res.extend(page_results)
1386 # A little optimization - if current page is not "full", ie. does
1387 # not contain page_size videos then we can assume that this page
1388 # is the last one - there are no more ids on further pages -
1389 # i.e. no need to query again.
1390 if len(page_results) + startv < self._pagesize:
1393 # If we got the whole page, but the next page is not interesting,
1394 # break out early as well
1395 if end == nextfirstid:
# PagedList whose total page count is known up front.
1400 class InAdvancePagedList(PagedList):
1401 def __init__(self, pagefunc, pagecount, pagesize):
1402 self._pagefunc = pagefunc
1403 self._pagecount = pagecount
1404 self._pagesize = pagesize
# NOTE(review): lines 1407 (res accumulator), 1409 (end_page = min(...),
# 1415/1417 (skip_elems guard/reset), 1421 break and the trailing
# res.extend/return are elided from this excerpt — code kept byte-identical.
1406 def getslice(self, start=0, end=None):
1408 start_page = start // self._pagesize
1410 self._pagecount if end is None else (end // self._pagesize + 1))
# skip_elems: offset into the first page; only_more: how many items remain.
1411 skip_elems = start - start_page * self._pagesize
1412 only_more = None if end is None else end - start
1413 for pagenum in range(start_page, end_page):
1414 page = list(self._pagefunc(pagenum))
1416 page = page[skip_elems:]
1418 if only_more is not None:
1419 if len(page) < only_more:
1420 only_more -= len(page)
1422 page = page[:only_more]
1429 def uppercase_escape(s):
# Decoder that interprets \UXXXXXXXX escape sequences.
1430 unicode_escape = codecs.getdecoder('unicode_escape')
# NOTE(review): the surrounding ``return re.sub(`` call (lines 1431, 1434)
# is elided from this excerpt — these are its pattern/replacement arguments.
1432 r'\\U[0-9a-fA-F]{8}',
1433 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    # Percent-quoting on Python 2 needs a byte string, not unicode.
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    value = s.encode('utf-8') if needs_bytes else s
    return compat_urllib_parse.quote(value, safe_chars)
1444 def escape_url(url):
1445 """Escape URL as suggested by RFC 3986"""
1446 url_parsed = compat_urllib_parse_urlparse(url)
# Re-escape each component individually; netloc is left untouched by the
# visible code. NOTE(review): the closing ``).geturl()`` (line 1452) is
# elided from this excerpt.
1447 return url_parsed._replace(
1448 path=escape_rfc3986(url_parsed.path),
1449 params=escape_rfc3986(url_parsed.params),
1450 query=escape_rfc3986(url_parsed.query),
1451 fragment=escape_rfc3986(url_parsed.fragment)
# Feature probe: on broken Python 2.6/2.7 builds struct rejects unicode
# format strings. NOTE(review): the try/except lines around this probe
# (1454, 1456, 1462, 1467) are elided from this excerpt.
1455 struct.pack('!I', 0)
1457 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1458 def struct_pack(spec, *args):
1459 if isinstance(spec, compat_str):
1460 spec = spec.encode('ascii')
1461 return struct.pack(spec, *args)
1463 def struct_unpack(spec, *args):
1464 if isinstance(spec, compat_str):
1465 spec = spec.encode('ascii')
1466 return struct.unpack(spec, *args)
# Fast path: modern interpreters can use struct.pack/unpack directly.
1468 struct_pack = struct.pack
1469 struct_unpack = struct.unpack
1472 def read_batch_urls(batch_fd):
# NOTE(review): the inner ``def fixup(url):`` line (1473) and its return
# statements (1479, 1481-1482) are elided from this excerpt — the lines
# below are the helper's body; code kept byte-identical.
1474 if not isinstance(url, compat_str):
1475 url = url.decode('utf-8', 'replace')
# Strip a UTF-8 BOM that survived decoding as three Latin-1 chars.
1476 BOM_UTF8 = '\xef\xbb\xbf'
1477 if url.startswith(BOM_UTF8):
1478 url = url[len(BOM_UTF8):]
# Lines starting with '#', ';' or ']' are treated as comments.
1480 if url.startswith(('#', ';', ']')):
1484 with contextlib.closing(batch_fd) as fd:
1485 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII-encoded bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Compat shim: Element.iter() exists on Python >= 2.7.
# NOTE(review): the opening ``try:`` line (1492) is elided from this excerpt.
1493 etree_iter = xml.etree.ElementTree.Element.iter
1494 except AttributeError: # Python <=2.6
1495 etree_iter = lambda n: n.findall('.//*')
# NOTE(review): this is the interior of ``parse_xml(s)`` — its ``def`` line
# (1498) is elided from this excerpt; code kept byte-identical.
# TreeBuilder subclass that silently drops DOCTYPE declarations.
1499 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1500 def doctype(self, name, pubid, system):
1501 pass # Ignore doctypes
1503 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# Python 2.6's XML() does not accept a ``parser`` keyword.
1504 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1505 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1506 # Fix up XML parser in Python 2.x
1507 if sys.version_info < (3, 0):
1508 for n in etree_iter(tree):
1509 if n.text is not None:
1510 if not isinstance(n.text, compat_str):
1511 n.text = n.text.decode('utf-8')
1524 def parse_age_limit(s):
# NOTE(review): lines 1525-1526 are elided from this excerpt — presumably a
# ``s is None`` guard; code kept byte-identical.
# Accept plain "18" or "18+" style values; otherwise fall back to the
# US_RATINGS lookup table.
1527 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1528 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1531 def strip_jsonp(code):
# NOTE(review): the ``return re.sub(`` line (1532) is elided from this
# excerpt — the line below carries the pattern/replacement arguments that
# strip the JSONP callback wrapper and keep only the payload (group 1).
1533 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1536 def js_to_json(code):
# NOTE(review): many original lines are elided from this excerpt (the inner
# fix_kv helper's def/returns, the dict of escape replacements, and the
# re.sub closing arguments) — code kept byte-identical.
1539 if v in ('true', 'false', 'null'):
1541 if v.startswith('"'):
1543 if v.startswith("'"):
# Normalize single-quoted JS strings: unescape \' and escape bare ".
1545 v = re.sub(r"\\\\|\\'|\"", lambda m: {
# Tokenizer: double-quoted string | single-quoted string | bare identifier.
1552 res = re.sub(r'''(?x)
1553 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1554 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1555 [a-zA-Z_][.a-zA-Z_0-9]*
# Drop trailing commas before a closing bracket (invalid in strict JSON).
1557 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1561 def qualities(quality_ids):
1562 """ Get a numeric quality value out of a list of possible values """
# NOTE(review): the inner ``def q(qid):`` line, its except-branch and the
# trailing ``return q`` are elided from this excerpt; the line below is the
# helper's body mapping a quality id to its list position.
1565 return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
1571 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1574 def limit_length(s, length):
1575 """ Add ellipses to overly long strings """
# NOTE(review): the None-guard, the ELLIPSES constant definition and the
# length check (lines 1576-1579) plus the pass-through return are elided
# from this excerpt; the visible line is the truncation branch.
1580 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
1588 def is_outdated_version(version, limit, assume_new=True):
# NOTE(review): the guard condition (line 1589) and the try/except
# ValueError wrapper (1591, 1593) are elided from this excerpt — when the
# version strings cannot be parsed, the result falls back to
# ``not assume_new``; code kept byte-identical.
1590 return not assume_new
1592 return version_tuple(version) < version_tuple(limit)
1594 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when running from a zip bundle or a frozen (py2exe) build.
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    """Build a short, shell-quoted string representation of a command."""
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
1609 def mimetype2ext(mt):
# Keep only the subtype part after the final '/'.
1610 _, _, res = mt.rpartition('/')
# NOTE(review): the ``return { ... }.get(res, res)`` mapping around this
# entry (lines 1611-1613, 1615+) is elided from this excerpt.
1614 'x-mp4-fragmented': 'mp4',
1618 def urlhandle_detect_ext(url_handle):
# NOTE(review): the try:, ``if cd:``/``if m:``/``if e:`` guard lines and
# the ``return e`` around the visible code are elided from this excerpt;
# code kept byte-identical.
# Python 3 exposes headers as a mapping; Python 2 via info().getheader.
1621 getheader = lambda h: url_handle.headers[h]
1622 except AttributeError: # Python < 3
1623 getheader = url_handle.info().getheader
# Prefer the filename from Content-Disposition when present...
1625 cd = getheader('Content-Disposition')
1627 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1629 e = determine_ext(m.group('filename'), default_ext=None)
# ...otherwise fall back to mapping the Content-Type.
1633 return mimetype2ext(getheader('Content-Type'))
1636 def age_restricted(content_limit, age_limit):
1637 """ Returns True iff the content should be blocked """
# NOTE(review): the ``return False`` belonging to the first guard (line
# 1640) is elided from this excerpt; code kept byte-identical.
1639 if age_limit is None: # No limit set
1641 if content_limit is None:
1642 return False # Content available for everyone
# Blocked only when the viewer's allowed age is below the content's limit.
1643 return age_limit < content_limit
1646 def is_html(first_bytes):
1647 """ Detect whether a file contains HTML by examining its first bytes. """
# NOTE(review): the ``BOMS = ["" list header/footer, the loop ``break`` and
# the ``else:`` branch lines are elided from this excerpt; code kept
# byte-identical.
# (BOM, encoding) pairs, longest BOMs first so prefixes don't shadow them.
1650 (b'\xef\xbb\xbf', 'utf-8'),
1651 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1652 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1653 (b'\xff\xfe', 'utf-16-le'),
1654 (b'\xfe\xff', 'utf-16-be'),
1656 for bom, enc in BOMS:
1657 if first_bytes.startswith(bom):
1658 s = first_bytes[len(bom):].decode(enc, 'replace')
# No BOM found: assume UTF-8 with replacement for undecodable bytes.
1661 s = first_bytes.decode('utf-8', 'replace')
# HTML-ish iff the decoded text starts with optional whitespace then '<'.
1663 return re.match(r'^\s*<', s)
1666 def determine_protocol(info_dict):
# An explicit protocol wins over any URL-based guess.
1667 protocol = info_dict.get('protocol')
1668 if protocol is not None:
# NOTE(review): the individual ``return`` lines for each branch (1669-1670,
# 1673, 1675, 1677-1678, 1680-1684) are elided from this excerpt; code kept
# byte-identical.
1671 url = info_dict['url']
1672 if url.startswith('rtmp'):
1674 elif url.startswith('mms'):
1676 elif url.startswith('rtsp'):
1679 ext = determine_ext(url)
# Fall back to the URL scheme when no special protocol matched.
1685 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # The widest cell in each column determines that column's width.
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Left-align every column but the last, padded to width + 1 characters.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
1696 def _match_one(filter_part, dct):
# NOTE(review): numerous original lines are elided from this excerpt (the
# COMPARISON_OPERATORS dict entries, the key group of the regex, several
# if/raise/try lines, and the UNARY_OPERATORS dict header) — code kept
# byte-identical throughout.
1697 COMPARISON_OPERATORS = {
# Regex: <key> <op> [?] <int-with-optional-SI-suffix | bare string>.
1705 operator_rex = re.compile(r'''(?x)\s*
1707 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1709 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1710 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1713 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1714 m = operator_rex.search(filter_part)
1716 op = COMPARISON_OPERATORS[m.group('op')]
# String comparisons only support equality / inequality.
1717 if m.group('strval') is not None:
1718 if m.group('op') not in ('=', '!='):
1720 'Operator %s does not support string values!' % m.group('op'))
1721 comparison_value = m.group('strval')
# Numeric values: plain int first, then filesize suffixes (e.g. "1.2M",
# then the same with an implicit trailing "B").
1724 comparison_value = int(m.group('intval'))
1726 comparison_value = parse_filesize(m.group('intval'))
1727 if comparison_value is None:
1728 comparison_value = parse_filesize(m.group('intval') + 'B')
1729 if comparison_value is None:
1731 'Invalid integer value %r in filter part %r' % (
1732 m.group('intval'), filter_part))
1733 actual_value = dct.get(m.group('key'))
# Missing keys pass only when the '?' (none-inclusive) marker was given.
1734 if actual_value is None:
1735 return m.group('none_inclusive')
1736 return op(actual_value, comparison_value)
# Unary form: "key" (present) or "!key" (absent).
1739 '': lambda v: v is not None,
1740 '!': lambda v: v is None,
1742 operator_rex = re.compile(r'''(?x)\s*
1743 (?P<op>%s)\s*(?P<key>[a-z_]+)
1745 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1746 m = operator_rex.search(filter_part)
1748 op = UNARY_OPERATORS[m.group('op')]
1749 actual_value = dct.get(m.group('key'))
1750 return op(actual_value)
# Neither comparison nor unary syntax matched.
1752 raise ValueError('Invalid filter part %r' % filter_part)
1755 def match_str(filter_str, dct):
1756 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
# NOTE(review): the ``return all("" line (1758) is elided from this excerpt;
# '&'-separated parts are AND-ed together via _match_one().
1759 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
1762 def match_filter_func(filter_str):
1763 def _match_func(info_dict):
1764 if match_str(filter_str, info_dict):
1767 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1768 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)