2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
41 compat_socket_create_connection,
45 compat_urllib_parse_urlparse,
46 compat_urllib_request,
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers sent with every request.
# [elided in excerpt: the "std_headers = {" opener and closing brace]
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-us,en;q=0.5',

# Locale-independent English month names, index 0 == January.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = locale.getpreferredencoding()
    # [elided in excerpt: validation of `pref` and the return statement]
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    # [elided in excerpt: else-branch opener]
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # [elided in excerpt: "args = {" opener and 'suffix' entry]
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
    # [elided in excerpt: 'delete': False and closing brace]

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        # [elided in excerpt: mode/encoding selection for the temp file]

    tf = tempfile.NamedTemporaryFile(**args)

    # [elided in excerpt: try-block that json.dump()s obj into tf and closes it]
    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
        # [elided in excerpt: best-effort os.unlink(fn)]
    os.rename(tf.name, fn)
    # [elided in excerpt: exception cleanup removing the temp file]
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Locate the first element under *node* matching xpath[@key=val]."""
        # Both key and val are interpolated into the predicate below, so
        # restrict them to characters that cannot break the expression.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find(xpath + "[@%s='%s']" % (key, val))
# Python 2.6 fallback; [elided in excerpt: the "else:" opener of the
# version check above].
    def find_xpath_attr(node, xpath, key, val):
        """Linear-scan fallback for find_xpath_attr on Python 2.6."""
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                # [elided in excerpt: return f, and the trailing return None]
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter

def xpath_with_ns(path, ns_map):
    """Expand 'ns:tag' steps of an xpath using ns_map ('ns:tag' -> '{uri}tag')."""
    components = [c.split(':') for c in path.split('/')]
    # [elided in excerpt: loop header and the branch for unprefixed steps]
            replaced.append(c[0])
    # [elided in excerpt: else branch unpacking ns, tag]
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the element matched by xpath, raising when fatal."""
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    # [elided in excerpt: n = node.find(xpath)]
    if n is None or n.text is None:
        # [elided in excerpt: "if fatal:" guard]
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
    # [elided in excerpt: return None / return n.text]
def get_element_by_id(id, html):
    """Extract the inner content of the tag whose id attribute equals *id*."""
    # An id lookup is simply an attribute lookup on the "id" attribute.
    return get_element_by_attribute("id", id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # [elided in excerpt: several lines of the verbose regex below, including
    # the opening tag/content/closing tag groups]
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
    ''' % (re.escape(attribute), re.escape(value)), html)

    # [elided in excerpt: no-match guard]
    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):
        # [elided in excerpt: quote stripping]

    return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        # [elided in excerpt: early return]

    # Newline handling: collapse literal newlines, turn <br> and </p><p>
    # into newlines.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip remaining HTML tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    # [elided in excerpt: whitespace strip and return]
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # [elided in excerpt: "try:" and the '-' (stdout) special case]
        if sys.platform == 'win32':
            # [elided in excerpt: import msvcrt]
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            # [elided in excerpt: re-raise — permission errors are not fixable here]
        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # [elided in excerpt: re-raise — sanitizing changed nothing]
        # [elided in excerpt: else branch]
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # [elided in excerpt: timestamp default (None)]
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    # [elided in excerpt: return timestamp]
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Control characters, '?' and DEL are never allowed.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            # [elided in excerpt: return '']
        # [elided in excerpt: elif branch for '"']
            return '' if restricted else '\''
        # [elided in excerpt: elif branch for ':']
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            # [elided in excerpt: return '_']
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            # [elided in excerpt: return '_']
        if restricted and ord(char) > 127:
            # [elided in excerpt: return '_']
        # [elided in excerpt: return char]

    # Keep timestamps like 12:34:56 readable by turning ':' into '_'.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # [elided in excerpt: "if not is_id:" guard around the cleanup below]
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            # [elided in excerpt: result = result[2:]]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        # [elided in excerpt: empty-result fallback and return]
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        # [elided in excerpt: return s unchanged on non-Windows]
    drive, _ = os.path.splitdrive(s)
    # NOTE(review): os.path.splitunc is Windows-only and was removed in
    # Python 3.7 — confirm against the project's supported versions.
    unc, _ = os.path.splitunc(s)
    unc_or_drive = unc or drive
    norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep)
    # [elided in excerpt: "sanitized_path = [" list-comprehension opener]
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
        for path_part in norm_path]
    # [elided in excerpt: guard re-attaching the drive/UNC root]
        sanitized_path.insert(0, unc_or_drive + os.path.sep)
    return os.path.join(*sanitized_path)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # [elided in excerpt: the function body — presumably preserves
    # first-seen order; verify against the full source]
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference, decimal or hexadecimal (leading 'x').
    mobj = re.match(r'#(x?[0-9]+)', entity)
    # [elided in excerpt: "if mobj is not None:" guard]
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            # [elided in excerpt: base = 16]
            numstr = '0%s' % numstr
        # [elided in excerpt: else branch setting base = 10]
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
# [elided in excerpt: "def unescapeHTML(s):" header and None passthrough]
    assert type(s) == compat_str

    # Replace every &entity; with the character it denotes.
    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess encode with the locale encoding (for subprocess
           command lines) instead of the filesystem encoding
    """
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        # [elided in excerpt: return s unchanged]

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            # [elided in excerpt: return s unchanged]
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    # [elided in excerpt: else branch]
        encoding = sys.getfilesystemencoding()
    # [elided in excerpt: None fallback for encoding]
    return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument; byte strings are legacy and decoded first."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeOption(optval):
    """Decode a command-line option value to compat_str."""
    # [elided in excerpt: None passthrough]
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    # [elided in excerpt: return optval]
def formatSeconds(secs):
    """Format a duration in seconds as [H:]MM:SS-style text."""
    # [elided in excerpt: "if secs > 3600:" branch header]
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    # [elided in excerpt: "elif secs > 60:" branch header]
        return '%d:%02d' % (secs // 60, secs % 60)
    # [elided in excerpt: plain-seconds fallback]
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate' option."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        # [elided in excerpt: "try:" header]
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        # [elided in excerpt: TypeError fallthrough]
        # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    # [elided in excerpt: else branch]
        # NOTE(review): TLSv1-only context; modern code would use
        # create_default_context / PROTOCOL_TLS instead.
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are always treated as expected errors.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            # [elided in excerpt: expected = True]
        if video_id is not None:
            msg = video_id + ': ' + msg
        # [elided in excerpt: "if cause:" guard]
            msg += ' (caused by %r)' % cause
        # [elided in excerpt: "if not expected:" guard around the bug-report boilerplate]
            if ytdl_is_updateable():
                update_cmd = 'type youtube-dl -U to update'
            # [elided in excerpt: else branch]
                update_cmd = 'see https://yt-dl.org/update on how to update'
            msg += '; please report this issue on https://yt-dl.org/bug .'
            msg += ' Make sure you are using the latest version; %s.' % update_cmd
            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        # [elided in excerpt: self.traceback / self.expected / self.cause assignments]
        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as text."""
        if self.traceback is None:
            # [elided in excerpt: return None]
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL (an expected error)."""

    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # [elided in excerpt: class body (pass)]
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # [elided in excerpt: self.msg assignment]
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    # [elided in excerpt: class body (pass)]
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both sizes are in bytes.
        self.downloaded = downloaded
        self.expected = expected
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, honouring the 'source_address' option."""
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        # [elided in excerpt: else branch — Python 2.6 connect() monkey-patch]
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                # [elided in excerpt: "if is_https:" guard]
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                # [elided in excerpt: else: self.sock = sock]
            hc.connect = functools.partial(_hc_connect, hc)

    # [elided in excerpt: return hc]
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params  # youtube-dl option dict

    def http_open(self, req):
        # Route connection creation through _create_http_connection so that
        # options like source_address are honoured.
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            # [elided in excerpt: req argument and closing paren]

    # [elided in excerpt: "@staticmethod / def deflate(data): / try:" header]
            return zlib.decompress(data, -zlib.MAX_WBITS)
        # [elided in excerpt: except zlib.error — zlib-wrapped deflate fallback]
            return zlib.decompress(data)

    # [elided in excerpt: @staticmethod decorator]
    def addinfourl_wrapper(stream, headers, url, code):
        # Compatibility shim: older addinfourl has no code/getcode support.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        # [elided in excerpt: ret.code = code; return ret]

    def http_request(self, req):
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                # [elided in excerpt: req.add_header(h, v)]
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        # [elided in excerpt: return req]

    def http_response(self, req, resp):
        # [elided in excerpt: old_resp = resp]
        # gzip-compressed responses
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            # [elided in excerpt: "try:" header]
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    # [elided in excerpt: "try:" header]
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    # [elided in excerpt: except IOError: continue / break]
                # [elided in excerpt: for-else]
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate-compressed responses
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # [elided in excerpt: return resp]

    https_request = http_request
    https_response = http_response
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler, with an optional SSL context."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # [elided in excerpt: kwargs = {}]
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            # [elided in excerpt: req, **kwargs and closing paren]
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    # [elided in excerpt: None guard and the "m = re.search(" opener of the
    # timezone-suffix pattern]
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        # [elided in excerpt: date_str) argument]
    # [elided in excerpt: "if not m:" guard]
        timezone = datetime.timedelta()
    # [elided in excerpt: else branch]
        date_str = date_str[:-len(m.group(0))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        # [elided in excerpt: else branch]
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    # [elided in excerpt: None guard and upload_date = None]
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        # [elided in excerpt: several patterns]
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        # [elided in excerpt: several patterns]
        '%Y-%m-%d %H:%M:%S.%f',
        # [elided in excerpt: patterns]
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        # [elided in excerpt: patterns]
        '%Y-%m-%dT%H:%M:%S.%f',
        # [elided in excerpt: remaining patterns and closing bracket]
    # [elided in excerpt: "if day_first:" guard]
        format_expressions.extend([
            # [elided in excerpt: day-first patterns]
    # [elided in excerpt: else branch]
        format_expressions.extend([
            # [elided in excerpt: month-first patterns]
    for expression in format_expressions:
        # [elided in excerpt: "try:" header]
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        # [elided in excerpt: except ValueError: pass]
    if upload_date is None:
        # Last resort: RFC 2822-style dates.
        timetuple = email.utils.parsedate_tz(date_str)
        # [elided in excerpt: "if timetuple:" guard]
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    # [elided in excerpt: return upload_date]
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to default_ext."""
    # [elided in excerpt: None guard]
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        # [elided in excerpt: return guess, and the default_ext fallback]
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name: <base>.<sub_lang>.<sub_format>."""
    base = filename.rsplit('.', 1)[0]
    return base + '.' + sub_lang + '.' + sub_format
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        # [elided in excerpt: return today]
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        # [elided in excerpt: negate time when sign is '-']
        unit = match.group('unit')
        # A bad aproximation?
        # [elided in excerpt: month ≈ 30 days, year ≈ 365 days conversions
        # and pluralizing the unit name]
        delta = datetime.timedelta(**{unit: time})
        # [elided in excerpt: return today + delta]
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    # [elided in excerpt: return date_str unchanged when it does not match]
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        # [elided in excerpt: else branch — open-ended start]
            self.start = datetime.datetime.min.date()
        # [elided in excerpt: "if end is not None:" guard]
            self.end = date_from_str(end)
        # [elided in excerpt: else branch — open-ended end]
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    # [elided in excerpt: "@classmethod def day(cls, day):" alternate constructor]
        """Returns a range that only contains the given day"""
        # [elided in excerpt: return cls(day, day)]

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    # [elided in excerpt: "def __str__(self):" header]
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
864 """ Returns the platform name as a compat_str """
865 res = platform.platform()
866 if isinstance(res, bytes):
867 res = res.decode(preferredencoding())
869 assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    # [elided in excerpt: WIN_OUTPUT_IDS mapping of stdout/stderr filenos to
    # STD_OUTPUT_HANDLE/STD_ERROR_HANDLE]

    # [elided in excerpt: "try:" header]
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        # [elided in excerpt: return False]
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        # [elided in excerpt: return False]
    if fileno not in WIN_OUTPUT_IDS:
        # [elided in excerpt: return False]

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle is not a console if it is invalid, not of character type,
        # or GetConsoleMode fails on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            # [elided in excerpt: return True]
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    # [elided in excerpt: "if not_a_console(h): return False"]

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
        # [elided in excerpt: "try:" header]
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            # [elided in excerpt: return len(s)]

    # [elided in excerpt: "while s:" loop header]
        count = min(next_nonbmp_pos(s), 1024)
        # [elided in excerpt: "ret = WriteConsoleW(" opener]
            h, s, count if count else 2, ctypes.byref(written), None)
        # [elided in excerpt: "if ret == 0:" guard]
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            # [elided in excerpt: s = s[1:]]
        # [elided in excerpt: else branch]
            assert written.value > 0
            s = s[written.value:]
    # [elided in excerpt: return True]
def write_string(s, out=None, encoding=None):
    """Write a text string to *out*, working around Windows console quirks."""
    # [elided in excerpt: default out = sys.stderr]
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            # [elided in excerpt: return — already written via WriteConsoleW]

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        # [elided in excerpt: out.write(byt)]
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    # [elided in excerpt: else: out.write(s), then out.flush()]
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    # [elided in excerpt: empty-input guard]
    if isinstance(bs[0], int):  # Python 3
        # [elided in excerpt: return list(bs)]
    # [elided in excerpt: else branch — Python 2 byte strings]
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integers (0-255) back into a byte string."""
    # [elided in excerpt: empty-input guard]
    return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    # [elided in excerpt: import msvcrt]

    class OVERLAPPED(ctypes.Structure):
        # [elided in excerpt: "_fields_ = [" opener]
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        # [elided in excerpt: closing bracket]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    # [elided in excerpt: closing bracket]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    # [elided in excerpt: closing bracket]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file (low/high DWORD halves of the byte count).
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

# [elided in excerpt: "else:" branch opener and "import fcntl" — POSIX locking]

    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
class locked_file(object):
    """Context-manager file wrapper holding an OS-level lock while open."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        # [elided in excerpt: self.mode = mode]

    def __enter__(self):
        # Writers need an exclusive lock; readers share.
        exclusive = self.mode != 'r'
        # [elided in excerpt: "try:" header]
            _lock_file(self.f, exclusive)
        # [elided in excerpt: except: close the file and re-raise; return self]

    def __exit__(self, etype, value, traceback):
        # [elided in excerpt: "try:" header]
            _unlock_file(self.f)
        # [elided in excerpt: finally: self.f.close()]

    # [elided in excerpt: __iter__ delegation to self.f]

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
def shell_quote(args):
    """Quote a list of arguments for display as a POSIX shell command line."""
    # [elided in excerpt: quoted_args = []]
    encoding = get_filesystem_encoding()
    # [elided in excerpt: "for a in args:" loop header]
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    # [elided in excerpt: generator body yielding elements and stopping after
    # the first failing one]
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Data rides along in the fragment, URL-encoded as JSON.
    sdata = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: split off and decode the smuggled data."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    # [elided in excerpt: return url, data]
def format_bytes(bytes):
    """Render a byte count as human-readable text (e.g. '1.00MiB')."""
    # NOTE(review): the parameter shadows the builtin `bytes`.
    # [elided in excerpt: None guard]
    if type(bytes) is str:
        bytes = float(bytes)
    # [elided in excerpt: zero guard — math.log(0) would raise — and the
    # else branch opener]
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human file size ('10.5MiB', '2 gb', ...) into a byte count."""
    # [elided in excerpt: None guard and the _UNIT_TABLE mapping of unit
    # suffixes to multipliers]

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    # [elided in excerpt: "m = re.match(" opener]
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    # [elided in excerpt: no-match guard returning None]

    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    # [elided in excerpt: "try:" header]
        return ENGLISH_MONTH_NAMES.index(name) + 1
    # [elided in excerpt: ValueError handling]
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation (e.g. 'Jan')."""
    # [elided in excerpt: "try:" header]
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    # [elided in excerpt: ValueError handling]
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # [elided in excerpt: "return re.sub(" opener]
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        # [elided in excerpt: replacement and subject arguments]
def setproctitle(title):
    """Set the process name shown by ps/top (glibc Linux only, via prctl)."""
    assert isinstance(title, compat_str)
    # [elided in excerpt: "try:" header]
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    # [elided in excerpt: OSError handling — non-glibc platforms bail out]
    title_bytes = title.encode('utf-8')
    # NOTE(review): no room for a trailing NUL in this buffer — later
    # versions allocate len(title_bytes) + 1; verify this is intentional.
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # [elided in excerpt: "try:" header; 15 == PR_SET_NAME]
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of s when present."""
    if s.startswith(start):
        return s[len(start):]
    # [elided in excerpt: return s unchanged]
def remove_end(s, end):
    """Strip *end* from the end of s when present."""
    # [elided in excerpt: "if s.endswith(end):" guard]
        return s[:-len(end)]
    # [elided in excerpt: return s unchanged]
def url_basename(url):
    """Return the last path component of *url*'s path (query string excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip('/').split('/')
    return components[-1]
class HEADRequest(compat_urllib_request.Request):
    """Request subclass overriding the HTTP method via get_method()."""

    def get_method(self):
        # [elided in excerpt: the returned method string — presumably 'HEAD']
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """int(v) scaled by invscale // scale, or default when v is None.
    When get_attr is given, the named attribute of v is converted instead."""
    # [elided in excerpt: "if get_attr:" guard]
        v = getattr(v, get_attr, None)
    # [elided in excerpt: intervening lines]
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Stringify v via compat_str, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # [elided in excerpt: None guard]
    # Drop thousands separators and '+' before converting.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    # [elided in excerpt: return int(int_str)]
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) scaled by invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a free-form duration string ('1h2m3s', '01:02:03', ...) to seconds."""
    # [elided in excerpt: None guard]
    if not isinstance(s, compat_basestring):
        # [elided in excerpt: return None]

    # [elided in excerpt: s = s.strip() and the "m = re.match(r'''(?x)"
    # opener; several alternatives of the verbose pattern are also elided]
        (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
        (?P<only_hours>[0-9.]+)\s*(?:hours?)|
        \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
        (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
        (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
        (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # [elided in excerpt: pattern close, s) argument and no-match guard]
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    # [elided in excerpt: res = 0 and the 'secs' guard]
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    # [elided in excerpt: "if m.group('mins'):" guard]
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    # [elided in excerpt: "if m.group('days'):" guard]
        res += int(m.group('days')) * 24 * 60 * 60
    # [elided in excerpt: "if m.group('ms'):" guard]
        res += float(m.group('ms'))
    # [elided in excerpt: return res]
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: 'foo.mp4' + 'temp' -> 'foo.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(base, ext, real_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE(review): mutable default argument `args=[]` — not mutated here,
    # but a tuple default would be safer.
    # [elided in excerpt: "try:" header]
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    # [elided in excerpt: OSError handling and the return value]
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE(review): mutable default argument `args=['--version']`.
    # [elided in excerpt: "try:" header]
        out, _ = subprocess.Popen(
            # [elided in excerpt: the argv list argument]
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    # [elided in excerpt: OSError handling]
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output."""
    assert isinstance(output, compat_str)
    if version_re is None:
        # Default: anything following the word "version".
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    # [elided in excerpt: match handling and the `unrecognized` fallback return]
# Abstract base for lazily-paged result lists; subclasses implement
# getslice(). NOTE(review): the `def __len__(self):` line is elided from
# this excerpt — the body below belongs to it.
1367 class PagedList(object):
1369 # This is only useful for tests
1370 return len(self.getslice())
# PagedList that fetches pages on demand via pagefunc(pagenum), stopping as
# soon as a short (non-full) page or the requested end is reached.
# NOTE(review): several original lines (res initialization, continue/break
# statements, the startv/endv assignments wrapping the conditional
# expressions, and the final return) are elided from this excerpt; code kept
# byte-identical.
1373 class OnDemandPagedList(PagedList):
1374 def __init__(self, pagefunc, pagesize):
1375 self._pagefunc = pagefunc
1376 self._pagesize = pagesize
1378 def getslice(self, start=0, end=None):
1380 for pagenum in itertools.count(start // self._pagesize):
1381 firstid = pagenum * self._pagesize
1382 nextfirstid = pagenum * self._pagesize + self._pagesize
1383 if start >= nextfirstid:
1386 page_results = list(self._pagefunc(pagenum))
# Offset of *start* within the current page (only on the first page hit).
1389 start % self._pagesize
1390 if firstid <= start < nextfirstid
# Cut the tail of the page that lies past *end*.
1394 ((end - 1) % self._pagesize) + 1
1395 if (end is not None and firstid <= end <= nextfirstid)
1398 if startv != 0 or endv is not None:
1399 page_results = page_results[startv:endv]
1400 res.extend(page_results)
1402 # A little optimization - if current page is not "full", ie. does
1403 # not contain page_size videos then we can assume that this page
1404 # is the last one - there are no more ids on further pages -
1405 # i.e. no need to query again.
1406 if len(page_results) + startv < self._pagesize:
1409 # If we got the whole page, but the next page is not interesting,
1410 # break out early as well
1411 if end == nextfirstid:
# PagedList variant for sources whose page count is known up front: it
# computes the exact page range for [start, end) and walks it once.
# NOTE(review): the res initialization, the `end_page = min(` line, the
# skip_elems reset, break statements and final return are elided from this
# excerpt; code kept byte-identical.
1416 class InAdvancePagedList(PagedList):
1417 def __init__(self, pagefunc, pagecount, pagesize):
1418 self._pagefunc = pagefunc
1419 self._pagecount = pagecount
1420 self._pagesize = pagesize
1422 def getslice(self, start=0, end=None):
1424 start_page = start // self._pagesize
1426 self._pagecount if end is None else (end // self._pagesize + 1))
# Leading elements of the first page that precede *start*.
1427 skip_elems = start - start_page * self._pagesize
1428 only_more = None if end is None else end - start
1429 for pagenum in range(start_page, end_page):
1430 page = list(self._pagefunc(pagenum))
1432 page = page[skip_elems:]
1434 if only_more is not None:
1435 if len(page) < only_more:
1436 only_more -= len(page)
1438 page = page[:only_more]
# Decode literal \UXXXXXXXX escape sequences embedded in *s* into the
# corresponding characters. NOTE(review): the `return re.sub(` line and the
# closing `s)` argument line are elided from this excerpt.
1445 def uppercase_escape(s):
1446 unicode_escape = codecs.getdecoder('unicode_escape')
1448 r'\\U[0-9a-fA-F]{8}',
1449 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() cannot handle unicode input, so pre-encode it there.
    on_py2 = sys.version_info < (3, 0)
    if on_py2 and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Characters RFC 3986 allows to appear unescaped in a URL component.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
# Percent-escape each component of *url* (path, params, query, fragment)
# per RFC 3986, leaving scheme and netloc untouched.
# NOTE(review): the closing `).geturl()` line is elided from this excerpt.
1460 def escape_url(url):
1461 """Escape URL as suggested by RFC 3986"""
1462 url_parsed = compat_urllib_parse_urlparse(url)
1463 return url_parsed._replace(
1464 path=escape_rfc3986(url_parsed.path),
1465 params=escape_rfc3986(url_parsed.params),
1466 query=escape_rfc3986(url_parsed.query),
1467 fragment=escape_rfc3986(url_parsed.fragment)
# Compatibility shims: on interpreters where struct.pack() rejects unicode
# format strings (Python 2.6 / early 2.7), wrap pack/unpack to encode the
# spec first; otherwise alias the stdlib functions directly.
# NOTE(review): the enclosing `try:` / `except TypeError:` / `else:` lines
# are elided from this excerpt; code kept byte-identical.
1471 struct.pack('!I', 0)
1473 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1474 def struct_pack(spec, *args):
1475 if isinstance(spec, compat_str):
1476 spec = spec.encode('ascii')
1477 return struct.pack(spec, *args)
1479 def struct_unpack(spec, *args):
1480 if isinstance(spec, compat_str):
1481 spec = spec.encode('ascii')
1482 return struct.unpack(spec, *args)
1484 struct_pack = struct.pack
1485 struct_unpack = struct.unpack
# Read a batch file of URLs: decode bytes as UTF-8, strip a leading UTF-8
# BOM, and drop comment lines. Returns the list of surviving URLs.
# NOTE(review): the inner `def fixup(url):` line, the strip/return of the
# helper and the comment-line `return False` are elided from this excerpt.
1488 def read_batch_urls(batch_fd):
1490 if not isinstance(url, compat_str):
1491 url = url.decode('utf-8', 'replace')
# A BOM decoded as latin-ish text shows up as these three chars.
1492 BOM_UTF8 = '\xef\xbb\xbf'
1493 if url.startswith(BOM_UTF8):
1494 url = url[len(BOM_UTF8):]
1496 if url.startswith(('#', ';', ']')):
1500 with contextlib.closing(batch_fd) as fd:
1501 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as an ASCII byte string,
    the form urllib request objects expect for a request body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter() appeared in Python 2.7; fall back to findall('.//*') on
# older interpreters. NOTE(review): the opening `try:` line is elided from
# this excerpt.
1509 etree_iter = xml.etree.ElementTree.Element.iter
1510 except AttributeError: # Python <=2.6
1511 etree_iter = lambda n: n.findall('.//*')
# Parse an XML string into an ElementTree, ignoring DOCTYPE declarations
# and normalizing text nodes to unicode on Python 2.
# NOTE(review): the enclosing `def parse_xml(s):` line and the final
# `return tree` are elided from this excerpt; code kept byte-identical.
1515 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1516 def doctype(self, name, pubid, system):
1517 pass # Ignore doctypes
1519 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# Custom parsers can only be passed to XML() from Python 2.7 on.
1520 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1521 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1522 # Fix up XML parser in Python 2.x
1523 if sys.version_info < (3, 0):
1524 for n in etree_iter(tree):
1525 if n.text is not None:
1526 if not isinstance(n.text, compat_str):
1527 n.text = n.text.decode('utf-8')
# Turn an age string like "18" or "18+" into an int; otherwise fall back to
# the US_RATINGS table (e.g. "PG-13"). NOTE(review): the None guard lines
# between the def and the match are elided from this excerpt.
1540 def parse_age_limit(s):
1543 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1544 return int(m.group('age')) if m else US_RATINGS.get(s, None)
# Strip a JSONP wrapper `callback(...);` (plus trailing // comments) down
# to the bare JSON payload. NOTE(review): the `return re.sub(` line is
# elided from this excerpt.
1547 def strip_jsonp(code):
1549 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
# Convert JavaScript-ish object literals to strict JSON: rewrite
# single-quoted strings, quote bare identifiers, keep true/false/null, and
# drop trailing commas before `]`.
# NOTE(review): the inner fix_kv() def line, several return statements, the
# escape-map body and the closing of the verbose regex are elided from this
# excerpt; code kept byte-identical.
1552 def js_to_json(code):
1555 if v in ('true', 'false', 'null'):
1557 if v.startswith('"'):
1559 if v.startswith("'"):
1561 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1568 res = re.sub(r'''(?x)
1569 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1570 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1571 [a-zA-Z_][.a-zA-Z_0-9]*
# Remove trailing commas before a closing bracket.
1573 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
# Build a quality-ranking function: higher index in *quality_ids* means
# better quality. NOTE(review): the inner `def q(qid):` line, the
# try/except and the `return q` are elided from this excerpt.
1577 def qualities(quality_ids):
1578 """ Get a numeric quality value out of a list of possible values """
1581 return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
1587 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
# Truncate *s* to at most *length* characters, ending in an ellipsis.
# NOTE(review): the None guard, the ELLIPSES constant definition and the
# short-string early return are elided from this excerpt.
1590 def limit_length(s, length):
1591 """ Add ellipses to overly long strings """
1596 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/dashed version string such as '2015.01.23-1'
    into a tuple of ints suitable for ordered comparison."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
# Compare two version strings; *assume_new* decides the verdict when the
# version is empty or unparsable. NOTE(review): the `if not version:` guard
# and the try/except ValueError lines are elided from this excerpt.
1604 def is_outdated_version(version, limit, assume_new=True):
1606 return not assume_new
1608 return version_tuple(version) < version_tuple(limit)
1610 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Self-updating only works when running from the distributed zip
    # bundle or a frozen (py2exe-style) executable, not from a checkout.
    loader = globals().get('__loader__')
    running_from_zip = isinstance(loader, zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
# Map a MIME type to a file extension using its subtype, with a lookup
# table for irregular cases. NOTE(review): the surrounding `return {` /
# `}.get(res, res)` lines of the table are elided from this excerpt.
1625 def mimetype2ext(mt):
1626 _, _, res = mt.rpartition('/')
1630 'x-mp4-fragmented': 'mp4',
# Guess a file extension for a URL response: prefer the filename in the
# Content-Disposition header, fall back to the Content-Type MIME type.
# NOTE(review): the try line, the `if cd:` / `if m:` / `if e:` guards and
# the intermediate returns are elided from this excerpt.
1634 def urlhandle_detect_ext(url_handle):
# headers mapping on Python 3; info().getheader on Python 2.
1637 getheader = lambda h: url_handle.headers[h]
1638 except AttributeError: # Python < 3
1639 getheader = url_handle.info().getheader
1641 cd = getheader('Content-Disposition')
1643 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1645 e = determine_ext(m.group('filename'), default_ext=None)
1649 return mimetype2ext(getheader('Content-Type'))
# Decide whether content rated *content_limit* should be blocked for a
# viewer whose limit is *age_limit*. NOTE(review): the `return False` body
# of the first guard is elided from this excerpt.
1652 def age_restricted(content_limit, age_limit):
1653 """ Returns True iff the content should be blocked """
1655 if age_limit is None: # No limit set
1657 if content_limit is None:
1658 return False # Content available for everyone
1659 return age_limit < content_limit
# Heuristic HTML sniffing: decode the leading bytes using any recognized
# BOM (UTF-8 as default) and check whether the text starts with '<'.
# NOTE(review): the `BOMS = [` opening, the list's closing bracket, the
# `break` after a BOM match and the `else:` before the UTF-8 fallback are
# elided from this excerpt; code kept byte-identical.
1662 def is_html(first_bytes):
1663 """ Detect whether a file contains HTML by examining its first bytes. """
# Longer BOMs listed first so UTF-32 is not mistaken for UTF-16.
1666 (b'\xef\xbb\xbf', 'utf-8'),
1667 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1668 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1669 (b'\xff\xfe', 'utf-16-le'),
1670 (b'\xfe\xff', 'utf-16-be'),
1672 for bom, enc in BOMS:
1673 if first_bytes.startswith(bom):
1674 s = first_bytes[len(bom):].decode(enc, 'replace')
1677 s = first_bytes.decode('utf-8', 'replace')
1679 return re.match(r'^\s*<', s)
# Derive the download protocol for an info dict: explicit 'protocol' field
# wins, then URL prefix (rtmp/mms/rtsp), then extension-based detection,
# finally the URL scheme. NOTE(review): the return values of each branch
# and the m3u8/f4m extension checks are elided from this excerpt.
1682 def determine_protocol(info_dict):
1683 protocol = info_dict.get('protocol')
1684 if protocol is not None:
1687 url = info_dict['url']
1688 if url.startswith('rtmp'):
1690 elif url.startswith('mms'):
1692 elif url.startswith('rtsp'):
1695 ext = determine_ext(url)
1701 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column determines that column's field width.
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*rows)]
    # Left-align every column but the last, padded one space past its width.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    rendered = [fmt % tuple(row) for row in rows]
    return '\n'.join(rendered)
# Evaluate one clause of a --match-filter expression against dict *dct*:
# either a comparison ("key >= 500K", "uploader = foo") or a unary
# presence test ("key" / "!key").
# NOTE(review): the COMPARISON_OPERATORS table body, the key part of the
# comparison regex, several `if m:` guards, raise statements and the
# UNARY_OPERATORS dict opening are elided from this excerpt; code kept
# byte-identical.
1712 def _match_one(filter_part, dct):
1713 COMPARISON_OPERATORS = {
1721 operator_rex = re.compile(r'''(?x)\s*
1723 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
# Numeric values may carry SI/IEC suffixes (K, Mi, GB, ...).
1725 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1726 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1729 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1730 m = operator_rex.search(filter_part)
1732 op = COMPARISON_OPERATORS[m.group('op')]
1733 if m.group('strval') is not None:
# Strings only support equality/inequality comparisons.
1734 if m.group('op') not in ('=', '!='):
1736 'Operator %s does not support string values!' % m.group('op'))
1737 comparison_value = m.group('strval')
1740 comparison_value = int(m.group('intval'))
# Not a plain int: try filesize parsing, with and without a 'B' suffix.
1742 comparison_value = parse_filesize(m.group('intval'))
1743 if comparison_value is None:
1744 comparison_value = parse_filesize(m.group('intval') + 'B')
1745 if comparison_value is None:
1747 'Invalid integer value %r in filter part %r' % (
1748 m.group('intval'), filter_part))
1749 actual_value = dct.get(m.group('key'))
# Missing keys pass only when the '?' (none-inclusive) marker was used.
1750 if actual_value is None:
1751 return m.group('none_inclusive')
1752 return op(actual_value, comparison_value)
# Unary presence operators: '' -> key must exist, '!' -> must not.
1755 '': lambda v: v is not None,
1756 '!': lambda v: v is None,
1758 operator_rex = re.compile(r'''(?x)\s*
1759 (?P<op>%s)\s*(?P<key>[a-z_]+)
1761 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1762 m = operator_rex.search(filter_part)
1764 op = UNARY_OPERATORS[m.group('op')]
1765 actual_value = dct.get(m.group('key'))
1766 return op(actual_value)
1768 raise ValueError('Invalid filter part %r' % filter_part)
# AND together the '&'-separated clauses of *filter_str*, each evaluated by
# _match_one(). NOTE(review): the `return all(` line is elided from this
# excerpt.
1771 def match_str(filter_str, dct):
1772 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1775 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
# Build a match-filter callback: returns None when the video passes the
# filter, otherwise a human-readable skip message.
# NOTE(review): the `return None` of the passing branch and the final
# `return _match_func` are elided from this excerpt.
1778 def match_filter_func(filter_str):
1779 def _match_func(info_dict):
1780 if match_str(filter_str, info_dict):
1783 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1784 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
# ProxyHandler variant that honors a per-request 'Ytdl-request-proxy'
# header, allowing individual requests to override (or disable, via
# '__noproxy__') the globally configured proxy.
# NOTE(review): the `proxy = req_proxy` assignment and one other original
# line are elided from this excerpt; code kept byte-identical. The name
# `type` shadows the builtin — deliberate here to mirror the parent-class
# signature.
1788 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
1789 def __init__(self, proxies=None):
1790 # Set default handlers
# Bind type/meth as lambda defaults so each scheme gets its own values
# (avoids the late-binding closure pitfall).
1791 for type in ('http', 'https'):
1792 setattr(self, '%s_open' % type,
1793 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
1794 meth(r, proxy, type))
1795 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
1797 def proxy_open(self, req, proxy, type):
1798 req_proxy = req.headers.get('Ytdl-request-proxy')
1799 if req_proxy is not None:
1801 del req.headers['Ytdl-request-proxy']
1803 if proxy == '__noproxy__':
1804 return None # No Proxy
1805 return compat_urllib_request.ProxyHandler.proxy_open(
1806 self, req, proxy, type)