2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
31 import xml.etree.ElementTree
40 compat_socket_create_connection,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
51 # This is not clearly defined otherwise
52 compiled_regex_type = type(re.compile(''))
55 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
56 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
57 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
58 'Accept-Encoding': 'gzip, deflate',
59 'Accept-Language': 'en-us,en;q=0.5',
63 def preferredencoding():
64 """Get preferred encoding.
66 Returns the best encoding scheme for the system, based on
67 locale.getpreferredencoding() and some further tweaks.
70 pref = locale.getpreferredencoding()
78 def write_json_file(obj, fn):
79 """ Encode obj as JSON and write it to fn, atomically if possible """
81 fn = encodeFilename(fn)
82 if sys.version_info < (3, 0) and sys.platform != 'win32':
83 encoding = get_filesystem_encoding()
84 # os.path.basename returns a bytes object, but NamedTemporaryFile
85 # will fail if the filename contains non ascii characters unless we
86 # use a unicode object
87 path_basename = lambda f: os.path.basename(fn).decode(encoding)
88 # the same for os.path.dirname
89 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
91 path_basename = os.path.basename
92 path_dirname = os.path.dirname
96 'prefix': path_basename(fn) + '.',
97 'dir': path_dirname(fn),
101 # In Python 2.x, json.dump expects a bytestream.
102 # In Python 3.x, it writes to a character stream
103 if sys.version_info < (3, 0):
111 tf = tempfile.NamedTemporaryFile(**args)
116 if sys.platform == 'win32':
117 # Need to remove existing file on Windows, else os.rename raises
118 # WindowsError or FileExistsError.
123 os.rename(tf.name, fn)
132 if sys.version_info >= (2, 7):
133 def find_xpath_attr(node, xpath, key, val):
134 """ Find the xpath xpath[@key=val] """
135 assert re.match(r'^[a-zA-Z-]+$', key)
136 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
137 expr = xpath + "[@%s='%s']" % (key, val)
138 return node.find(expr)
140 def find_xpath_attr(node, xpath, key, val):
141 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
142 # .//node does not match if a node is a direct child of . !
143 if isinstance(xpath, unicode):
144 xpath = xpath.encode('ascii')
146 for f in node.findall(xpath):
147 if f.attrib.get(key) == val:
151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
152 # the namespace parameter
155 def xpath_with_ns(path, ns_map):
156 components = [c.split(':') for c in path.split('/')]
160 replaced.append(c[0])
163 replaced.append('{%s}%s' % (ns_map[ns], tag))
164 return '/'.join(replaced)
167 def xpath_text(node, xpath, name=None, fatal=False):
168 if sys.version_info < (2, 7): # Crazy 2.6
169 xpath = xpath.encode('ascii')
172 if n is None or n.text is None:
174 name = xpath if name is None else name
175 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is just an attribute lookup with attribute name 'id'.
    return get_element_by_attribute('id', id, html)
186 def get_element_by_attribute(attribute, value, html):
187 """Return the content of the tag with the specified attribute in the passed HTML document"""
189 m = re.search(r'''(?xs)
191 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
193 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
197 ''' % (re.escape(attribute), re.escape(value)), html)
201 res = m.group('content')
203 if res.startswith('"') or res.startswith("'"):
206 return unescapeHTML(res)
209 def clean_html(html):
210 """Clean an HTML snippet into a readable string"""
212 if html is None: # Convenience for sanitizing descriptions etc.
216 html = html.replace('\n', ' ')
217 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
218 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
220 html = re.sub('<.*?>', '', html)
221 # Replace html entities
222 html = unescapeHTML(html)
226 def sanitize_open(filename, open_mode):
227 """Try to open the given filename, and slightly tweak it if this fails.
229 Attempts to open the given filename. If this fails, it tries to change
230 the filename slightly, step by step, until it's either able to open it
231 or it fails and raises a final exception, like the standard open()
234 It returns the tuple (stream, definitive_file_name).
238 if sys.platform == 'win32':
240 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
241 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
242 stream = open(encodeFilename(filename), open_mode)
243 return (stream, filename)
244 except (IOError, OSError) as err:
245 if err.errno in (errno.EACCES,):
248 # In case of error, try to remove win32 forbidden chars
249 alt_filename = os.path.join(
250 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
251 for path_part in os.path.split(filename)
253 if alt_filename == filename:
256 # An exception here should be caught in the caller
257 stream = open(encodeFilename(filename), open_mode)
258 return (stream, alt_filename)
261 def timeconvert(timestr):
262 """Convert RFC 2822 defined time string into system timestamp"""
264 timetuple = email.utils.parsedate_tz(timestr)
265 if timetuple is not None:
266 timestamp = email.utils.mktime_tz(timetuple)
270 def sanitize_filename(s, restricted=False, is_id=False):
271 """Sanitizes a string so it could be used as part of a filename.
272 If restricted is set, use a stricter subset of allowed characters.
273 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
275 def replace_insane(char):
276 if char == '?' or ord(char) < 32 or ord(char) == 127:
279 return '' if restricted else '\''
281 return '_-' if restricted else ' -'
282 elif char in '\\/|*<>':
284 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
286 if restricted and ord(char) > 127:
291 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
292 result = ''.join(map(replace_insane, s))
294 while '__' in result:
295 result = result.replace('__', '_')
296 result = result.strip('_')
297 # Common case of "Foreign band name - English song title"
298 if restricted and result.startswith('-_'):
305 def orderedSet(iterable):
306 """ Remove all duplicates from the input iterable """
314 def _htmlentity_transform(entity):
315 """Transforms an HTML entity to a character."""
316 # Known non-numeric HTML entity
317 if entity in compat_html_entities.name2codepoint:
318 return compat_chr(compat_html_entities.name2codepoint[entity])
320 mobj = re.match(r'#(x?[0-9]+)', entity)
322 numstr = mobj.group(1)
323 if numstr.startswith('x'):
325 numstr = '0%s' % numstr
328 return compat_chr(int(numstr, base))
330 # Unknown entity in name, return its literal representation
331 return ('&%s;' % entity)
337 assert type(s) == compat_str
340 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
343 def encodeFilename(s, for_subprocess=False):
345 @param s The name of the file
348 assert type(s) == compat_str
350 # Python 3 has a Unicode API
351 if sys.version_info >= (3, 0):
354 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
355 # Pass '' directly to use Unicode APIs on Windows 2000 and up
356 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
357 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
358 if not for_subprocess:
361 # For subprocess calls, encode with locale encoding
362 # Refer to http://stackoverflow.com/a/9951851/35070
363 encoding = preferredencoding()
365 encoding = sys.getfilesystemencoding()
368 return s.encode(encoding, 'ignore')
371 def encodeArgument(s):
372 if not isinstance(s, compat_str):
373 # Legacy code that uses byte strings
374 # Uncomment the following line after fixing all post processors
375 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
376 s = s.decode('ascii')
377 return encodeFilename(s, True)
380 def decodeOption(optval):
383 if isinstance(optval, bytes):
384 optval = optval.decode(preferredencoding())
386 assert isinstance(optval, compat_str)
390 def formatSeconds(secs):
392 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
394 return '%d:%02d' % (secs // 60, secs % 60)
399 def make_HTTPS_handler(params, **kwargs):
400 opts_no_check_certificate = params.get('nocheckcertificate', False)
401 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
402 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
403 if opts_no_check_certificate:
404 context.check_hostname = False
405 context.verify_mode = ssl.CERT_NONE
407 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
410 # (create_default_context present but HTTPSHandler has no context=)
413 if sys.version_info < (3, 2):
414 return YoutubeDLHTTPSHandler(params, **kwargs)
416 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
417 context.verify_mode = (ssl.CERT_NONE
418 if opts_no_check_certificate
419 else ssl.CERT_REQUIRED)
420 context.set_default_verify_paths()
421 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
424 class ExtractorError(Exception):
425 """Error during info extraction."""
427 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
428 """ tb, if given, is the original traceback (so that it can be printed out).
429 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
432 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
434 if video_id is not None:
435 msg = video_id + ': ' + msg
437 msg += ' (caused by %r)' % cause
439 if ytdl_is_updateable():
440 update_cmd = 'type youtube-dl -U to update'
442 update_cmd = 'see https://yt-dl.org/update on how to update'
443 msg += '; please report this issue on https://yt-dl.org/bug .'
444 msg += ' Make sure you are using the latest version; %s.' % update_cmd
445 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
446 super(ExtractorError, self).__init__(msg)
449 self.exc_info = sys.exc_info() # preserve original exception
451 self.video_id = video_id
453 def format_traceback(self):
454 if self.traceback is None:
456 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor knows how to handle."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal condition, not a youtube-dl bug.
        super(UnsupportedError, self).__init__(message, expected=True)
# Distinct subclass so callers can tell "pattern did not match" apart from
# other extraction failures.
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
471 class DownloadError(Exception):
472 """Download Error exception.
474 This exception may be thrown by FileDownloader objects if they are not
475 configured to continue on errors. They will contain the appropriate
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so callers can inspect or re-raise the root cause later.
        self.exc_info = exc_info
485 class SameFileError(Exception):
486 """Same File exception.
488 This exception will be thrown by FileDownloader objects if they detect
489 multiple files would have to be downloaded to the same file on disk.
494 class PostProcessingError(Exception):
495 """Post Processing exception.
497 This exception may be raised by PostProcessor's .run() method to
498 indicate an error in the postprocessing task.
501 def __init__(self, msg):
505 class MaxDownloadsReached(Exception):
506 """ --max-downloads limit has been reached. """
510 class UnavailableVideoError(Exception):
511 """Unavailable Format exception.
513 This exception will be thrown when a video is requested
514 in a format that is not available for that video.
519 class ContentTooShortError(Exception):
520 """Content Too Short exception.
522 This exception may be raised by FileDownloader objects when a file they
523 download is too small for what the server announced first, indicating
524 the connection was probably interrupted.
    def __init__(self, downloaded, expected):
        # Byte counts: what was actually received vs. what the server announced.
        self.downloaded = downloaded
        self.expected = expected
535 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
536 hc = http_class(*args, **kwargs)
537 source_address = ydl_handler._params.get('source_address')
538 if source_address is not None:
539 sa = (source_address, 0)
540 if hasattr(hc, 'source_address'): # Python 2.7+
541 hc.source_address = sa
543 def _hc_connect(self, *args, **kwargs):
544 sock = compat_socket_create_connection(
545 (self.host, self.port), self.timeout, sa)
547 self.sock = ssl.wrap_socket(
548 sock, self.key_file, self.cert_file,
549 ssl_version=ssl.PROTOCOL_TLSv1)
552 hc.connect = functools.partial(_hc_connect, hc)
557 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
558 """Handler for HTTP requests and responses.
560 This class, when installed with an OpenerDirector, automatically adds
561 the standard headers to every HTTP request and handles gzipped and
562 deflated responses from web servers. If compression is to be avoided in
563 a particular request, the original request in the program code only has
564 to include the HTTP header "Youtubedl-No-Compression", which will be
565 removed before making the real request.
567 Part of this code was copied from:
569 http://techknack.net/python-urllib2-handlers/
571 Andrew Rowls, the author of that code, agreed to release it to the
    def __init__(self, params, *args, **kwargs):
        """Store the YoutubeDL params dict; remaining args go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # _params is read by _create_http_connection (e.g. 'source_address').
        self._params = params
579 def http_open(self, req):
580 return self.do_open(functools.partial(
581 _create_http_connection, self, compat_http_client.HTTPConnection, False),
587 return zlib.decompress(data, -zlib.MAX_WBITS)
589 return zlib.decompress(data)
592 def addinfourl_wrapper(stream, headers, url, code):
593 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
594 return compat_urllib_request.addinfourl(stream, headers, url, code)
595 ret = compat_urllib_request.addinfourl(stream, headers, url)
599 def http_request(self, req):
600 for h, v in std_headers.items():
601 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
602 # The dict keys are capitalized because of this bug by urllib
603 if h.capitalize() not in req.headers:
605 if 'Youtubedl-no-compression' in req.headers:
606 if 'Accept-encoding' in req.headers:
607 del req.headers['Accept-encoding']
608 del req.headers['Youtubedl-no-compression']
609 if 'Youtubedl-user-agent' in req.headers:
610 if 'User-agent' in req.headers:
611 del req.headers['User-agent']
612 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
613 del req.headers['Youtubedl-user-agent']
615 if sys.version_info < (2, 7) and '#' in req.get_full_url():
616 # Python 2.6 is brain-dead when it comes to fragments
617 req._Request__original = req._Request__original.partition('#')[0]
618 req._Request__r_type = req._Request__r_type.partition('#')[0]
622 def http_response(self, req, resp):
625 if resp.headers.get('Content-encoding', '') == 'gzip':
626 content = resp.read()
627 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
629 uncompressed = io.BytesIO(gz.read())
630 except IOError as original_ioerror:
631 # There may be junk add the end of the file
632 # See http://stackoverflow.com/q/4928560/35070 for details
633 for i in range(1, 1024):
635 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
636 uncompressed = io.BytesIO(gz.read())
641 raise original_ioerror
642 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
643 resp.msg = old_resp.msg
645 if resp.headers.get('Content-encoding', '') == 'deflate':
646 gz = io.BytesIO(self.deflate(resp.read()))
647 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
648 resp.msg = old_resp.msg
651 https_request = http_request
652 https_response = http_response
655 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """HTTPS handler variant that allows injecting a connection class.

        https_conn_class, if given, replaces the stock HTTPSConnection
        (e.g. one pre-bound to a custom SSL context).
        """
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        # _params is read by _create_http_connection (e.g. 'source_address').
        self._params = params
661 def https_open(self, req):
662 return self.do_open(functools.partial(
663 _create_http_connection, self, self._https_conn_class, True),
667 def parse_iso8601(date_str, delimiter='T'):
668 """ Return a UNIX timestamp from the given date """
674 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
677 timezone = datetime.timedelta()
679 date_str = date_str[:-len(m.group(0))]
680 if not m.group('sign'):
681 timezone = datetime.timedelta()
683 sign = 1 if m.group('sign') == '+' else -1
684 timezone = datetime.timedelta(
685 hours=sign * int(m.group('hours')),
686 minutes=sign * int(m.group('minutes')))
687 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
688 dt = datetime.datetime.strptime(date_str, date_format) - timezone
689 return calendar.timegm(dt.timetuple())
692 def unified_strdate(date_str, day_first=True):
693 """Return a string with the date in the format YYYYMMDD"""
699 date_str = date_str.replace(',', ' ')
700 # %z (UTC offset) is only supported in python>=3.2
701 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
702 # Remove AM/PM + timezone
703 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
705 format_expressions = [
710 '%b %dst %Y %I:%M%p',
711 '%b %dnd %Y %I:%M%p',
712 '%b %dth %Y %I:%M%p',
718 '%Y-%m-%d %H:%M:%S.%f',
721 '%Y-%m-%dT%H:%M:%SZ',
722 '%Y-%m-%dT%H:%M:%S.%fZ',
723 '%Y-%m-%dT%H:%M:%S.%f0Z',
725 '%Y-%m-%dT%H:%M:%S.%f',
729 format_expressions.extend([
736 format_expressions.extend([
742 for expression in format_expressions:
744 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
747 if upload_date is None:
748 timetuple = email.utils.parsedate_tz(date_str)
750 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
754 def determine_ext(url, default_ext='unknown_video'):
757 guess = url.partition('?')[0].rpartition('.')[2]
758 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<sub_lang>.<sub_format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
768 def date_from_str(date_str):
770 Return a datetime object from a string in the format YYYYMMDD or
771 (now|today)[+-][0-9](day|week|month|year)(s)?"""
772 today = datetime.date.today()
773 if date_str in ('now', 'today'):
775 if date_str == 'yesterday':
776 return today - datetime.timedelta(days=1)
777 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
778 if match is not None:
779 sign = match.group('sign')
780 time = int(match.group('time'))
783 unit = match.group('unit')
784 # A bad aproximation?
792 delta = datetime.timedelta(**{unit: time})
794 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
797 def hyphenate_date(date_str):
799 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
800 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
801 if match is not None:
802 return '-'.join(match.groups())
807 class DateRange(object):
808 """Represents a time interval between two dates"""
810 def __init__(self, start=None, end=None):
811 """start and end must be strings in the format accepted by date"""
812 if start is not None:
813 self.start = date_from_str(start)
815 self.start = datetime.datetime.min.date()
817 self.end = date_from_str(end)
819 self.end = datetime.datetime.max.date()
820 if self.start > self.end:
821 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
825 """Returns a range that only contains the given day"""
    def __contains__(self, date):
        """Check if the date is in the range (inclusive on both ends)."""
        if not isinstance(date, datetime.date):
            # Accept the same string forms as date_from_str ('YYYYMMDD', 'now', ...).
            date = date_from_str(date)
        return self.start <= date <= self.end
835 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
839 """ Returns the platform name as a compat_str """
840 res = platform.platform()
841 if isinstance(res, bytes):
842 res = res.decode(preferredencoding())
844 assert isinstance(res, compat_str)
848 def _windows_write_string(s, out):
849 """ Returns True if the string was written using special methods,
850 False if it has yet to be written out."""
851 # Adapted from http://stackoverflow.com/a/3259271/35070
854 import ctypes.wintypes
862 fileno = out.fileno()
863 except AttributeError:
864 # If the output stream doesn't have a fileno, it's virtual
866 if fileno not in WIN_OUTPUT_IDS:
869 GetStdHandle = ctypes.WINFUNCTYPE(
870 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
871 (b"GetStdHandle", ctypes.windll.kernel32))
872 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
874 WriteConsoleW = ctypes.WINFUNCTYPE(
875 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
876 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
877 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
878 written = ctypes.wintypes.DWORD(0)
880 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
881 FILE_TYPE_CHAR = 0x0002
882 FILE_TYPE_REMOTE = 0x8000
883 GetConsoleMode = ctypes.WINFUNCTYPE(
884 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
885 ctypes.POINTER(ctypes.wintypes.DWORD))(
886 (b"GetConsoleMode", ctypes.windll.kernel32))
887 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
889 def not_a_console(handle):
890 if handle == INVALID_HANDLE_VALUE or handle is None:
892 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
893 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
898 def next_nonbmp_pos(s):
900 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
901 except StopIteration:
905 count = min(next_nonbmp_pos(s), 1024)
908 h, s, count if count else 2, ctypes.byref(written), None)
910 raise OSError('Failed to write string')
911 if not count: # We just wrote a non-BMP character
912 assert written.value == 2
915 assert written.value > 0
916 s = s[written.value:]
920 def write_string(s, out=None, encoding=None):
923 assert type(s) == compat_str
925 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
926 if _windows_write_string(s, out):
929 if ('b' in getattr(out, 'mode', '') or
930 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
931 byt = s.encode(encoding or preferredencoding(), 'ignore')
933 elif hasattr(out, 'buffer'):
934 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
935 byt = s.encode(enc, 'ignore')
936 out.buffer.write(byt)
942 def bytes_to_intlist(bs):
945 if isinstance(bs[0], int): # Python 3
948 return [ord(c) for c in bs]
951 def intlist_to_bytes(xs):
954 return struct_pack('%dB' % len(xs), *xs)
957 # Cross-platform file locking
958 if sys.platform == 'win32':
959 import ctypes.wintypes
962 class OVERLAPPED(ctypes.Structure):
964 ('Internal', ctypes.wintypes.LPVOID),
965 ('InternalHigh', ctypes.wintypes.LPVOID),
966 ('Offset', ctypes.wintypes.DWORD),
967 ('OffsetHigh', ctypes.wintypes.DWORD),
968 ('hEvent', ctypes.wintypes.HANDLE),
971 kernel32 = ctypes.windll.kernel32
972 LockFileEx = kernel32.LockFileEx
973 LockFileEx.argtypes = [
974 ctypes.wintypes.HANDLE, # hFile
975 ctypes.wintypes.DWORD, # dwFlags
976 ctypes.wintypes.DWORD, # dwReserved
977 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
978 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
979 ctypes.POINTER(OVERLAPPED) # Overlapped
981 LockFileEx.restype = ctypes.wintypes.BOOL
982 UnlockFileEx = kernel32.UnlockFileEx
983 UnlockFileEx.argtypes = [
984 ctypes.wintypes.HANDLE, # hFile
985 ctypes.wintypes.DWORD, # dwReserved
986 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
987 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
988 ctypes.POINTER(OVERLAPPED) # Overlapped
990 UnlockFileEx.restype = ctypes.wintypes.BOOL
991 whole_low = 0xffffffff
992 whole_high = 0x7fffffff
    def _lock_file(f, exclusive):
        # Windows implementation via LockFileEx. The extended lock API
        # requires an OVERLAPPED struct giving the region start offset.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object; _unlock_file() reuses it.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
        # whole_low/whole_high span effectively the entire file.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())
    def _unlock_file(f):
        # Must reuse the OVERLAPPED pointer stored by _lock_file().
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1015 def _lock_file(f, exclusive):
1016 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1018 def _unlock_file(f):
1019 fcntl.flock(f, fcntl.LOCK_UN)
1022 class locked_file(object):
1023 def __init__(self, filename, mode, encoding=None):
1024 assert mode in ['r', 'a', 'w']
1025 self.f = io.open(filename, mode, encoding=encoding)
1028 def __enter__(self):
1029 exclusive = self.mode != 'r'
1031 _lock_file(self.f, exclusive)
1037 def __exit__(self, etype, value, traceback):
1039 _unlock_file(self.f)
    def write(self, *args):
        # Delegate to the wrapped file; the advisory lock is taken in
        # __enter__ and released in __exit__.
        return self.f.write(*args)
    def read(self, *args):
        # Delegate to the wrapped file; the advisory lock is taken in
        # __enter__ and released in __exit__.
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
1058 def shell_quote(args):
1060 encoding = get_filesystem_encoding()
1062 if isinstance(a, bytes):
1063 # We may get a filename encoded with 'encodeFilename'
1064 a = a.decode(encoding)
1065 quoted_args.append(pipes.quote(a))
1066 return ' '.join(quoted_args)
1069 def takewhile_inclusive(pred, seq):
1070 """ Like itertools.takewhile, but include the latest evaluated element
1071 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The data rides along JSON-encoded in the URL fragment, where servers
    # never see it; unsmuggle_url() strips it back off.
    payload = json.dumps(data)
    sdata = compat_urllib_parse.urlencode({'__youtubedl_smuggle': payload})
    return '#'.join((url, sdata))
1086 def unsmuggle_url(smug_url, default=None):
1087 if '#__youtubedl_smuggle' not in smug_url:
1088 return smug_url, default
1089 url, _, sdata = smug_url.rpartition('#')
1090 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1091 data = json.loads(jsond)
1095 def format_bytes(bytes):
1098 if type(bytes) is str:
1099 bytes = float(bytes)
1103 exponent = int(math.log(bytes, 1024.0))
1104 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1105 converted = float(bytes) / float(1024 ** exponent)
1106 return '%.2f%s' % (converted, suffix)
1109 def parse_filesize(s):
1113 # The lower-case forms are of course incorrect and inofficial,
1114 # but we support those too
1152 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1154 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1158 num_str = m.group('num').replace(',', '.')
1159 mult = _UNIT_TABLE[m.group('unit')]
1160 return int(float(num_str) * mult)
1163 def get_term_width():
1164 columns = compat_getenv('COLUMNS', None)
1169 sp = subprocess.Popen(
1171 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1172 out, err = sp.communicate()
1173 return int(out.split()[1])
1179 def month_by_name(name):
1180 """ Return the number of a month by (locale-independently) English name """
1183 'January', 'February', 'March', 'April', 'May', 'June',
1184 'July', 'August', 'September', 'October', 'November', 'December']
1186 return ENGLISH_NAMES.index(name) + 1
1191 def fix_xml_ampersands(xml_str):
1192 """Replace all the '&' by '&' in XML"""
1194 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1199 def setproctitle(title):
1200 assert isinstance(title, compat_str)
1202 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1205 title_bytes = title.encode('utf-8')
1206 buf = ctypes.create_string_buffer(len(title_bytes))
1207 buf.value = title_bytes
1209 libc.prctl(15, buf, 0, 0, 0)
1210 except AttributeError:
1211 return # Strange libc, just skip this
1214 def remove_start(s, start):
1215 if s.startswith(start):
1216 return s[len(start):]
1220 def remove_end(s, end):
1222 return s[:-len(end)]
def url_basename(url):
    """Return the last path component of *url* (query/fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    return path.rstrip('/').rpartition('/')[2]
1231 class HEADRequest(compat_urllib_request.Request):
1232 def get_method(self):
1236 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1239 v = getattr(v, get_attr, None)
1242 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Stringify *v* via compat_str, passing None through as *default*."""
    return compat_str(v) if v is not None else default
1249 def str_to_int(int_str):
1250 """ A more relaxed version of int_or_none """
1253 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a float scaled by invscale/scale.

    Returns *default* when v is None — and, robustly, also when v cannot
    be interpreted as a number (the previous version leaked ValueError/
    TypeError for e.g. non-numeric strings, contradicting the '_or_none'
    contract used by the sibling helpers).
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
1261 def parse_duration(s):
1262 if not isinstance(s, basestring if sys.version_info < (3, 0) else compat_str):
1270 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1271 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1274 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1275 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1277 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1282 if m.group('only_mins'):
1283 return float_or_none(m.group('only_mins'), invscale=60)
1284 if m.group('only_hours'):
1285 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1287 res += int(m.group('secs'))
1289 res += int(m.group('mins')) * 60
1290 if m.group('hours'):
1291 res += int(m.group('hours')) * 60 * 60
1293 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* before the file's real extension: a.mp4 -> a.<ext>.mp4."""
    root, real_ext = os.path.splitext(filename)
    return '%s.%s%s' % (root, ext, real_ext)
1302 def check_executable(exe, args=[]):
1303 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1304 args can be a list of arguments for a short output (like -version) """
1306 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1312 def get_exe_version(exe, args=['--version'],
1313 version_re=None, unrecognized='present'):
1314 """ Returns the version of the specified executable,
1315 or False if the executable is not present """
1317 out, _ = subprocess.Popen(
1319 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1322 if isinstance(out, bytes): # Python 2.x
1323 out = out.decode('ascii', 'ignore')
1324 return detect_exe_version(out, version_re, unrecognized)
1327 def detect_exe_version(output, version_re=None, unrecognized='present'):
1328 assert isinstance(output, compat_str)
1329 if version_re is None:
1330 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1331 m = re.search(version_re, output)
1338 class PagedList(object):
1340 # This is only useful for tests
1341 return len(self.getslice())
1344 class OnDemandPagedList(PagedList):
    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) -> iterable of entries for that page;
        # pagesize is the fixed number of entries per page.
        self._pagefunc = pagefunc
        self._pagesize = pagesize
1349 def getslice(self, start=0, end=None):
1351 for pagenum in itertools.count(start // self._pagesize):
1352 firstid = pagenum * self._pagesize
1353 nextfirstid = pagenum * self._pagesize + self._pagesize
1354 if start >= nextfirstid:
1357 page_results = list(self._pagefunc(pagenum))
1360 start % self._pagesize
1361 if firstid <= start < nextfirstid
1365 ((end - 1) % self._pagesize) + 1
1366 if (end is not None and firstid <= end <= nextfirstid)
1369 if startv != 0 or endv is not None:
1370 page_results = page_results[startv:endv]
1371 res.extend(page_results)
1373 # A little optimization - if current page is not "full", ie. does
1374 # not contain page_size videos then we can assume that this page
1375 # is the last one - there are no more ids on further pages -
1376 # i.e. no need to query again.
1377 if len(page_results) + startv < self._pagesize:
1380 # If we got the whole page, but the next page is not interesting,
1381 # break out early as well
1382 if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the elements in [start, end) across the known pages."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements to drop from the first fetched page only.
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode uppercase \\UXXXXXXXX escape sequences found in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() needs a UTF-8 byte string; the short-circuit keeps
    # the py2-only `unicode` name from being evaluated on Python 3.
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_bytes:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component separately, then reassemble the URL.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Probe once at import time whether struct accepts a text format spec.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping comments and blank lines.

    The file object is closed afterwards."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 BOM that may precede the first line.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with '#', ';' or ']' are comments.
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter is only available from Python 2.7 on; fall back to findall
# (which, unlike iter(), does not yield the element itself).
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse an XML string, ignoring doctypes, and return the root Element."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int.

    None passes through; other unrecognized strings are looked up in the
    US_RATINGS table (e.g. MPAA ratings)."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parentheses, trailing ';' and
    line comments) from *code*, leaving the bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON.

    Re-quotes single-quoted strings and bare identifiers, and removes
    trailing commas before ']'."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            v = v[1:-1]
            # Translate JS single-quote escapes to their JSON equivalents.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality id ranks below every known one.
            return -1
    return q
1558 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the result, including the ellipses, fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a version string like '2015.01.23-1' into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Compare dotted version strings: True iff *version* is older than *limit*.

    Empty or unparseable input yields `not assume_new`."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updateable when running from the zip bundle or a frozen executable.
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    """Return a short shell-quoted string representation for a subprocess command."""
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
def urlhandle_detect_ext(url_handle):
    """Guess the file extension for a URL response handle.

    Prefers the filename in Content-Disposition; falls back to the
    Content-Type subtype."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return getheader('Content-Type').split("/")[1]
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM found: assume UTF-8.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)