2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
40 compat_etree_fromstring,
45 compat_socket_create_connection,
49 compat_urllib_parse_urlparse,
50 compat_urllib_request,
56 # This is not clearly defined otherwise
57 compiled_regex_type = type(re.compile(''))
60 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
61 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
62 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
63 'Accept-Encoding': 'gzip, deflate',
64 'Accept-Language': 'en-us,en;q=0.5',
70 ENGLISH_MONTH_NAMES = [
71 'January', 'February', 'March', 'April', 'May', 'June',
72 'July', 'August', 'September', 'October', 'November', 'December']
75 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
76 'flv', 'f4v', 'f4a', 'f4b',
77 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
87 'f4f', 'f4m', 'm3u8', 'smil')
90 def preferredencoding():
91 """Get preferred encoding.
93 Returns the best encoding scheme for the system, based on
94 locale.getpreferredencoding() and some further tweaks.
97 pref = locale.getpreferredencoding()
105 def write_json_file(obj, fn):
106 """ Encode obj as JSON and write it to fn, atomically if possible """
108 fn = encodeFilename(fn)
109 if sys.version_info < (3, 0) and sys.platform != 'win32':
110 encoding = get_filesystem_encoding()
111 # os.path.basename returns a bytes object, but NamedTemporaryFile
112 # will fail if the filename contains non ascii characters unless we
113 # use a unicode object
114 path_basename = lambda f: os.path.basename(fn).decode(encoding)
115 # the same for os.path.dirname
116 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
118 path_basename = os.path.basename
119 path_dirname = os.path.dirname
123 'prefix': path_basename(fn) + '.',
124 'dir': path_dirname(fn),
128 # In Python 2.x, json.dump expects a bytestream.
129 # In Python 3.x, it writes to a character stream
130 if sys.version_info < (3, 0):
138 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
143 if sys.platform == 'win32':
144 # Need to remove existing file on Windows, else os.rename raises
145 # WindowsError or FileExistsError.
150 os.rename(tf.name, fn)
159 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    # Only plain attribute names may be interpolated into the expression.
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = "[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
166 def find_xpath_attr(node, xpath, key, val=None):
167 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
168 # .//node does not match if a node is a direct child of . !
169 if isinstance(xpath, compat_str):
170 xpath = xpath.encode('ascii')
172 for f in node.findall(xpath):
173 if key not in f.attrib:
175 if val is None or f.attrib.get(key) == val:
179 # On python2.6 the xml.etree.ElementTree.Element methods don't support
180 # the namespace parameter
183 def xpath_with_ns(path, ns_map):
184 components = [c.split(':') for c in path.split('/')]
188 replaced.append(c[0])
191 replaced.append('{%s}%s' % (ns_map[ns], tag))
192 return '/'.join(replaced)
195 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
196 def _find_xpath(xpath):
197 if sys.version_info < (2, 7): # Crazy 2.6
198 xpath = xpath.encode('ascii')
199 return node.find(xpath)
201 if isinstance(xpath, (str, compat_str)):
202 n = _find_xpath(xpath)
210 if default is not NO_DEFAULT:
213 name = xpath if name is None else name
214 raise ExtractorError('Could not find XML element %s' % name)
220 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
221 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
222 if n is None or n == default:
225 if default is not NO_DEFAULT:
228 name = xpath if name is None else name
229 raise ExtractorError('Could not find XML element\'s text %s' % name)
235 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
236 n = find_xpath_attr(node, xpath, key)
238 if default is not NO_DEFAULT:
241 name = '%s[@%s]' % (xpath, key) if name is None else name
242 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An element id is just an attribute lookup; delegate to the generic helper.
    content = get_element_by_attribute('id', id, html)
    return content
253 def get_element_by_attribute(attribute, value, html):
254 """Return the content of the tag with the specified attribute in the passed HTML document"""
256 m = re.search(r'''(?xs)
258 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
260 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
264 ''' % (re.escape(attribute), re.escape(value)), html)
268 res = m.group('content')
270 if res.startswith('"') or res.startswith("'"):
273 return unescapeHTML(res)
276 def clean_html(html):
277 """Clean an HTML snippet into a readable string"""
279 if html is None: # Convenience for sanitizing descriptions etc.
283 html = html.replace('\n', ' ')
284 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
285 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
287 html = re.sub('<.*?>', '', html)
288 # Replace html entities
289 html = unescapeHTML(html)
293 def sanitize_open(filename, open_mode):
294 """Try to open the given filename, and slightly tweak it if this fails.
296 Attempts to open the given filename. If this fails, it tries to change
297 the filename slightly, step by step, until it's either able to open it
298 or it fails and raises a final exception, like the standard open()
301 It returns the tuple (stream, definitive_file_name).
305 if sys.platform == 'win32':
307 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
308 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
309 stream = open(encodeFilename(filename), open_mode)
310 return (stream, filename)
311 except (IOError, OSError) as err:
312 if err.errno in (errno.EACCES,):
315 # In case of error, try to remove win32 forbidden chars
316 alt_filename = sanitize_path(filename)
317 if alt_filename == filename:
320 # An exception here should be caught in the caller
321 stream = open(encodeFilename(alt_filename), open_mode)
322 return (stream, alt_filename)
325 def timeconvert(timestr):
326 """Convert RFC 2822 defined time string into system timestamp"""
328 timetuple = email.utils.parsedate_tz(timestr)
329 if timetuple is not None:
330 timestamp = email.utils.mktime_tz(timetuple)
334 def sanitize_filename(s, restricted=False, is_id=False):
335 """Sanitizes a string so it could be used as part of a filename.
336 If restricted is set, use a stricter subset of allowed characters.
337 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
339 def replace_insane(char):
340 if char == '?' or ord(char) < 32 or ord(char) == 127:
343 return '' if restricted else '\''
345 return '_-' if restricted else ' -'
346 elif char in '\\/|*<>':
348 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
350 if restricted and ord(char) > 127:
355 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
356 result = ''.join(map(replace_insane, s))
358 while '__' in result:
359 result = result.replace('__', '_')
360 result = result.strip('_')
361 # Common case of "Foreign band name - English song title"
362 if restricted and result.startswith('-_'):
364 if result.startswith('-'):
365 result = '_' + result[len('-'):]
366 result = result.lstrip('.')
372 def sanitize_path(s):
373 """Sanitizes and normalizes path on Windows"""
374 if sys.platform != 'win32':
376 drive_or_unc, _ = os.path.splitdrive(s)
377 if sys.version_info < (2, 7) and not drive_or_unc:
378 drive_or_unc, _ = os.path.splitunc(s)
379 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
383 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
384 for path_part in norm_path]
386 sanitized_path.insert(0, drive_or_unc + os.path.sep)
387 return os.path.join(*sanitized_path)
390 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
391 # unwanted failures due to missing protocol
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request, prepending the 'http:' scheme
    to protocol-relative URLs ('//host/...') to mitigate the number of
    unwanted failures due to a missing protocol."""
    if url.startswith('//'):
        url = 'http:%s' % url
    return compat_urllib_request.Request(url, *args, **kwargs)
397 def orderedSet(iterable):
398 """ Remove all duplicates from the input iterable """
406 def _htmlentity_transform(entity):
407 """Transforms an HTML entity to a character."""
408 # Known non-numeric HTML entity
409 if entity in compat_html_entities.name2codepoint:
410 return compat_chr(compat_html_entities.name2codepoint[entity])
412 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
414 numstr = mobj.group(1)
415 if numstr.startswith('x'):
417 numstr = '0%s' % numstr
420 # See https://github.com/rg3/youtube-dl/issues/7518
422 return compat_chr(int(numstr, base))
426 # Unknown entity in name, return its literal representation
427 return '&%s;' % entity
433 assert type(s) == compat_str
436 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
439 def get_subprocess_encoding():
440 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
441 # For subprocess calls, encode with locale encoding
442 # Refer to http://stackoverflow.com/a/9951851/35070
443 encoding = preferredencoding()
445 encoding = sys.getfilesystemencoding()
451 def encodeFilename(s, for_subprocess=False):
453 @param s The name of the file
456 assert type(s) == compat_str
458 # Python 3 has a Unicode API
459 if sys.version_info >= (3, 0):
462 # Pass '' directly to use Unicode APIs on Windows 2000 and up
463 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
464 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
465 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
468 return s.encode(get_subprocess_encoding(), 'ignore')
471 def decodeFilename(b, for_subprocess=False):
473 if sys.version_info >= (3, 0):
476 if not isinstance(b, bytes):
479 return b.decode(get_subprocess_encoding(), 'ignore')
482 def encodeArgument(s):
483 if not isinstance(s, compat_str):
484 # Legacy code that uses byte strings
485 # Uncomment the following line after fixing all post processors
486 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
487 s = s.decode('ascii')
488 return encodeFilename(s, True)
def decodeArgument(b):
    """Decode a command-line argument value using the subprocess encoding."""
    return decodeFilename(b, for_subprocess=True)
495 def decodeOption(optval):
498 if isinstance(optval, bytes):
499 optval = optval.decode(preferredencoding())
501 assert isinstance(optval, compat_str)
505 def formatSeconds(secs):
507 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
509 return '%d:%02d' % (secs // 60, secs % 60)
514 def make_HTTPS_handler(params, **kwargs):
515 opts_no_check_certificate = params.get('nocheckcertificate', False)
516 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
517 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
518 if opts_no_check_certificate:
519 context.check_hostname = False
520 context.verify_mode = ssl.CERT_NONE
522 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
525 # (create_default_context present but HTTPSHandler has no context=)
528 if sys.version_info < (3, 2):
529 return YoutubeDLHTTPSHandler(params, **kwargs)
531 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
532 context.verify_mode = (ssl.CERT_NONE
533 if opts_no_check_certificate
534 else ssl.CERT_REQUIRED)
535 context.set_default_verify_paths()
536 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
539 def bug_reports_message():
540 if ytdl_is_updateable():
541 update_cmd = 'type youtube-dl -U to update'
543 update_cmd = 'see https://yt-dl.org/update on how to update'
544 msg = '; please report this issue on https://yt-dl.org/bug .'
545 msg += ' Make sure you are using the latest version; %s.' % update_cmd
546 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
550 class ExtractorError(Exception):
551 """Error during info extraction."""
553 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
554 """ tb, if given, is the original traceback (so that it can be printed out).
555 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
558 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
560 if video_id is not None:
561 msg = video_id + ': ' + msg
563 msg += ' (caused by %r)' % cause
565 msg += bug_reports_message()
566 super(ExtractorError, self).__init__(msg)
569 self.exc_info = sys.exc_info() # preserve original exception
571 self.video_id = video_id
573 def format_traceback(self):
574 if self.traceback is None:
576 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the supplied URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal condition, not a youtube-dl bug.
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match (an expected pattern was not found
    in the downloaded data)"""
591 class DownloadError(Exception):
592 """Download Error exception.
594 This exception may be thrown by FileDownloader objects if they are not
595 configured to continue on errors. They will contain the appropriate
599 def __init__(self, msg, exc_info=None):
600 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
601 super(DownloadError, self).__init__(msg)
602 self.exc_info = exc_info
605 class SameFileError(Exception):
606 """Same File exception.
608 This exception will be thrown by FileDownloader objects if they detect
609 multiple files would have to be downloaded to the same file on disk.
614 class PostProcessingError(Exception):
615 """Post Processing exception.
617 This exception may be raised by PostProcessor's .run() method to
618 indicate an error in the postprocessing task.
621 def __init__(self, msg):
625 class MaxDownloadsReached(Exception):
626 """ --max-downloads limit has been reached. """
630 class UnavailableVideoError(Exception):
631 """Unavailable Format exception.
633 This exception will be thrown when a video is requested
634 in a format that is not available for that video.
639 class ContentTooShortError(Exception):
640 """Content Too Short exception.
642 This exception may be raised by FileDownloader objects when a file they
643 download is too small for what the server announced first, indicating
644 the connection was probably interrupted.
647 def __init__(self, downloaded, expected):
649 self.downloaded = downloaded
650 self.expected = expected
653 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
654 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
655 # expected HTTP responses to meet HTTP/1.0 or later (see also
656 # https://github.com/rg3/youtube-dl/issues/6727)
657 if sys.version_info < (3, 0):
658 kwargs[b'strict'] = True
659 hc = http_class(*args, **kwargs)
660 source_address = ydl_handler._params.get('source_address')
661 if source_address is not None:
662 sa = (source_address, 0)
663 if hasattr(hc, 'source_address'): # Python 2.7+
664 hc.source_address = sa
666 def _hc_connect(self, *args, **kwargs):
667 sock = compat_socket_create_connection(
668 (self.host, self.port), self.timeout, sa)
670 self.sock = ssl.wrap_socket(
671 sock, self.key_file, self.cert_file,
672 ssl_version=ssl.PROTOCOL_TLSv1)
675 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Translate internal 'Youtubedl-*' pseudo headers into real ones.

    When 'Youtubedl-no-compression' is present, every Accept-Encoding header
    (matched case-insensitively) is dropped and the marker itself is removed
    before the request goes out; otherwise *headers* is returned untouched.
    """
    filtered = headers

    if 'Youtubedl-no-compression' in filtered:
        filtered = dict(
            (key, value) for key, value in filtered.items()
            if key.lower() != 'accept-encoding')
        del filtered['Youtubedl-no-compression']

    return filtered
690 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
691 """Handler for HTTP requests and responses.
693 This class, when installed with an OpenerDirector, automatically adds
694 the standard headers to every HTTP request and handles gzipped and
695 deflated responses from web servers. If compression is to be avoided in
696 a particular request, the original request in the program code only has
697 to include the HTTP header "Youtubedl-no-compression", which will be
698 removed before making the real request.
700 Part of this code was copied from:
702 http://techknack.net/python-urllib2-handlers/
704 Andrew Rowls, the author of that code, agreed to release it to the
708 def __init__(self, params, *args, **kwargs):
709 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
710 self._params = params
712 def http_open(self, req):
713 return self.do_open(functools.partial(
714 _create_http_connection, self, compat_http_client.HTTPConnection, False),
720 return zlib.decompress(data, -zlib.MAX_WBITS)
722 return zlib.decompress(data)
725 def addinfourl_wrapper(stream, headers, url, code):
726 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
727 return compat_urllib_request.addinfourl(stream, headers, url, code)
728 ret = compat_urllib_request.addinfourl(stream, headers, url)
732 def http_request(self, req):
733 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
734 # always respected by websites, some tend to give out URLs with non percent-encoded
735 # non-ASCII characters (see telemb.py, ard.py [#3412])
736 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
737 # To work around aforementioned issue we will replace request's original URL with
738 # percent-encoded one
739 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
740 # the code of this workaround has been moved here from YoutubeDL.urlopen()
741 url = req.get_full_url()
742 url_escaped = escape_url(url)
744 # Substitute URL if any change after escaping
745 if url != url_escaped:
746 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
748 url_escaped, data=req.data, headers=req.headers,
749 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
750 new_req.timeout = req.timeout
753 for h, v in std_headers.items():
754 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
755 # The dict keys are capitalized because of this bug by urllib
756 if h.capitalize() not in req.headers:
759 req.headers = handle_youtubedl_headers(req.headers)
761 if sys.version_info < (2, 7) and '#' in req.get_full_url():
762 # Python 2.6 is brain-dead when it comes to fragments
763 req._Request__original = req._Request__original.partition('#')[0]
764 req._Request__r_type = req._Request__r_type.partition('#')[0]
768 def http_response(self, req, resp):
771 if resp.headers.get('Content-encoding', '') == 'gzip':
772 content = resp.read()
773 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
775 uncompressed = io.BytesIO(gz.read())
776 except IOError as original_ioerror:
# There may be junk at the end of the file
778 # See http://stackoverflow.com/q/4928560/35070 for details
779 for i in range(1, 1024):
781 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
782 uncompressed = io.BytesIO(gz.read())
787 raise original_ioerror
788 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
789 resp.msg = old_resp.msg
790 del resp.headers['Content-encoding']
792 if resp.headers.get('Content-encoding', '') == 'deflate':
793 gz = io.BytesIO(self.deflate(resp.read()))
794 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
795 resp.msg = old_resp.msg
796 del resp.headers['Content-encoding']
797 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
798 # https://github.com/rg3/youtube-dl/issues/6457).
799 if 300 <= resp.code < 400:
800 location = resp.headers.get('Location')
802 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
803 if sys.version_info >= (3, 0):
804 location = location.encode('iso-8859-1').decode('utf-8')
805 location_escaped = escape_url(location)
806 if location != location_escaped:
807 del resp.headers['Location']
808 resp.headers['Location'] = location_escaped
811 https_request = http_request
812 https_response = http_response
815 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
816 def __init__(self, params, https_conn_class=None, *args, **kwargs):
817 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
818 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
819 self._params = params
821 def https_open(self, req):
823 if hasattr(self, '_context'): # python > 2.6
824 kwargs['context'] = self._context
825 if hasattr(self, '_check_hostname'): # python 3.x
826 kwargs['check_hostname'] = self._check_hostname
827 return self.do_open(functools.partial(
828 _create_http_connection, self, self._https_conn_class, True),
832 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        # Plain delegation to the stdlib cookie processor.
        # NOTE(review): cookiejar=None presumably lets the base class create
        # its own CookieJar -- confirm against the stdlib documentation.
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
836 def http_response(self, request, response):
837 # Python 2 will choke on next HTTP request in row if there are non-ASCII
838 # characters in Set-Cookie HTTP header of last response (see
839 # https://github.com/rg3/youtube-dl/issues/6769).
840 # In order to at least prevent crashing we will percent encode Set-Cookie
841 # header before HTTPCookieProcessor starts processing it.
842 # if sys.version_info < (3, 0) and response.headers:
843 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
844 # set_cookie = response.headers.get(set_cookie_header)
846 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
847 # if set_cookie != set_cookie_escaped:
848 # del response.headers[set_cookie_header]
849 # response.headers[set_cookie_header] = set_cookie_escaped
850 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
852 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
853 https_response = http_response
856 def parse_iso8601(date_str, delimiter='T', timezone=None):
857 """ Return a UNIX timestamp from the given date """
862 date_str = re.sub(r'\.[0-9]+', '', date_str)
866 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
869 timezone = datetime.timedelta()
871 date_str = date_str[:-len(m.group(0))]
872 if not m.group('sign'):
873 timezone = datetime.timedelta()
875 sign = 1 if m.group('sign') == '+' else -1
876 timezone = datetime.timedelta(
877 hours=sign * int(m.group('hours')),
878 minutes=sign * int(m.group('minutes')))
880 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
881 dt = datetime.datetime.strptime(date_str, date_format) - timezone
882 return calendar.timegm(dt.timetuple())
887 def unified_strdate(date_str, day_first=True):
888 """Return a string with the date in the format YYYYMMDD"""
894 date_str = date_str.replace(',', ' ')
895 # %z (UTC offset) is only supported in python>=3.2
896 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
897 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
898 # Remove AM/PM + timezone
899 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
901 format_expressions = [
914 '%Y-%m-%d %H:%M:%S.%f',
917 '%Y-%m-%dT%H:%M:%SZ',
918 '%Y-%m-%dT%H:%M:%S.%fZ',
919 '%Y-%m-%dT%H:%M:%S.%f0Z',
921 '%Y-%m-%dT%H:%M:%S.%f',
925 format_expressions.extend([
933 format_expressions.extend([
940 for expression in format_expressions:
942 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
945 if upload_date is None:
946 timetuple = email.utils.parsedate_tz(date_str)
948 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
949 if upload_date is not None:
950 return compat_str(upload_date)
953 def determine_ext(url, default_ext='unknown_video'):
956 guess = url.partition('?')[0].rpartition('.')[2]
957 if re.match(r'^[A-Za-z0-9]+$', guess):
959 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
960 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
961 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name from a media file name, e.g.
    ('video.mp4', 'en', 'vtt') -> 'video.en.vtt'."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
970 def date_from_str(date_str):
972 Return a datetime object from a string in the format YYYYMMDD or
973 (now|today)[+-][0-9](day|week|month|year)(s)?"""
974 today = datetime.date.today()
975 if date_str in ('now', 'today'):
977 if date_str == 'yesterday':
978 return today - datetime.timedelta(days=1)
979 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
980 if match is not None:
981 sign = match.group('sign')
982 time = int(match.group('time'))
985 unit = match.group('unit')
986 # A bad approximation?
994 delta = datetime.timedelta(**{unit: time})
996 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
999 def hyphenate_date(date_str):
1001 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1002 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1003 if match is not None:
1004 return '-'.join(match.groups())
1009 class DateRange(object):
1010 """Represents a time interval between two dates"""
1012 def __init__(self, start=None, end=None):
1013 """start and end must be strings in the format accepted by date"""
1014 if start is not None:
1015 self.start = date_from_str(start)
1017 self.start = datetime.datetime.min.date()
1019 self.end = date_from_str(end)
1021 self.end = datetime.datetime.max.date()
1022 if self.start > self.end:
1023 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1027 """Returns a range that only contains the given day"""
1028 return cls(day, day)
1030 def __contains__(self, date):
1031 """Check if the date is in the range"""
1032 if not isinstance(date, datetime.date):
1033 date = date_from_str(date)
1034 return self.start <= date <= self.end
1037 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1040 def platform_name():
1041 """ Returns the platform name as a compat_str """
1042 res = platform.platform()
1043 if isinstance(res, bytes):
1044 res = res.decode(preferredencoding())
1046 assert isinstance(res, compat_str)
1050 def _windows_write_string(s, out):
1051 """ Returns True if the string was written using special methods,
1052 False if it has yet to be written out."""
1053 # Adapted from http://stackoverflow.com/a/3259271/35070
1056 import ctypes.wintypes
1064 fileno = out.fileno()
1065 except AttributeError:
1066 # If the output stream doesn't have a fileno, it's virtual
1068 except io.UnsupportedOperation:
1069 # Some strange Windows pseudo files?
1071 if fileno not in WIN_OUTPUT_IDS:
1074 GetStdHandle = ctypes.WINFUNCTYPE(
1075 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1076 (b'GetStdHandle', ctypes.windll.kernel32))
1077 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1079 WriteConsoleW = ctypes.WINFUNCTYPE(
1080 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1081 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1082 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1083 written = ctypes.wintypes.DWORD(0)
1085 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1086 FILE_TYPE_CHAR = 0x0002
1087 FILE_TYPE_REMOTE = 0x8000
1088 GetConsoleMode = ctypes.WINFUNCTYPE(
1089 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1090 ctypes.POINTER(ctypes.wintypes.DWORD))(
1091 (b'GetConsoleMode', ctypes.windll.kernel32))
1092 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1094 def not_a_console(handle):
1095 if handle == INVALID_HANDLE_VALUE or handle is None:
1097 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1098 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1100 if not_a_console(h):
1103 def next_nonbmp_pos(s):
1105 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1106 except StopIteration:
1110 count = min(next_nonbmp_pos(s), 1024)
1112 ret = WriteConsoleW(
1113 h, s, count if count else 2, ctypes.byref(written), None)
1115 raise OSError('Failed to write string')
1116 if not count: # We just wrote a non-BMP character
1117 assert written.value == 2
1120 assert written.value > 0
1121 s = s[written.value:]
1125 def write_string(s, out=None, encoding=None):
1128 assert type(s) == compat_str
1130 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1131 if _windows_write_string(s, out):
1134 if ('b' in getattr(out, 'mode', '') or
1135 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1136 byt = s.encode(encoding or preferredencoding(), 'ignore')
1138 elif hasattr(out, 'buffer'):
1139 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1140 byt = s.encode(enc, 'ignore')
1141 out.buffer.write(byt)
1147 def bytes_to_intlist(bs):
1150 if isinstance(bs[0], int): # Python 3
1153 return [ord(c) for c in bs]
1156 def intlist_to_bytes(xs):
1159 return struct_pack('%dB' % len(xs), *xs)
1162 # Cross-platform file locking
1163 if sys.platform == 'win32':
1164 import ctypes.wintypes
1167 class OVERLAPPED(ctypes.Structure):
1169 ('Internal', ctypes.wintypes.LPVOID),
1170 ('InternalHigh', ctypes.wintypes.LPVOID),
1171 ('Offset', ctypes.wintypes.DWORD),
1172 ('OffsetHigh', ctypes.wintypes.DWORD),
1173 ('hEvent', ctypes.wintypes.HANDLE),
1176 kernel32 = ctypes.windll.kernel32
1177 LockFileEx = kernel32.LockFileEx
1178 LockFileEx.argtypes = [
1179 ctypes.wintypes.HANDLE, # hFile
1180 ctypes.wintypes.DWORD, # dwFlags
1181 ctypes.wintypes.DWORD, # dwReserved
1182 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1183 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1184 ctypes.POINTER(OVERLAPPED) # Overlapped
1186 LockFileEx.restype = ctypes.wintypes.BOOL
1187 UnlockFileEx = kernel32.UnlockFileEx
1188 UnlockFileEx.argtypes = [
1189 ctypes.wintypes.HANDLE, # hFile
1190 ctypes.wintypes.DWORD, # dwReserved
1191 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1192 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1193 ctypes.POINTER(OVERLAPPED) # Overlapped
1195 UnlockFileEx.restype = ctypes.wintypes.BOOL
1196 whole_low = 0xffffffff
1197 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    # Win32 LockFileEx needs an OVERLAPPED struct; Offset/OffsetHigh of 0
    # start the lock region at the beginning of the file.
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Stash the pointer on the file object: _unlock_file must pass the very
    # same OVERLAPPED when releasing the lock.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    # 0x2 == LOCKFILE_EXCLUSIVE_LOCK (0x0 requests a shared lock);
    # whole_low/whole_high give the byte range to lock.
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    # Relies on the OVERLAPPED pointer stashed on the file by _lock_file.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    if not UnlockFileEx(handle, 0,
                        whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1220 def _lock_file(f, exclusive):
1221 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1223 def _unlock_file(f):
1224 fcntl.flock(f, fcntl.LOCK_UN)
1227 class locked_file(object):
1228 def __init__(self, filename, mode, encoding=None):
1229 assert mode in ['r', 'a', 'w']
1230 self.f = io.open(filename, mode, encoding=encoding)
1233 def __enter__(self):
1234 exclusive = self.mode != 'r'
1236 _lock_file(self.f, exclusive)
1242 def __exit__(self, etype, value, traceback):
1244 _unlock_file(self.f)
1251 def write(self, *args):
1252 return self.f.write(*args)
1254 def read(self, *args):
1255 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when the
    interpreter reports no encoding (None)."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1263 def shell_quote(args):
1265 encoding = get_filesystem_encoding()
1267 if isinstance(a, bytes):
1268 # We may get a filename encoded with 'encodeFilename'
1269 a = a.decode(encoding)
1270 quoted_args.append(pipes.quote(a))
1271 return ' '.join(quoted_args)
1274 def smuggle_url(url, data):
1275 """ Pass additional data in a URL for internal use. """
1277 sdata = compat_urllib_parse.urlencode(
1278 {'__youtubedl_smuggle': json.dumps(data)})
1279 return url + '#' + sdata
1282 def unsmuggle_url(smug_url, default=None):
1283 if '#__youtubedl_smuggle' not in smug_url:
1284 return smug_url, default
1285 url, _, sdata = smug_url.rpartition('#')
1286 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1287 data = json.loads(jsond)
1291 def format_bytes(bytes):
1294 if type(bytes) is str:
1295 bytes = float(bytes)
1299 exponent = int(math.log(bytes, 1024.0))
1300 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1301 converted = float(bytes) / float(1024 ** exponent)
1302 return '%.2f%s' % (converted, suffix)
1305 def parse_filesize(s):
1309 # The lower-case forms are of course incorrect and unofficial,
1310 # but we support those too
1348 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1350 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1354 num_str = m.group('num').replace(',', '.')
1355 mult = _UNIT_TABLE[m.group('unit')]
1356 return int(float(num_str) * mult)
1359 def month_by_name(name):
1360 """ Return the number of a month by (locale-independently) English name """
1363 return ENGLISH_MONTH_NAMES.index(name) + 1
1368 def month_by_abbreviation(abbrev):
1369 """ Return the number of a month by (locale-independently) English
1373 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1378 def fix_xml_ampersands(xml_str):
1379 """Replace all the '&' by '&' in XML"""
1381 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1386 def setproctitle(title):
1387 assert isinstance(title, compat_str)
1389 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1392 title_bytes = title.encode('utf-8')
1393 buf = ctypes.create_string_buffer(len(title_bytes))
1394 buf.value = title_bytes
1396 libc.prctl(15, buf, 0, 0, 0)
1397 except AttributeError:
1398 return # Strange libc, just skip this
1401 def remove_start(s, start):
1402 if s.startswith(start):
1403 return s[len(start):]
1407 def remove_end(s, end):
1409 return s[:-len(end)]
1413 def remove_quotes(s):
1414 if s is None or len(s) < 2:
1416 for quote in ('"', "'", ):
1417 if s[0] == quote and s[-1] == quote:
1422 def url_basename(url):
1423 path = compat_urlparse.urlparse(url).path
1424 return path.strip('/').split('/')[-1]
1427 class HEADRequest(compat_urllib_request.Request):
1428 def get_method(self):
1432 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1435 v = getattr(v, get_attr, None)
1441 return int(v) * invscale // scale
1446 def str_or_none(v, default=None):
1447 return default if v is None else compat_str(v)
1450 def str_to_int(int_str):
1451 """ A more relaxed version of int_or_none """
1454 int_str = re.sub(r'[,\.\+]', '', int_str)
1458 def float_or_none(v, scale=1, invscale=1, default=None):
1462 return float(v) * invscale / scale
1467 def parse_duration(s):
1468 if not isinstance(s, compat_basestring):
1476 (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
1477 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1479 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
1482 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1483 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1485 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1487 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1492 if m.group('only_mins'):
1493 return float_or_none(m.group('only_mins'), invscale=60)
1494 if m.group('only_hours'):
1495 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1497 res += int(m.group('secs'))
1498 if m.group('mins_reversed'):
1499 res += int(m.group('mins_reversed')) * 60
1501 res += int(m.group('mins')) * 60
1502 if m.group('hours'):
1503 res += int(m.group('hours')) * 60 * 60
1504 if m.group('hours_reversed'):
1505 res += int(m.group('hours_reversed')) * 60 * 60
1507 res += int(m.group('days')) * 24 * 60 * 60
1509 res += float(m.group('ms'))
1513 def prepend_extension(filename, ext, expected_real_ext=None):
1514 name, real_ext = os.path.splitext(filename)
1516 '{0}.{1}{2}'.format(name, ext, real_ext)
1517 if not expected_real_ext or real_ext[1:] == expected_real_ext
1518 else '{0}.{1}'.format(filename, ext))
1521 def replace_extension(filename, ext, expected_real_ext=None):
1522 name, real_ext = os.path.splitext(filename)
1523 return '{0}.{1}'.format(
1524 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1528 def check_executable(exe, args=[]):
1529 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1530 args can be a list of arguments for a short output (like -version) """
1532 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1538 def get_exe_version(exe, args=['--version'],
1539 version_re=None, unrecognized='present'):
1540 """ Returns the version of the specified executable,
1541 or False if the executable is not present """
1543 out, _ = subprocess.Popen(
1544 [encodeArgument(exe)] + args,
1545 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1548 if isinstance(out, bytes): # Python 2.x
1549 out = out.decode('ascii', 'ignore')
1550 return detect_exe_version(out, version_re, unrecognized)
1553 def detect_exe_version(output, version_re=None, unrecognized='present'):
1554 assert isinstance(output, compat_str)
1555 if version_re is None:
1556 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1557 m = re.search(version_re, output)
1564 class PagedList(object):
1566 # This is only useful for tests
1567 return len(self.getslice())
1570 class OnDemandPagedList(PagedList):
1571 def __init__(self, pagefunc, pagesize, use_cache=False):
1572 self._pagefunc = pagefunc
1573 self._pagesize = pagesize
1574 self._use_cache = use_cache
1578 def getslice(self, start=0, end=None):
1580 for pagenum in itertools.count(start // self._pagesize):
1581 firstid = pagenum * self._pagesize
1582 nextfirstid = pagenum * self._pagesize + self._pagesize
1583 if start >= nextfirstid:
1588 page_results = self._cache.get(pagenum)
1589 if page_results is None:
1590 page_results = list(self._pagefunc(pagenum))
1592 self._cache[pagenum] = page_results
1595 start % self._pagesize
1596 if firstid <= start < nextfirstid
1600 ((end - 1) % self._pagesize) + 1
1601 if (end is not None and firstid <= end <= nextfirstid)
1604 if startv != 0 or endv is not None:
1605 page_results = page_results[startv:endv]
1606 res.extend(page_results)
1608 # A little optimization - if current page is not "full", ie. does
1609 # not contain page_size videos then we can assume that this page
1610 # is the last one - there are no more ids on further pages -
1611 # i.e. no need to query again.
1612 if len(page_results) + startv < self._pagesize:
1615 # If we got the whole page, but the next page is not interesting,
1616 # break out early as well
1617 if end == nextfirstid:
1622 class InAdvancePagedList(PagedList):
1623 def __init__(self, pagefunc, pagecount, pagesize):
1624 self._pagefunc = pagefunc
1625 self._pagecount = pagecount
1626 self._pagesize = pagesize
1628 def getslice(self, start=0, end=None):
1630 start_page = start // self._pagesize
1632 self._pagecount if end is None else (end // self._pagesize + 1))
1633 skip_elems = start - start_page * self._pagesize
1634 only_more = None if end is None else end - start
1635 for pagenum in range(start_page, end_page):
1636 page = list(self._pagefunc(pagenum))
1638 page = page[skip_elems:]
1640 if only_more is not None:
1641 if len(page) < only_more:
1642 only_more -= len(page)
1644 page = page[:only_more]
1651 def uppercase_escape(s):
1652 unicode_escape = codecs.getdecoder('unicode_escape')
1654 r'\\U[0-9a-fA-F]{8}',
1655 lambda m: unicode_escape(m.group(0))[0],
1659 def lowercase_escape(s):
1660 unicode_escape = codecs.getdecoder('unicode_escape')
1662 r'\\u[0-9a-fA-F]{4}',
1663 lambda m: unicode_escape(m.group(0))[0],
1667 def escape_rfc3986(s):
1668 """Escape non-ASCII characters as suggested by RFC 3986"""
1669 if sys.version_info < (3, 0) and isinstance(s, compat_str):
1670 s = s.encode('utf-8')
1671 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1674 def escape_url(url):
1675 """Escape URL as suggested by RFC 3986"""
1676 url_parsed = compat_urllib_parse_urlparse(url)
1677 return url_parsed._replace(
1678 path=escape_rfc3986(url_parsed.path),
1679 params=escape_rfc3986(url_parsed.params),
1680 query=escape_rfc3986(url_parsed.query),
1681 fragment=escape_rfc3986(url_parsed.fragment)
1685 struct.pack('!I', 0)
1687 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1688 def struct_pack(spec, *args):
1689 if isinstance(spec, compat_str):
1690 spec = spec.encode('ascii')
1691 return struct.pack(spec, *args)
1693 def struct_unpack(spec, *args):
1694 if isinstance(spec, compat_str):
1695 spec = spec.encode('ascii')
1696 return struct.unpack(spec, *args)
1698 struct_pack = struct.pack
1699 struct_unpack = struct.unpack
1702 def read_batch_urls(batch_fd):
1704 if not isinstance(url, compat_str):
1705 url = url.decode('utf-8', 'replace')
1706 BOM_UTF8 = '\xef\xbb\xbf'
1707 if url.startswith(BOM_UTF8):
1708 url = url[len(BOM_UTF8):]
1710 if url.startswith(('#', ';', ']')):
1714 with contextlib.closing(batch_fd) as fd:
1715 return [url for url in map(fixup, fd) if url]
1718 def urlencode_postdata(*args, **kargs):
1719 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1722 def update_url_query(url, query):
1723 parsed_url = compat_urlparse.urlparse(url)
1724 qs = compat_parse_qs(parsed_url.query)
1726 return compat_urlparse.urlunparse(parsed_url._replace(
1727 query=compat_urllib_parse.urlencode(qs, True)))
1730 def encode_dict(d, encoding='utf-8'):
1732 return v.encode(encoding) if isinstance(v, compat_basestring) else v
1733 return dict((encode(k), encode(v)) for k, v in d.items())
1736 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
1737 if isinstance(key_or_keys, (list, tuple)):
1738 for key in key_or_keys:
1739 if key not in d or d[key] is None or skip_false_values and not d[key]:
1743 return d.get(key_or_keys, default)
1746 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
1747 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
1759 def parse_age_limit(s):
1762 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1763 return int(m.group('age')) if m else US_RATINGS.get(s)
1766 def strip_jsonp(code):
1768 r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1771 def js_to_json(code):
1774 if v in ('true', 'false', 'null'):
1776 if v.startswith('"'):
1777 v = re.sub(r"\\'", "'", v[1:-1])
1778 elif v.startswith("'"):
1780 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1787 res = re.sub(r'''(?x)
1788 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1789 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1790 [a-zA-Z_][.a-zA-Z_0-9]*
1792 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1796 def qualities(quality_ids):
1797 """ Get a numeric quality value out of a list of possible values """
1800 return quality_ids.index(qid)
1806 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1809 def limit_length(s, length):
1810 """ Add ellipses to overly long strings """
1815 return s[:length - len(ELLIPSES)] + ELLIPSES
1819 def version_tuple(v):
1820 return tuple(int(e) for e in re.split(r'[-.]', v))
1823 def is_outdated_version(version, limit, assume_new=True):
1825 return not assume_new
1827 return version_tuple(version) < version_tuple(limit)
1829 return not assume_new
1832 def ytdl_is_updateable():
1833 """ Returns if youtube-dl can be updated with -U """
1834 from zipimport import zipimporter
1836 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1839 def args_to_str(args):
1840 # Get a short string representation for a subprocess command
1841 return ' '.join(shlex_quote(a) for a in args)
1844 def error_to_compat_str(err):
1846 # On python 2 error byte string must be decoded with proper
1847 # encoding rather than ascii
1848 if sys.version_info[0] < 3:
1849 err_str = err_str.decode(preferredencoding())
1853 def mimetype2ext(mt):
1860 _, _, res = mt.rpartition('/')
1864 'smptett+xml': 'tt',
1870 'x-mp4-fragmented': 'mp4',
1875 def urlhandle_detect_ext(url_handle):
1878 getheader = lambda h: url_handle.headers[h]
1879 except AttributeError: # Python < 3
1880 getheader = url_handle.info().getheader
1882 cd = getheader('Content-Disposition')
1884 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1886 e = determine_ext(m.group('filename'), default_ext=None)
1890 return mimetype2ext(getheader('Content-Type'))
1893 def encode_data_uri(data, mime_type):
1894 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
1897 def age_restricted(content_limit, age_limit):
1898 """ Returns True iff the content should be blocked """
1900 if age_limit is None: # No limit set
1902 if content_limit is None:
1903 return False # Content available for everyone
1904 return age_limit < content_limit
1907 def is_html(first_bytes):
1908 """ Detect whether a file contains HTML by examining its first bytes. """
1911 (b'\xef\xbb\xbf', 'utf-8'),
1912 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1913 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1914 (b'\xff\xfe', 'utf-16-le'),
1915 (b'\xfe\xff', 'utf-16-be'),
1917 for bom, enc in BOMS:
1918 if first_bytes.startswith(bom):
1919 s = first_bytes[len(bom):].decode(enc, 'replace')
1922 s = first_bytes.decode('utf-8', 'replace')
1924 return re.match(r'^\s*<', s)
1927 def determine_protocol(info_dict):
1928 protocol = info_dict.get('protocol')
1929 if protocol is not None:
1932 url = info_dict['url']
1933 if url.startswith('rtmp'):
1935 elif url.startswith('mms'):
1937 elif url.startswith('rtsp'):
1940 ext = determine_ext(url)
1946 return compat_urllib_parse_urlparse(url).scheme
1949 def render_table(header_row, data):
1950 """ Render a list of rows, each as a list of values """
1951 table = [header_row] + data
1952 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1953 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1954 return '\n'.join(format_str % tuple(row) for row in table)
1957 def _match_one(filter_part, dct):
1958 COMPARISON_OPERATORS = {
1966 operator_rex = re.compile(r'''(?x)\s*
1968 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1970 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1971 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1974 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1975 m = operator_rex.search(filter_part)
1977 op = COMPARISON_OPERATORS[m.group('op')]
1978 if m.group('strval') is not None:
1979 if m.group('op') not in ('=', '!='):
1981 'Operator %s does not support string values!' % m.group('op'))
1982 comparison_value = m.group('strval')
1985 comparison_value = int(m.group('intval'))
1987 comparison_value = parse_filesize(m.group('intval'))
1988 if comparison_value is None:
1989 comparison_value = parse_filesize(m.group('intval') + 'B')
1990 if comparison_value is None:
1992 'Invalid integer value %r in filter part %r' % (
1993 m.group('intval'), filter_part))
1994 actual_value = dct.get(m.group('key'))
1995 if actual_value is None:
1996 return m.group('none_inclusive')
1997 return op(actual_value, comparison_value)
2000 '': lambda v: v is not None,
2001 '!': lambda v: v is None,
2003 operator_rex = re.compile(r'''(?x)\s*
2004 (?P<op>%s)\s*(?P<key>[a-z_]+)
2006 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2007 m = operator_rex.search(filter_part)
2009 op = UNARY_OPERATORS[m.group('op')]
2010 actual_value = dct.get(m.group('key'))
2011 return op(actual_value)
2013 raise ValueError('Invalid filter part %r' % filter_part)
2016 def match_str(filter_str, dct):
2017 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2020 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2023 def match_filter_func(filter_str):
2024 def _match_func(info_dict):
2025 if match_str(filter_str, info_dict):
2028 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2029 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2033 def parse_dfxp_time_expr(time_expr):
2037 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2039 return float(mobj.group('time_offset'))
2041 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2043 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
2046 def srt_subtitles_timecode(seconds):
2047 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
2050 def dfxp2srt(dfxp_data):
2051 _x = functools.partial(xpath_with_ns, ns_map={
2052 'ttml': 'http://www.w3.org/ns/ttml',
2053 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2056 class TTMLPElementParser(object):
2059 def start(self, tag, attrib):
2060 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2066 def data(self, data):
2070 return self.out.strip()
2072 def parse_node(node):
2073 target = TTMLPElementParser()
2074 parser = xml.etree.ElementTree.XMLParser(target=target)
2075 parser.feed(xml.etree.ElementTree.tostring(node))
2076 return parser.close()
2078 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2080 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
2083 raise ValueError('Invalid dfxp/TTML subtitle')
2085 for para, index in zip(paras, itertools.count(1)):
2086 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2087 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2088 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2089 if begin_time is None:
2094 end_time = begin_time + dur
2095 out.append('%d\n%s --> %s\n%s\n\n' % (
2097 srt_subtitles_timecode(begin_time),
2098 srt_subtitles_timecode(end_time),
2104 def cli_option(params, command_option, param):
2105 param = params.get(param)
2106 return [command_option, param] if param is not None else []
2109 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2110 param = params.get(param)
2111 assert isinstance(param, bool)
2113 return [command_option + separator + (true_value if param else false_value)]
2114 return [command_option, true_value if param else false_value]
2117 def cli_valueless_option(params, command_option, param, expected_value=True):
2118 param = params.get(param)
2119 return [command_option] if param == expected_value else []
2122 def cli_configuration_args(params, param, default=[]):
2123 ex_args = params.get(param)
2126 assert isinstance(ex_args, list)
2130 class ISO639Utils(object):
2131 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2320 def short2long(cls, code):
2321 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2322 return cls._lang_map.get(code[:2])
2325 def long2short(cls, code):
2326 """Convert language code from ISO 639-2/T to ISO 639-1"""
2327 for short_name, long_name in cls._lang_map.items():
2328 if long_name == code:
2332 class ISO3166Utils(object):
2333 # From http://data.okfn.org/data/core/country-list
2335 'AF': 'Afghanistan',
2336 'AX': 'Ã…land Islands',
2339 'AS': 'American Samoa',
2344 'AG': 'Antigua and Barbuda',
2361 'BO': 'Bolivia, Plurinational State of',
2362 'BQ': 'Bonaire, Sint Eustatius and Saba',
2363 'BA': 'Bosnia and Herzegovina',
2365 'BV': 'Bouvet Island',
2367 'IO': 'British Indian Ocean Territory',
2368 'BN': 'Brunei Darussalam',
2370 'BF': 'Burkina Faso',
2376 'KY': 'Cayman Islands',
2377 'CF': 'Central African Republic',
2381 'CX': 'Christmas Island',
2382 'CC': 'Cocos (Keeling) Islands',
2386 'CD': 'Congo, the Democratic Republic of the',
2387 'CK': 'Cook Islands',
2389 'CI': 'Côte d\'Ivoire',
2394 'CZ': 'Czech Republic',
2398 'DO': 'Dominican Republic',
2401 'SV': 'El Salvador',
2402 'GQ': 'Equatorial Guinea',
2406 'FK': 'Falkland Islands (Malvinas)',
2407 'FO': 'Faroe Islands',
2411 'GF': 'French Guiana',
2412 'PF': 'French Polynesia',
2413 'TF': 'French Southern Territories',
2428 'GW': 'Guinea-Bissau',
2431 'HM': 'Heard Island and McDonald Islands',
2432 'VA': 'Holy See (Vatican City State)',
2439 'IR': 'Iran, Islamic Republic of',
2442 'IM': 'Isle of Man',
2452 'KP': 'Korea, Democratic People\'s Republic of',
2453 'KR': 'Korea, Republic of',
2456 'LA': 'Lao People\'s Democratic Republic',
2462 'LI': 'Liechtenstein',
2466 'MK': 'Macedonia, the Former Yugoslav Republic of',
2473 'MH': 'Marshall Islands',
2479 'FM': 'Micronesia, Federated States of',
2480 'MD': 'Moldova, Republic of',
2491 'NL': 'Netherlands',
2492 'NC': 'New Caledonia',
2493 'NZ': 'New Zealand',
2498 'NF': 'Norfolk Island',
2499 'MP': 'Northern Mariana Islands',
2504 'PS': 'Palestine, State of',
2506 'PG': 'Papua New Guinea',
2509 'PH': 'Philippines',
2513 'PR': 'Puerto Rico',
2517 'RU': 'Russian Federation',
2519 'BL': 'Saint Barthélemy',
2520 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2521 'KN': 'Saint Kitts and Nevis',
2522 'LC': 'Saint Lucia',
2523 'MF': 'Saint Martin (French part)',
2524 'PM': 'Saint Pierre and Miquelon',
2525 'VC': 'Saint Vincent and the Grenadines',
2528 'ST': 'Sao Tome and Principe',
2529 'SA': 'Saudi Arabia',
2533 'SL': 'Sierra Leone',
2535 'SX': 'Sint Maarten (Dutch part)',
2538 'SB': 'Solomon Islands',
2540 'ZA': 'South Africa',
2541 'GS': 'South Georgia and the South Sandwich Islands',
2542 'SS': 'South Sudan',
2547 'SJ': 'Svalbard and Jan Mayen',
2550 'CH': 'Switzerland',
2551 'SY': 'Syrian Arab Republic',
2552 'TW': 'Taiwan, Province of China',
2554 'TZ': 'Tanzania, United Republic of',
2556 'TL': 'Timor-Leste',
2560 'TT': 'Trinidad and Tobago',
2563 'TM': 'Turkmenistan',
2564 'TC': 'Turks and Caicos Islands',
2568 'AE': 'United Arab Emirates',
2569 'GB': 'United Kingdom',
2570 'US': 'United States',
2571 'UM': 'United States Minor Outlying Islands',
2575 'VE': 'Venezuela, Bolivarian Republic of',
2577 'VG': 'Virgin Islands, British',
2578 'VI': 'Virgin Islands, U.S.',
2579 'WF': 'Wallis and Futuna',
2580 'EH': 'Western Sahara',
2587 def short2full(cls, code):
2588 """Convert an ISO 3166-2 country code to the corresponding full name"""
2589 return cls._country_map.get(code.upper())
2592 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2593 def __init__(self, proxies=None):
2594 # Set default handlers
2595 for type in ('http', 'https'):
2596 setattr(self, '%s_open' % type,
2597 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2598 meth(r, proxy, type))
2599 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2601 def proxy_open(self, req, proxy, type):
2602 req_proxy = req.headers.get('Ytdl-request-proxy')
2603 if req_proxy is not None:
2605 del req.headers['Ytdl-request-proxy']
2607 if proxy == '__noproxy__':
2608 return None # No Proxy
2609 return compat_urllib_request.ProxyHandler.proxy_open(
2610 self, req, proxy, type)
2613 def ohdave_rsa_encrypt(data, exponent, modulus):
2615 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
2618 data: data to encrypt, bytes-like object
2619 exponent, modulus: parameter e and N of RSA algorithm, both integer
2620 Output: hex string of encrypted data
2622 Limitation: supports one block encryption only
2625 payload = int(binascii.hexlify(data[::-1]), 16)
2626 encrypted = pow(payload, exponent, modulus)
2627 return '%x' % encrypted
2630 def encode_base_n(num, n, table=None):
2631 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
2633 table = FULL_TABLE[:n]
2636 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
2643 ret = table[num % n] + ret
2648 def decode_packed_codes(code):
2650 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2652 obfucasted_code, base, count, symbols = mobj.groups()
2655 symbols = symbols.split('|')
2660 base_n_count = encode_base_n(count, base)
2661 symbol_table[base_n_count] = symbols[count] or base_n_count
2664 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],