2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
42 compat_socket_create_connection,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
# Type of a compiled regular expression. The re module does not expose
# this type under a public name, so derive it from a sample pattern.
compiled_regex_type = type(re.compile(''))
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
# Locale-independent English month names; used to map month names to
# month numbers regardless of the current locale.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
72 def preferredencoding():
73 """Get preferred encoding.
75 Returns the best encoding scheme for the system, based on
76 locale.getpreferredencoding() and some further tweaks.
79 pref = locale.getpreferredencoding()
87 def write_json_file(obj, fn):
88 """ Encode obj as JSON and write it to fn, atomically if possible """
90 fn = encodeFilename(fn)
91 if sys.version_info < (3, 0) and sys.platform != 'win32':
92 encoding = get_filesystem_encoding()
93 # os.path.basename returns a bytes object, but NamedTemporaryFile
94 # will fail if the filename contains non ascii characters unless we
95 # use a unicode object
96 path_basename = lambda f: os.path.basename(fn).decode(encoding)
97 # the same for os.path.dirname
98 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
100 path_basename = os.path.basename
101 path_dirname = os.path.dirname
105 'prefix': path_basename(fn) + '.',
106 'dir': path_dirname(fn),
110 # In Python 2.x, json.dump expects a bytestream.
111 # In Python 3.x, it writes to a character stream
112 if sys.version_info < (3, 0):
120 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
125 if sys.platform == 'win32':
126 # Need to remove existing file on Windows, else os.rename raises
127 # WindowsError or FileExistsError.
132 os.rename(tf.name, fn)
141 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Return the first element under *node* matching xpath[@key=val]."""
    # Only simple attribute names and values are supported; ElementTree's
    # limited XPath dialect offers no quoting for anything fancier.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    return node.find("%s[@%s='%s']" % (xpath, key, val))
149 def find_xpath_attr(node, xpath, key, val):
150 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
151 # .//node does not match if a node is a direct child of . !
152 if isinstance(xpath, compat_str):
153 xpath = xpath.encode('ascii')
155 for f in node.findall(xpath):
156 if f.attrib.get(key) == val:
160 # On python2.6 the xml.etree.ElementTree.Element methods don't support
161 # the namespace parameter
164 def xpath_with_ns(path, ns_map):
165 components = [c.split(':') for c in path.split('/')]
169 replaced.append(c[0])
172 replaced.append('{%s}%s' % (ns_map[ns], tag))
173 return '/'.join(replaced)
176 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
177 if sys.version_info < (2, 7): # Crazy 2.6
178 xpath = xpath.encode('ascii')
181 if n is None or n.text is None:
182 if default is not NO_DEFAULT:
185 name = xpath if name is None else name
186 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document.

    Thin convenience wrapper around get_element_by_attribute.
    """
    attribute_name = "id"
    return get_element_by_attribute(attribute_name, id, html)
197 def get_element_by_attribute(attribute, value, html):
198 """Return the content of the tag with the specified attribute in the passed HTML document"""
200 m = re.search(r'''(?xs)
202 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
204 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
208 ''' % (re.escape(attribute), re.escape(value)), html)
212 res = m.group('content')
214 if res.startswith('"') or res.startswith("'"):
217 return unescapeHTML(res)
220 def clean_html(html):
221 """Clean an HTML snippet into a readable string"""
223 if html is None: # Convenience for sanitizing descriptions etc.
227 html = html.replace('\n', ' ')
228 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
229 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
231 html = re.sub('<.*?>', '', html)
232 # Replace html entities
233 html = unescapeHTML(html)
237 def sanitize_open(filename, open_mode):
238 """Try to open the given filename, and slightly tweak it if this fails.
240 Attempts to open the given filename. If this fails, it tries to change
241 the filename slightly, step by step, until it's either able to open it
242 or it fails and raises a final exception, like the standard open()
245 It returns the tuple (stream, definitive_file_name).
249 if sys.platform == 'win32':
251 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
252 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
253 stream = open(encodeFilename(filename), open_mode)
254 return (stream, filename)
255 except (IOError, OSError) as err:
256 if err.errno in (errno.EACCES,):
259 # In case of error, try to remove win32 forbidden chars
260 alt_filename = sanitize_path(filename)
261 if alt_filename == filename:
264 # An exception here should be caught in the caller
265 stream = open(encodeFilename(alt_filename), open_mode)
266 return (stream, alt_filename)
269 def timeconvert(timestr):
270 """Convert RFC 2822 defined time string into system timestamp"""
272 timetuple = email.utils.parsedate_tz(timestr)
273 if timetuple is not None:
274 timestamp = email.utils.mktime_tz(timetuple)
278 def sanitize_filename(s, restricted=False, is_id=False):
279 """Sanitizes a string so it could be used as part of a filename.
280 If restricted is set, use a stricter subset of allowed characters.
281 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
283 def replace_insane(char):
284 if char == '?' or ord(char) < 32 or ord(char) == 127:
287 return '' if restricted else '\''
289 return '_-' if restricted else ' -'
290 elif char in '\\/|*<>':
292 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
294 if restricted and ord(char) > 127:
299 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
300 result = ''.join(map(replace_insane, s))
302 while '__' in result:
303 result = result.replace('__', '_')
304 result = result.strip('_')
305 # Common case of "Foreign band name - English song title"
306 if restricted and result.startswith('-_'):
308 if result.startswith('-'):
309 result = '_' + result[len('-'):]
310 result = result.lstrip('.')
316 def sanitize_path(s):
317 """Sanitizes and normalizes path on Windows"""
318 if sys.platform != 'win32':
320 drive_or_unc, _ = os.path.splitdrive(s)
321 if sys.version_info < (2, 7) and not drive_or_unc:
322 drive_or_unc, _ = os.path.splitunc(s)
323 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
327 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
328 for path_part in norm_path]
330 sanitized_path.insert(0, drive_or_unc + os.path.sep)
331 return os.path.join(*sanitized_path)
334 def orderedSet(iterable):
335 """ Remove all duplicates from the input iterable """
343 def _htmlentity_transform(entity):
344 """Transforms an HTML entity to a character."""
345 # Known non-numeric HTML entity
346 if entity in compat_html_entities.name2codepoint:
347 return compat_chr(compat_html_entities.name2codepoint[entity])
349 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
351 numstr = mobj.group(1)
352 if numstr.startswith('x'):
354 numstr = '0%s' % numstr
357 return compat_chr(int(numstr, base))
359 # Unknown entity in name, return its literal representation
360 return ('&%s;' % entity)
366 assert type(s) == compat_str
369 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
372 def get_subprocess_encoding():
373 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
374 # For subprocess calls, encode with locale encoding
375 # Refer to http://stackoverflow.com/a/9951851/35070
376 encoding = preferredencoding()
378 encoding = sys.getfilesystemencoding()
384 def encodeFilename(s, for_subprocess=False):
386 @param s The name of the file
389 assert type(s) == compat_str
391 # Python 3 has a Unicode API
392 if sys.version_info >= (3, 0):
395 # Pass '' directly to use Unicode APIs on Windows 2000 and up
396 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
397 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
398 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
401 return s.encode(get_subprocess_encoding(), 'ignore')
404 def decodeFilename(b, for_subprocess=False):
406 if sys.version_info >= (3, 0):
409 if not isinstance(b, bytes):
412 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument so it can be passed to a subprocess."""
    if isinstance(s, compat_str):
        return encodeFilename(s, for_subprocess=True)
    # Legacy code path that still passes byte strings.
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), for_subprocess=True)
def decodeArgument(b):
    """Decode a subprocess argument back to text (see decodeFilename)."""
    return decodeFilename(b, for_subprocess=True)
428 def decodeOption(optval):
431 if isinstance(optval, bytes):
432 optval = optval.decode(preferredencoding())
434 assert isinstance(optval, compat_str)
438 def formatSeconds(secs):
440 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
442 return '%d:%02d' % (secs // 60, secs % 60)
447 def make_HTTPS_handler(params, **kwargs):
448 opts_no_check_certificate = params.get('nocheckcertificate', False)
449 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
450 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
451 if opts_no_check_certificate:
452 context.check_hostname = False
453 context.verify_mode = ssl.CERT_NONE
455 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
458 # (create_default_context present but HTTPSHandler has no context=)
461 if sys.version_info < (3, 2):
462 return YoutubeDLHTTPSHandler(params, **kwargs)
464 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
465 context.verify_mode = (ssl.CERT_NONE
466 if opts_no_check_certificate
467 else ssl.CERT_REQUIRED)
468 context.set_default_verify_paths()
469 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
472 def bug_reports_message():
473 if ytdl_is_updateable():
474 update_cmd = 'type youtube-dl -U to update'
476 update_cmd = 'see https://yt-dl.org/update on how to update'
477 msg = '; please report this issue on https://yt-dl.org/bug .'
478 msg += ' Make sure you are using the latest version; %s.' % update_cmd
479 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
483 class ExtractorError(Exception):
484 """Error during info extraction."""
486 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
487 """ tb, if given, is the original traceback (so that it can be printed out).
488 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
491 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
493 if video_id is not None:
494 msg = video_id + ': ' + msg
496 msg += ' (caused by %r)' % cause
498 msg += bug_reports_message()
499 super(ExtractorError, self).__init__(msg)
502 self.exc_info = sys.exc_info() # preserve original exception
504 self.video_id = video_id
506 def format_traceback(self):
507 if self.traceback is None:
509 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor knows how to handle."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal condition, not a youtube-dl bug.
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression did not match."""
524 class DownloadError(Exception):
525 """Download Error exception.
527 This exception may be thrown by FileDownloader objects if they are not
528 configured to continue on errors. They will contain the appropriate
532 def __init__(self, msg, exc_info=None):
533 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
534 super(DownloadError, self).__init__(msg)
535 self.exc_info = exc_info
538 class SameFileError(Exception):
539 """Same File exception.
541 This exception will be thrown by FileDownloader objects if they detect
542 multiple files would have to be downloaded to the same file on disk.
547 class PostProcessingError(Exception):
548 """Post Processing exception.
550 This exception may be raised by PostProcessor's .run() method to
551 indicate an error in the postprocessing task.
554 def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """Raised once the --max-downloads limit has been reached."""
563 class UnavailableVideoError(Exception):
564 """Unavailable Format exception.
566 This exception will be thrown when a video is requested
567 in a format that is not available for that video.
572 class ContentTooShortError(Exception):
573 """Content Too Short exception.
575 This exception may be raised by FileDownloader objects when a file they
576 download is too small for what the server announced first, indicating
577 the connection was probably interrupted.
580 def __init__(self, downloaded, expected):
582 self.downloaded = downloaded
583 self.expected = expected
586 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
587 hc = http_class(*args, **kwargs)
588 source_address = ydl_handler._params.get('source_address')
589 if source_address is not None:
590 sa = (source_address, 0)
591 if hasattr(hc, 'source_address'): # Python 2.7+
592 hc.source_address = sa
594 def _hc_connect(self, *args, **kwargs):
595 sock = compat_socket_create_connection(
596 (self.host, self.port), self.timeout, sa)
598 self.sock = ssl.wrap_socket(
599 sock, self.key_file, self.cert_file,
600 ssl_version=ssl.PROTOCOL_TLSv1)
603 hc.connect = functools.partial(_hc_connect, hc)
608 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
609 """Handler for HTTP requests and responses.
611 This class, when installed with an OpenerDirector, automatically adds
612 the standard headers to every HTTP request and handles gzipped and
613 deflated responses from web servers. If compression is to be avoided in
614 a particular request, the original request in the program code only has
615 to include the HTTP header "Youtubedl-No-Compression", which will be
616 removed before making the real request.
618 Part of this code was copied from:
620 http://techknack.net/python-urllib2-handlers/
622 Andrew Rowls, the author of that code, agreed to release it to the
def __init__(self, params, *args, **kwargs):
    """Remember the youtube-dl params dict and initialize the base HTTPHandler."""
    self._params = params
    compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
630 def http_open(self, req):
631 return self.do_open(functools.partial(
632 _create_http_connection, self, compat_http_client.HTTPConnection, False),
638 return zlib.decompress(data, -zlib.MAX_WBITS)
640 return zlib.decompress(data)
643 def addinfourl_wrapper(stream, headers, url, code):
644 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
645 return compat_urllib_request.addinfourl(stream, headers, url, code)
646 ret = compat_urllib_request.addinfourl(stream, headers, url)
650 def http_request(self, req):
651 for h, v in std_headers.items():
652 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
653 # The dict keys are capitalized because of this bug by urllib
654 if h.capitalize() not in req.headers:
656 if 'Youtubedl-no-compression' in req.headers:
657 if 'Accept-encoding' in req.headers:
658 del req.headers['Accept-encoding']
659 del req.headers['Youtubedl-no-compression']
661 if sys.version_info < (2, 7) and '#' in req.get_full_url():
662 # Python 2.6 is brain-dead when it comes to fragments
663 req._Request__original = req._Request__original.partition('#')[0]
664 req._Request__r_type = req._Request__r_type.partition('#')[0]
668 def http_response(self, req, resp):
671 if resp.headers.get('Content-encoding', '') == 'gzip':
672 content = resp.read()
673 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
675 uncompressed = io.BytesIO(gz.read())
676 except IOError as original_ioerror:
677 # There may be junk add the end of the file
678 # See http://stackoverflow.com/q/4928560/35070 for details
679 for i in range(1, 1024):
681 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
682 uncompressed = io.BytesIO(gz.read())
687 raise original_ioerror
688 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
689 resp.msg = old_resp.msg
691 if resp.headers.get('Content-encoding', '') == 'deflate':
692 gz = io.BytesIO(self.deflate(resp.read()))
693 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
694 resp.msg = old_resp.msg
697 https_request = http_request
698 https_response = http_response
701 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
def __init__(self, params, https_conn_class=None, *args, **kwargs):
    """Initialize the base HTTPSHandler and remember params plus the
    HTTPS connection class to instantiate (defaults to the compat one)."""
    compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
    self._params = params
    self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
707 def https_open(self, req):
709 if hasattr(self, '_context'): # python > 2.6
710 kwargs['context'] = self._context
711 if hasattr(self, '_check_hostname'): # python 3.x
712 kwargs['check_hostname'] = self._check_hostname
713 return self.do_open(functools.partial(
714 _create_http_connection, self, self._https_conn_class, True),
718 def parse_iso8601(date_str, delimiter='T', timezone=None):
719 """ Return a UNIX timestamp from the given date """
726 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
729 timezone = datetime.timedelta()
731 date_str = date_str[:-len(m.group(0))]
732 if not m.group('sign'):
733 timezone = datetime.timedelta()
735 sign = 1 if m.group('sign') == '+' else -1
736 timezone = datetime.timedelta(
737 hours=sign * int(m.group('hours')),
738 minutes=sign * int(m.group('minutes')))
739 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
740 dt = datetime.datetime.strptime(date_str, date_format) - timezone
741 return calendar.timegm(dt.timetuple())
744 def unified_strdate(date_str, day_first=True):
745 """Return a string with the date in the format YYYYMMDD"""
751 date_str = date_str.replace(',', ' ')
752 # %z (UTC offset) is only supported in python>=3.2
753 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
754 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
755 # Remove AM/PM + timezone
756 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
758 format_expressions = [
763 '%b %dst %Y %I:%M%p',
764 '%b %dnd %Y %I:%M%p',
765 '%b %dth %Y %I:%M%p',
771 '%Y-%m-%d %H:%M:%S.%f',
774 '%Y-%m-%dT%H:%M:%SZ',
775 '%Y-%m-%dT%H:%M:%S.%fZ',
776 '%Y-%m-%dT%H:%M:%S.%f0Z',
778 '%Y-%m-%dT%H:%M:%S.%f',
782 format_expressions.extend([
790 format_expressions.extend([
797 for expression in format_expressions:
799 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
802 if upload_date is None:
803 timetuple = email.utils.parsedate_tz(date_str)
805 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
809 def determine_ext(url, default_ext='unknown_video'):
812 guess = url.partition('?')[0].rpartition('.')[2]
813 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name from the media filename, language code
    and subtitle format, replacing the media extension."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
823 def date_from_str(date_str):
825 Return a datetime object from a string in the format YYYYMMDD or
826 (now|today)[+-][0-9](day|week|month|year)(s)?"""
827 today = datetime.date.today()
828 if date_str in ('now', 'today'):
830 if date_str == 'yesterday':
831 return today - datetime.timedelta(days=1)
832 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
833 if match is not None:
834 sign = match.group('sign')
835 time = int(match.group('time'))
838 unit = match.group('unit')
839 # A bad aproximation?
847 delta = datetime.timedelta(**{unit: time})
849 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
852 def hyphenate_date(date_str):
854 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
855 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
856 if match is not None:
857 return '-'.join(match.groups())
862 class DateRange(object):
863 """Represents a time interval between two dates"""
865 def __init__(self, start=None, end=None):
866 """start and end must be strings in the format accepted by date"""
867 if start is not None:
868 self.start = date_from_str(start)
870 self.start = datetime.datetime.min.date()
872 self.end = date_from_str(end)
874 self.end = datetime.datetime.max.date()
875 if self.start > self.end:
876 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
880 """Returns a range that only contains the given day"""
def __contains__(self, date):
    """Check whether *date* (a datetime.date, or a string parseable by
    date_from_str) lies within [self.start, self.end]."""
    candidate = date if isinstance(date, datetime.date) else date_from_str(date)
    return self.start <= candidate <= self.end
890 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
894 """ Returns the platform name as a compat_str """
895 res = platform.platform()
896 if isinstance(res, bytes):
897 res = res.decode(preferredencoding())
899 assert isinstance(res, compat_str)
903 def _windows_write_string(s, out):
904 """ Returns True if the string was written using special methods,
905 False if it has yet to be written out."""
906 # Adapted from http://stackoverflow.com/a/3259271/35070
909 import ctypes.wintypes
917 fileno = out.fileno()
918 except AttributeError:
919 # If the output stream doesn't have a fileno, it's virtual
921 except io.UnsupportedOperation:
922 # Some strange Windows pseudo files?
924 if fileno not in WIN_OUTPUT_IDS:
927 GetStdHandle = ctypes.WINFUNCTYPE(
928 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
929 (b"GetStdHandle", ctypes.windll.kernel32))
930 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
932 WriteConsoleW = ctypes.WINFUNCTYPE(
933 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
934 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
935 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
936 written = ctypes.wintypes.DWORD(0)
938 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
939 FILE_TYPE_CHAR = 0x0002
940 FILE_TYPE_REMOTE = 0x8000
941 GetConsoleMode = ctypes.WINFUNCTYPE(
942 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
943 ctypes.POINTER(ctypes.wintypes.DWORD))(
944 (b"GetConsoleMode", ctypes.windll.kernel32))
945 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
947 def not_a_console(handle):
948 if handle == INVALID_HANDLE_VALUE or handle is None:
950 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
951 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
956 def next_nonbmp_pos(s):
958 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
959 except StopIteration:
963 count = min(next_nonbmp_pos(s), 1024)
966 h, s, count if count else 2, ctypes.byref(written), None)
968 raise OSError('Failed to write string')
969 if not count: # We just wrote a non-BMP character
970 assert written.value == 2
973 assert written.value > 0
974 s = s[written.value:]
978 def write_string(s, out=None, encoding=None):
981 assert type(s) == compat_str
983 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
984 if _windows_write_string(s, out):
987 if ('b' in getattr(out, 'mode', '') or
988 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
989 byt = s.encode(encoding or preferredencoding(), 'ignore')
991 elif hasattr(out, 'buffer'):
992 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
993 byt = s.encode(enc, 'ignore')
994 out.buffer.write(byt)
1000 def bytes_to_intlist(bs):
1003 if isinstance(bs[0], int): # Python 3
1006 return [ord(c) for c in bs]
1009 def intlist_to_bytes(xs):
1012 return struct_pack('%dB' % len(xs), *xs)
1015 # Cross-platform file locking
1016 if sys.platform == 'win32':
1017 import ctypes.wintypes
1020 class OVERLAPPED(ctypes.Structure):
1022 ('Internal', ctypes.wintypes.LPVOID),
1023 ('InternalHigh', ctypes.wintypes.LPVOID),
1024 ('Offset', ctypes.wintypes.DWORD),
1025 ('OffsetHigh', ctypes.wintypes.DWORD),
1026 ('hEvent', ctypes.wintypes.HANDLE),
1029 kernel32 = ctypes.windll.kernel32
1030 LockFileEx = kernel32.LockFileEx
1031 LockFileEx.argtypes = [
1032 ctypes.wintypes.HANDLE, # hFile
1033 ctypes.wintypes.DWORD, # dwFlags
1034 ctypes.wintypes.DWORD, # dwReserved
1035 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1036 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1037 ctypes.POINTER(OVERLAPPED) # Overlapped
1039 LockFileEx.restype = ctypes.wintypes.BOOL
1040 UnlockFileEx = kernel32.UnlockFileEx
1041 UnlockFileEx.argtypes = [
1042 ctypes.wintypes.HANDLE, # hFile
1043 ctypes.wintypes.DWORD, # dwReserved
1044 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1045 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1046 ctypes.POINTER(OVERLAPPED) # Overlapped
1048 UnlockFileEx.restype = ctypes.wintypes.BOOL
1049 whole_low = 0xffffffff
1050 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    # Lock the whole file via the Win32 LockFileEx API.
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Stash the pointer on the file object so the OVERLAPPED structure
    # stays alive until the matching _unlock_file call.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    # Release the byte range locked by _lock_file via UnlockFileEx,
    # reusing the OVERLAPPED pointer stored on the file object.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    if not UnlockFileEx(handle, 0,
                        whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1073 def _lock_file(f, exclusive):
1074 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1076 def _unlock_file(f):
1077 fcntl.flock(f, fcntl.LOCK_UN)
1080 class locked_file(object):
1081 def __init__(self, filename, mode, encoding=None):
1082 assert mode in ['r', 'a', 'w']
1083 self.f = io.open(filename, mode, encoding=encoding)
1086 def __enter__(self):
1087 exclusive = self.mode != 'r'
1089 _lock_file(self.f, exclusive)
1095 def __exit__(self, etype, value, traceback):
1097 _unlock_file(self.f)
def write(self, *args):
    # Pass-through to the underlying file object opened by this wrapper.
    return self.f.write(*args)
def read(self, *args):
    # Pass-through read on the underlying file object.
    return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to utf-8 when the
    interpreter cannot determine one."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
1116 def shell_quote(args):
1118 encoding = get_filesystem_encoding()
1120 if isinstance(a, bytes):
1121 # We may get a filename encoded with 'encodeFilename'
1122 a = a.decode(encoding)
1123 quoted_args.append(pipes.quote(a))
1124 return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # JSON-encode the payload and stash it in the fragment, where it does
    # not affect the request sent to the server.
    payload = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
1135 def unsmuggle_url(smug_url, default=None):
1136 if '#__youtubedl_smuggle' not in smug_url:
1137 return smug_url, default
1138 url, _, sdata = smug_url.rpartition('#')
1139 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1140 data = json.loads(jsond)
1144 def format_bytes(bytes):
1147 if type(bytes) is str:
1148 bytes = float(bytes)
1152 exponent = int(math.log(bytes, 1024.0))
1153 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1154 converted = float(bytes) / float(1024 ** exponent)
1155 return '%.2f%s' % (converted, suffix)
1158 def parse_filesize(s):
1162 # The lower-case forms are of course incorrect and inofficial,
1163 # but we support those too
1201 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1203 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1207 num_str = m.group('num').replace(',', '.')
1208 mult = _UNIT_TABLE[m.group('unit')]
1209 return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name,
    or None if the name is not a full English month name. """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month name: report "not found" instead of raising,
        # consistent with the other *_or_none style helpers in this file.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    three-letter abbreviation, or None if it is not recognized. """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Unknown abbreviation: mirror month_by_name's None-on-miss contract.
        return None
1231 def fix_xml_ampersands(xml_str):
1232 """Replace all the '&' by '&' in XML"""
1234 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1239 def setproctitle(title):
1240 assert isinstance(title, compat_str)
1242 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1245 title_bytes = title.encode('utf-8')
1246 buf = ctypes.create_string_buffer(len(title_bytes))
1247 buf.value = title_bytes
1249 libc.prctl(15, buf, 0, 0, 0)
1250 except AttributeError:
1251 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix *start* removed, or s unchanged when the
    prefix is absent."""
    if s.startswith(start):
        return s[len(start):]
    # Explicitly return the original string so callers (e.g. sanitize_path)
    # always receive a str, never None.
    return s
def remove_end(s, end):
    """Return s with the suffix *end* removed, or s unchanged when the
    suffix is absent or empty."""
    # Guard against end == '': s.endswith('') is always True and
    # s[:-0] would wrongly yield the empty string.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path segment of *url* (query and fragment excluded
    by urlparse; surrounding slashes stripped)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
1271 class HEADRequest(compat_urllib_request.Request):
1272 def get_method(self):
1276 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1279 v = getattr(v, get_attr, None)
1282 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce v to compat_str, passing *default* through when v is None."""
    if v is None:
        return default
    return compat_str(v)
1289 def str_to_int(int_str):
1290 """ A more relaxed version of int_or_none """
1293 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, multiplied by invscale and divided by scale;
    return *default* when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1301 def parse_duration(s):
1302 if not isinstance(s, compat_basestring):
1310 (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
1311 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1313 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
1316 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1317 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1319 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1321 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1326 if m.group('only_mins'):
1327 return float_or_none(m.group('only_mins'), invscale=60)
1328 if m.group('only_hours'):
1329 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1331 res += int(m.group('secs'))
1332 if m.group('mins_reversed'):
1333 res += int(m.group('mins_reversed')) * 60
1335 res += int(m.group('mins')) * 60
1336 if m.group('hours'):
1337 res += int(m.group('hours')) * 60 * 60
1338 if m.group('hours_reversed'):
1339 res += int(m.group('hours_reversed')) * 60 * 60
1341 res += int(m.group('days')) * 24 * 60 * 60
1343 res += float(m.group('ms'))
1347 def prepend_extension(filename, ext, expected_real_ext=None):
1348 name, real_ext = os.path.splitext(filename)
1350 '{0}.{1}{2}'.format(name, ext, real_ext)
1351 if not expected_real_ext or real_ext[1:] == expected_real_ext
1352 else '{0}.{1}'.format(filename, ext))
1355 def replace_extension(filename, ext, expected_real_ext=None):
1356 name, real_ext = os.path.splitext(filename)
1357 return '{0}.{1}'.format(
1358 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1362 def check_executable(exe, args=[]):
1363 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1364 args can be a list of arguments for a short output (like -version) """
1366 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1372 def get_exe_version(exe, args=['--version'],
1373 version_re=None, unrecognized='present'):
1374 """ Returns the version of the specified executable,
1375 or False if the executable is not present """
1377 out, _ = subprocess.Popen(
1378 [encodeArgument(exe)] + args,
1379 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1382 if isinstance(out, bytes): # Python 2.x
1383 out = out.decode('ascii', 'ignore')
1384 return detect_exe_version(out, version_re, unrecognized)
1387 def detect_exe_version(output, version_re=None, unrecognized='present'):
1388 assert isinstance(output, compat_str)
1389 if version_re is None:
1390 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1391 m = re.search(version_re, output)
1398 class PagedList(object):
1400 # This is only useful for tests
1401 return len(self.getslice())
1404 class OnDemandPagedList(PagedList):
1405 def __init__(self, pagefunc, pagesize):
1406 self._pagefunc = pagefunc
1407 self._pagesize = pagesize
1409 def getslice(self, start=0, end=None):
1411 for pagenum in itertools.count(start // self._pagesize):
1412 firstid = pagenum * self._pagesize
1413 nextfirstid = pagenum * self._pagesize + self._pagesize
1414 if start >= nextfirstid:
1417 page_results = list(self._pagefunc(pagenum))
1420 start % self._pagesize
1421 if firstid <= start < nextfirstid
1425 ((end - 1) % self._pagesize) + 1
1426 if (end is not None and firstid <= end <= nextfirstid)
1429 if startv != 0 or endv is not None:
1430 page_results = page_results[startv:endv]
1431 res.extend(page_results)
1433 # A little optimization - if current page is not "full", ie. does
1434 # not contain page_size videos then we can assume that this page
1435 # is the last one - there are no more ids on further pages -
1436 # i.e. no need to query again.
1437 if len(page_results) + startv < self._pagesize:
1440 # If we got the whole page, but the next page is not interesting,
1441 # break out early as well
1442 if end == nextfirstid:
1447 class InAdvancePagedList(PagedList):
1448 def __init__(self, pagefunc, pagecount, pagesize):
1449 self._pagefunc = pagefunc
1450 self._pagecount = pagecount
1451 self._pagesize = pagesize
1453 def getslice(self, start=0, end=None):
1455 start_page = start // self._pagesize
1457 self._pagecount if end is None else (end // self._pagesize + 1))
1458 skip_elems = start - start_page * self._pagesize
1459 only_more = None if end is None else end - start
1460 for pagenum in range(start_page, end_page):
1461 page = list(self._pagefunc(pagenum))
1463 page = page[skip_elems:]
1465 if only_more is not None:
1466 if len(page) < only_more:
1467 only_more -= len(page)
1469 page = page[:only_more]
1476 def uppercase_escape(s):
1477 unicode_escape = codecs.getdecoder('unicode_escape')
1479 r'\\U[0-9a-fA-F]{8}',
1480 lambda m: unicode_escape(m.group(0))[0],
1484 def lowercase_escape(s):
1485 unicode_escape = codecs.getdecoder('unicode_escape')
1487 r'\\u[0-9a-fA-F]{4}',
1488 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() cannot handle unicode objects directly,
    # so pre-encode them to UTF-8 bytes first.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_characters = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_characters)
1499 def escape_url(url):
1500 """Escape URL as suggested by RFC 3986"""
1501 url_parsed = compat_urllib_parse_urlparse(url)
1502 return url_parsed._replace(
1503 path=escape_rfc3986(url_parsed.path),
1504 params=escape_rfc3986(url_parsed.params),
1505 query=escape_rfc3986(url_parsed.query),
1506 fragment=escape_rfc3986(url_parsed.fragment)
1510 struct.pack('!I', 0)
1512 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1513 def struct_pack(spec, *args):
1514 if isinstance(spec, compat_str):
1515 spec = spec.encode('ascii')
1516 return struct.pack(spec, *args)
1518 def struct_unpack(spec, *args):
1519 if isinstance(spec, compat_str):
1520 spec = spec.encode('ascii')
1521 return struct.unpack(spec, *args)
1523 struct_pack = struct.pack
1524 struct_unpack = struct.unpack
1527 def read_batch_urls(batch_fd):
1529 if not isinstance(url, compat_str):
1530 url = url.decode('utf-8', 'replace')
1531 BOM_UTF8 = '\xef\xbb\xbf'
1532 if url.startswith(BOM_UTF8):
1533 url = url[len(BOM_UTF8):]
1535 if url.startswith(('#', ';', ']')):
1539 with contextlib.closing(batch_fd) as fd:
1540 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as an ASCII byte string,
    suitable for use as the body of an HTTP POST request."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1548 etree_iter = xml.etree.ElementTree.Element.iter
1549 except AttributeError: # Python <=2.6
1550 etree_iter = lambda n: n.findall('.//*')
1554 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1555 def doctype(self, name, pubid, system):
1556 pass # Ignore doctypes
1558 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1559 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1560 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1561 # Fix up XML parser in Python 2.x
1562 if sys.version_info < (3, 0):
1563 for n in etree_iter(tree):
1564 if n.text is not None:
1565 if not isinstance(n.text, compat_str):
1566 n.text = n.text.decode('utf-8')
1579 def parse_age_limit(s):
1582 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1583 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1586 def strip_jsonp(code):
1588 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1591 def js_to_json(code):
1594 if v in ('true', 'false', 'null'):
1596 if v.startswith('"'):
1598 if v.startswith("'"):
1600 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1607 res = re.sub(r'''(?x)
1608 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1609 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1610 [a-zA-Z_][.a-zA-Z_0-9]*
1612 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1616 def qualities(quality_ids):
1617 """ Get a numeric quality value out of a list of possible values """
1620 return quality_ids.index(qid)
1626 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1629 def limit_length(s, length):
1630 """ Add ellipses to overly long strings """
1635 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on dots and dashes into a tuple of ints,
    so versions can be compared numerically rather than lexically."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1643 def is_outdated_version(version, limit, assume_new=True):
1645 return not assume_new
1647 return version_tuple(version) < version_tuple(limit)
1649 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # -U only works when running from the distributed zip bundle
    # (module loaded by zipimporter) or from a frozen executable.
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    """Return a short shell-style string representation of a subprocess
    command (each argument quoted, joined by spaces)."""
    quoted_args = (shlex_quote(arg) for arg in args)
    return ' '.join(quoted_args)
1664 def mimetype2ext(mt):
1665 _, _, res = mt.rpartition('/')
1669 'x-mp4-fragmented': 'mp4',
1674 def urlhandle_detect_ext(url_handle):
1677 getheader = lambda h: url_handle.headers[h]
1678 except AttributeError: # Python < 3
1679 getheader = url_handle.info().getheader
1681 cd = getheader('Content-Disposition')
1683 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1685 e = determine_ext(m.group('filename'), default_ext=None)
1689 return mimetype2ext(getheader('Content-Type'))
1692 def age_restricted(content_limit, age_limit):
1693 """ Returns True iff the content should be blocked """
1695 if age_limit is None: # No limit set
1697 if content_limit is None:
1698 return False # Content available for everyone
1699 return age_limit < content_limit
1702 def is_html(first_bytes):
1703 """ Detect whether a file contains HTML by examining its first bytes. """
1706 (b'\xef\xbb\xbf', 'utf-8'),
1707 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1708 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1709 (b'\xff\xfe', 'utf-16-le'),
1710 (b'\xfe\xff', 'utf-16-be'),
1712 for bom, enc in BOMS:
1713 if first_bytes.startswith(bom):
1714 s = first_bytes[len(bom):].decode(enc, 'replace')
1717 s = first_bytes.decode('utf-8', 'replace')
1719 return re.match(r'^\s*<', s)
1722 def determine_protocol(info_dict):
1723 protocol = info_dict.get('protocol')
1724 if protocol is not None:
1727 url = info_dict['url']
1728 if url.startswith('rtmp'):
1730 elif url.startswith('mms'):
1732 elif url.startswith('rtsp'):
1735 ext = determine_ext(url)
1741 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column = longest cell in it (stringified).
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*rows)]
    # Left-align every column but the last, padding to width + 1;
    # the ' '.join adds one more separating space between columns.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
1752 def _match_one(filter_part, dct):
1753 COMPARISON_OPERATORS = {
1761 operator_rex = re.compile(r'''(?x)\s*
1763 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1765 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1766 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1769 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1770 m = operator_rex.search(filter_part)
1772 op = COMPARISON_OPERATORS[m.group('op')]
1773 if m.group('strval') is not None:
1774 if m.group('op') not in ('=', '!='):
1776 'Operator %s does not support string values!' % m.group('op'))
1777 comparison_value = m.group('strval')
1780 comparison_value = int(m.group('intval'))
1782 comparison_value = parse_filesize(m.group('intval'))
1783 if comparison_value is None:
1784 comparison_value = parse_filesize(m.group('intval') + 'B')
1785 if comparison_value is None:
1787 'Invalid integer value %r in filter part %r' % (
1788 m.group('intval'), filter_part))
1789 actual_value = dct.get(m.group('key'))
1790 if actual_value is None:
1791 return m.group('none_inclusive')
1792 return op(actual_value, comparison_value)
1795 '': lambda v: v is not None,
1796 '!': lambda v: v is None,
1798 operator_rex = re.compile(r'''(?x)\s*
1799 (?P<op>%s)\s*(?P<key>[a-z_]+)
1801 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1802 m = operator_rex.search(filter_part)
1804 op = UNARY_OPERATORS[m.group('op')]
1805 actual_value = dct.get(m.group('key'))
1806 return op(actual_value)
1808 raise ValueError('Invalid filter part %r' % filter_part)
1811 def match_str(filter_str, dct):
1812 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1815 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
1818 def match_filter_func(filter_str):
1819 def _match_func(info_dict):
1820 if match_str(filter_str, info_dict):
1823 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1824 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1828 def parse_dfxp_time_expr(time_expr):
1832 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
1834 return float(mobj.group('time_offset'))
1836 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
1838 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode: HH:MM:SS,mmm."""
    whole = int(seconds)
    hours, remainder = divmod(whole, 3600)
    minutes, secs = divmod(remainder, 60)
    millis = int((seconds - whole) * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
1845 def dfxp2srt(dfxp_data):
1846 _x = functools.partial(xpath_with_ns, ns_map={
1847 'ttml': 'http://www.w3.org/ns/ttml',
1848 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
1851 def parse_node(node):
1852 str_or_empty = functools.partial(str_or_none, default='')
1854 out = str_or_empty(node.text)
1857 if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
1858 out += '\n' + str_or_empty(child.tail)
1859 elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
1860 out += str_or_empty(parse_node(child))
1862 out += str_or_empty(xml.etree.ElementTree.tostring(child))
1866 dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
1868 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
1871 raise ValueError('Invalid dfxp/TTML subtitle')
1873 for para, index in zip(paras, itertools.count(1)):
1874 begin_time = parse_dfxp_time_expr(para.attrib['begin'])
1875 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
1877 end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
1878 out.append('%d\n%s --> %s\n%s\n\n' % (
1880 srt_subtitles_timecode(begin_time),
1881 srt_subtitles_timecode(end_time),
1887 class ISO639Utils(object):
1888 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
def short2long(cls, code):
    """Convert language code from ISO 639-1 to ISO 639-2/T"""
    # Only the first two characters identify the ISO 639-1 code;
    # anything after (e.g. a region suffix) is ignored.
    prefix = code[:2]
    return cls._lang_map.get(prefix)
2082 def long2short(cls, code):
2083 """Convert language code from ISO 639-2/T to ISO 639-1"""
2084 for short_name, long_name in cls._lang_map.items():
2085 if long_name == code:
2089 class ISO3166Utils(object):
2090 # From http://data.okfn.org/data/core/country-list
2092 'AF': 'Afghanistan',
2093 'AX': 'Ã…land Islands',
2096 'AS': 'American Samoa',
2101 'AG': 'Antigua and Barbuda',
2118 'BO': 'Bolivia, Plurinational State of',
2119 'BQ': 'Bonaire, Sint Eustatius and Saba',
2120 'BA': 'Bosnia and Herzegovina',
2122 'BV': 'Bouvet Island',
2124 'IO': 'British Indian Ocean Territory',
2125 'BN': 'Brunei Darussalam',
2127 'BF': 'Burkina Faso',
2133 'KY': 'Cayman Islands',
2134 'CF': 'Central African Republic',
2138 'CX': 'Christmas Island',
2139 'CC': 'Cocos (Keeling) Islands',
2143 'CD': 'Congo, the Democratic Republic of the',
2144 'CK': 'Cook Islands',
2146 'CI': 'Côte d\'Ivoire',
2151 'CZ': 'Czech Republic',
2155 'DO': 'Dominican Republic',
2158 'SV': 'El Salvador',
2159 'GQ': 'Equatorial Guinea',
2163 'FK': 'Falkland Islands (Malvinas)',
2164 'FO': 'Faroe Islands',
2168 'GF': 'French Guiana',
2169 'PF': 'French Polynesia',
2170 'TF': 'French Southern Territories',
2185 'GW': 'Guinea-Bissau',
2188 'HM': 'Heard Island and McDonald Islands',
2189 'VA': 'Holy See (Vatican City State)',
2196 'IR': 'Iran, Islamic Republic of',
2199 'IM': 'Isle of Man',
2209 'KP': 'Korea, Democratic People\'s Republic of',
2210 'KR': 'Korea, Republic of',
2213 'LA': 'Lao People\'s Democratic Republic',
2219 'LI': 'Liechtenstein',
2223 'MK': 'Macedonia, the Former Yugoslav Republic of',
2230 'MH': 'Marshall Islands',
2236 'FM': 'Micronesia, Federated States of',
2237 'MD': 'Moldova, Republic of',
2248 'NL': 'Netherlands',
2249 'NC': 'New Caledonia',
2250 'NZ': 'New Zealand',
2255 'NF': 'Norfolk Island',
2256 'MP': 'Northern Mariana Islands',
2261 'PS': 'Palestine, State of',
2263 'PG': 'Papua New Guinea',
2266 'PH': 'Philippines',
2270 'PR': 'Puerto Rico',
2274 'RU': 'Russian Federation',
2276 'BL': 'Saint Barthélemy',
2277 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2278 'KN': 'Saint Kitts and Nevis',
2279 'LC': 'Saint Lucia',
2280 'MF': 'Saint Martin (French part)',
2281 'PM': 'Saint Pierre and Miquelon',
2282 'VC': 'Saint Vincent and the Grenadines',
2285 'ST': 'Sao Tome and Principe',
2286 'SA': 'Saudi Arabia',
2290 'SL': 'Sierra Leone',
2292 'SX': 'Sint Maarten (Dutch part)',
2295 'SB': 'Solomon Islands',
2297 'ZA': 'South Africa',
2298 'GS': 'South Georgia and the South Sandwich Islands',
2299 'SS': 'South Sudan',
2304 'SJ': 'Svalbard and Jan Mayen',
2307 'CH': 'Switzerland',
2308 'SY': 'Syrian Arab Republic',
2309 'TW': 'Taiwan, Province of China',
2311 'TZ': 'Tanzania, United Republic of',
2313 'TL': 'Timor-Leste',
2317 'TT': 'Trinidad and Tobago',
2320 'TM': 'Turkmenistan',
2321 'TC': 'Turks and Caicos Islands',
2325 'AE': 'United Arab Emirates',
2326 'GB': 'United Kingdom',
2327 'US': 'United States',
2328 'UM': 'United States Minor Outlying Islands',
2332 'VE': 'Venezuela, Bolivarian Republic of',
2334 'VG': 'Virgin Islands, British',
2335 'VI': 'Virgin Islands, U.S.',
2336 'WF': 'Wallis and Futuna',
2337 'EH': 'Western Sahara',
def short2full(cls, code):
    """Convert an ISO 3166-2 country code to the corresponding full name"""
    # The map is keyed by uppercase codes; normalize the lookup key.
    return cls._country_map.get(code.upper())
2349 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2350 def __init__(self, proxies=None):
2351 # Set default handlers
2352 for type in ('http', 'https'):
2353 setattr(self, '%s_open' % type,
2354 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2355 meth(r, proxy, type))
2356 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2358 def proxy_open(self, req, proxy, type):
2359 req_proxy = req.headers.get('Ytdl-request-proxy')
2360 if req_proxy is not None:
2362 del req.headers['Ytdl-request-proxy']
2364 if proxy == '__noproxy__':
2365 return None # No Proxy
2366 return compat_urllib_request.ProxyHandler.proxy_open(
2367 self, req, proxy, type)