2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
42 compat_socket_create_connection,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
53 # This is not clearly defined otherwise
54 compiled_regex_type = type(re.compile(''))
# NOTE(review): the `std_headers = {` opener (and closing brace) are elided
# from this view of the file -- the entries below are the default,
# browser-like HTTP headers attached to outgoing requests.
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
# English month names, in calendar order, for locale-independent date parsing.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December']
72 def preferredencoding():
73 """Get preferred encoding.
75 Returns the best encoding scheme for the system, based on
76 locale.getpreferredencoding() and some further tweaks.
# NOTE(review): the remainder of this function (the sanity check on the
# returned encoding and the final return) is elided from this view --
# confirm against the full file before editing.
79 pref = locale.getpreferredencoding()
87 def write_json_file(obj, fn):
88 """ Encode obj as JSON and write it to fn, atomically if possible """
90 fn = encodeFilename(fn)
91 if sys.version_info < (3, 0) and sys.platform != 'win32':
92 encoding = get_filesystem_encoding()
93 # os.path.basename returns a bytes object, but NamedTemporaryFile
94 # will fail if the filename contains non ascii characters unless we
95 # use a unicode object
96 path_basename = lambda f: os.path.basename(fn).decode(encoding)
97 # the same for os.path.dirname
98 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
# On Python 3 (or win32 Python 2) the plain os.path helpers are fine.
100 path_basename = os.path.basename
101 path_dirname = os.path.dirname
# Temp file is created in the same directory as fn so os.rename stays
# on one filesystem (atomic replace).
105 'prefix': path_basename(fn) + '.',
106 'dir': path_dirname(fn),
110 # In Python 2.x, json.dump expects a bytestream.
111 # In Python 3.x, it writes to a character stream
112 if sys.version_info < (3, 0):
120 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
# NOTE(review): the json.dump / cleanup-on-error lines are elided from
# this view; the visible tail only shows the final rename.
125 if sys.platform == 'win32':
126 # Need to remove existing file on Windows, else os.rename raises
127 # WindowsError or FileExistsError.
132 os.rename(tf.name, fn)
# Two implementations of find_xpath_attr: the 2.7+ one uses native xpath
# attribute predicates; the 2.6 fallback scans findall() results manually.
141 if sys.version_info >= (2, 7):
142 def find_xpath_attr(node, xpath, key, val=None):
143 """ Find the xpath xpath[@key=val] """
144 assert re.match(r'^[a-zA-Z-]+$', key)
# Values are restricted to a safe character set because they are
# interpolated directly into the xpath expression below.
146 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
147 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
148 return node.find(expr)
# NOTE(review): the `else:` introducing the 2.6 fallback is elided here.
150 def find_xpath_attr(node, xpath, key, val=None):
151 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
152 # .//node does not match if a node is a direct child of . !
153 if isinstance(xpath, compat_str):
154 xpath = xpath.encode('ascii')
156 for f in node.findall(xpath):
157 if key not in f.attrib:
# (elided: `continue`) -- skip elements without the attribute
159 if val is None or f.attrib.get(key) == val:
# (elided: `return f` and the final `return None`)
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    """Expand namespace prefixes in an xpath expression.

    Each path component of the form ``prefix:tag`` is rewritten to
    ElementTree's ``{uri}tag`` form using *ns_map* (prefix -> URI);
    components without a prefix are kept as-is.
    """
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            # No namespace prefix on this component.
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element matching *xpath* under *node*.

    If no element (or no text) is found: return *default* when one was
    supplied, raise ExtractorError when *fatal* is true, otherwise None.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        # 2.6 mishandles unicode xpath expressions; force bytes.
        xpath = xpath.encode('ascii')

    n = node.find(xpath)
    if n is None or n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n.text
def get_element_by_id(id, html):
    """Return the inner content of the element whose id attribute is *id*."""
    # Thin convenience wrapper over the generic attribute search.
    return get_element_by_attribute("id", id, html)
200 def get_element_by_attribute(attribute, value, html):
201 """Return the content of the tag with the specified attribute in the passed HTML document"""
# NOTE(review): parts of the search regex (the opening tag pattern and the
# capture of the element content) are elided from this view; only the
# attribute-list alternations are visible.
203 m = re.search(r'''(?xs)
205 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
207 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
211 ''' % (re.escape(attribute), re.escape(value)), html)
# (elided: `if not m: return None`)
215 res = m.group('content')
# Strip surrounding quotes from unquoted-attribute style matches.
217 if res.startswith('"') or res.startswith("'"):
220 return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable plain-text string.

    Returns None unchanged (convenience for sanitizing optional
    descriptions); otherwise converts <br>/<p> boundaries to newlines,
    strips all remaining tags, unescapes entities and trims whitespace.
    """
    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />: collapse literal newlines, then turn <br> and
    # paragraph boundaries into real newlines.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
240 def sanitize_open(filename, open_mode):
241 """Try to open the given filename, and slightly tweak it if this fails.
243 Attempts to open the given filename. If this fails, it tries to change
244 the filename slightly, step by step, until it's either able to open it
245 or it fails and raises a final exception, like the standard open()
# (elided: `function.` and the blank docstring line)
248 It returns the tuple (stream, definitive_file_name).
# NOTE(review): the enclosing `try:` is elided from this view.
252 if sys.platform == 'win32':
# '-' means stdout; switch it to binary mode on Windows so written
# bytes are not mangled by CRLF translation.
254 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
255 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
256 stream = open(encodeFilename(filename), open_mode)
257 return (stream, filename)
258 except (IOError, OSError) as err:
# Permission errors are not filename problems -- re-raise them.
259 if err.errno in (errno.EACCES,):
262 # In case of error, try to remove win32 forbidden chars
263 alt_filename = sanitize_path(filename)
264 if alt_filename == filename:
# (elided: re-raise when sanitizing changed nothing)
267 # An exception here should be caught in the caller
268 stream = open(encodeFilename(alt_filename), open_mode)
269 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a UNIX timestamp.

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.

    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible (skips the cosmetic cleanup of underscores/dashes).
    """
    def replace_insane(char):
        # Control characters and '?' are never allowed.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps: turn 12:34:56 into 12_34_56 before ':' handling.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores and trim them from the ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
319 def sanitize_path(s):
320 """Sanitizes and normalizes path on Windows"""
# On non-Windows platforms the path is returned unchanged
# (elided: the `return s` for this branch).
321 if sys.platform != 'win32':
323 drive_or_unc, _ = os.path.splitdrive(s)
# Python < 2.7 needs splitunc() for UNC (\\server\share) prefixes.
324 if sys.version_info < (2, 7) and not drive_or_unc:
325 drive_or_unc, _ = os.path.splitunc(s)
326 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
# Replace characters forbidden in Windows path components with '#'
# ('.' and '..' components are kept intact).
330 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
331 for path_part in norm_path]
# (elided: dropping an empty leading component before re-inserting the
# drive/UNC prefix)
333 sanitized_path.insert(0, drive_or_unc + os.path.sep)
334 return os.path.join(*sanitized_path)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, preserving order """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#NN;) or hex (&#xNN;).
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # int() accepts the '0x...' prefix form.
            numstr = '0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
# NOTE(review): the `def unescapeHTML(s):` header and the None guard are
# elided from this view -- these two lines are the visible body: every
# '&entity;' occurrence is replaced via _htmlentity_transform.
369 assert type(s) == compat_str
372 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Return the encoding to use for subprocess argument strings."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
387 def encodeFilename(s, for_subprocess=False):
# (elided: docstring opener)
389 @param s The name of the file
# (elided: docstring close)
392 assert type(s) == compat_str
394 # Python 3 has a Unicode API
395 if sys.version_info >= (3, 0):
# (elided: `return s` -- no encoding needed on Python 3)
398 # Pass '' directly to use Unicode APIs on Windows 2000 and up
399 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
400 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
401 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
# (elided: the unicode-passthrough return for that branch)
404 return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: turn an encoded filename back into text."""
    if sys.version_info >= (3, 0):
        # Python 3 filenames are already unicode.
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    """Decode a subprocess argument back into text (subprocess encoding)."""
    for_subprocess = True
    return decodeFilename(b, for_subprocess)
def decodeOption(optval):
    """Decode a command-line option value to unicode (None passes through)."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or plain seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
450 def make_HTTPS_handler(params, **kwargs):
# Build an HTTPS handler honoring the --no-check-certificate option,
# degrading gracefully across Python versions.
451 opts_no_check_certificate = params.get('nocheckcertificate', False)
452 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
453 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
454 if opts_no_check_certificate:
# check_hostname must be disabled before verify_mode can be CERT_NONE.
455 context.check_hostname = False
456 context.verify_mode = ssl.CERT_NONE
# NOTE(review): a try/except around the handler construction is elided
# from this view.
458 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
461 # (create_default_context present but HTTPSHandler has no context=)
464 if sys.version_info < (3, 2):
# Old Pythons: no usable SSLContext support on the handler.
465 return YoutubeDLHTTPSHandler(params, **kwargs)
# Python 3.2/3.3: build an explicit TLSv1 context.
467 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
468 context.verify_mode = (ssl.CERT_NONE
469 if opts_no_check_certificate
470 else ssl.CERT_REQUIRED)
471 context.set_default_verify_paths()
472 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Return the standard "please report this bug" suffix for error messages."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg
486 class ExtractorError(Exception):
487 """Error during info extraction."""
489 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
490 """ tb, if given, is the original traceback (so that it can be printed out).
491 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
# (elided: docstring close)
# Network-style failures are treated as expected (not a youtube-dl bug).
494 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
# (elided: `expected = True`)
496 if video_id is not None:
497 msg = video_id + ': ' + msg
# (elided: `if cause:` guard for the line below)
499 msg += ' (caused by %r)' % cause
# (elided: `if not expected:` guard for the line below)
501 msg += bug_reports_message()
502 super(ExtractorError, self).__init__(msg)
# (elided: assignments of self.traceback / self.expected / self.cause)
505 self.exc_info = sys.exc_info() # preserve original exception
507 self.video_id = video_id
509 def format_traceback(self):
# Render the stored traceback, or '' when none was captured.
510 if self.traceback is None:
512 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""
    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when a mandatory regex search fails to match."""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        # Kept as an attribute because callers read .msg directly.
        self.msg = msg
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
589 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
# Construct an HTTP(S) connection, optionally binding it to the
# user-supplied source address.
590 hc = http_class(*args, **kwargs)
591 source_address = ydl_handler._params.get('source_address')
592 if source_address is not None:
593 sa = (source_address, 0)
594 if hasattr(hc, 'source_address'): # Python 2.7+
595 hc.source_address = sa
# (elided: `else:` -- Python 2.6 fallback that patches connect())
597 def _hc_connect(self, *args, **kwargs):
598 sock = compat_socket_create_connection(
599 (self.host, self.port), self.timeout, sa)
# (elided: `if is_https:` guard for the TLS wrap below)
601 self.sock = ssl.wrap_socket(
602 sock, self.key_file, self.cert_file,
603 ssl_version=ssl.PROTOCOL_TLSv1)
# (elided: plain-HTTP branch assigning the raw socket)
606 hc.connect = functools.partial(_hc_connect, hc)
# (elided: `return hc`)
611 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
612 """Handler for HTTP requests and responses.
614 This class, when installed with an OpenerDirector, automatically adds
615 the standard headers to every HTTP request and handles gzipped and
616 deflated responses from web servers. If compression is to be avoided in
617 a particular request, the original request in the program code only has
618 to include the HTTP header "Youtubedl-No-Compression", which will be
619 removed before making the real request.
621 Part of this code was copied from:
623 http://techknack.net/python-urllib2-handlers/
625 Andrew Rowls, the author of that code, agreed to release it to the
# (elided: `public domain.` and docstring close)
629 def __init__(self, params, *args, **kwargs):
630 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
631 self._params = params
633 def http_open(self, req):
# Route connection creation through _create_http_connection so the
# source_address option is honored.
634 return self.do_open(functools.partial(
635 _create_http_connection, self, compat_http_client.HTTPConnection, False),
# (elided: `req)` closing the call, and the `deflate` staticmethod
# header + its try/except: raw deflate first, then zlib-wrapped)
641 return zlib.decompress(data, -zlib.MAX_WBITS)
643 return zlib.decompress(data)
# (elided: `@staticmethod` decorator)
646 def addinfourl_wrapper(stream, headers, url, code):
# Python 2.6's addinfourl has no `code` argument; emulate it.
647 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
648 return compat_urllib_request.addinfourl(stream, headers, url, code)
649 ret = compat_urllib_request.addinfourl(stream, headers, url)
# (elided: `ret.code = code` / `return ret`)
653 def http_request(self, req):
654 for h, v in std_headers.items():
655 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
656 # The dict keys are capitalized because of this bug by urllib
657 if h.capitalize() not in req.headers:
# (elided: `req.add_header(h, v)`)
659 if 'Youtubedl-no-compression' in req.headers:
660 if 'Accept-encoding' in req.headers:
661 del req.headers['Accept-encoding']
662 del req.headers['Youtubedl-no-compression']
664 if sys.version_info < (2, 7) and '#' in req.get_full_url():
665 # Python 2.6 is brain-dead when it comes to fragments
666 req._Request__original = req._Request__original.partition('#')[0]
667 req._Request__r_type = req._Request__r_type.partition('#')[0]
# (elided: `return req`)
671 def http_response(self, req, resp):
# (elided: `old_resp = resp` and the `# gzip` marker comment)
674 if resp.headers.get('Content-encoding', '') == 'gzip':
675 content = resp.read()
676 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
# (elided: `try:`)
678 uncompressed = io.BytesIO(gz.read())
679 except IOError as original_ioerror:
680 # There may be junk add the end of the file
681 # See http://stackoverflow.com/q/4928560/35070 for details
682 for i in range(1, 1024):
# (elided: inner try:)
684 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
685 uncompressed = io.BytesIO(gz.read())
# (elided: except IOError: continue / break / else:)
690 raise original_ioerror
691 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
692 resp.msg = old_resp.msg
# (elided: `# deflate` marker comment)
694 if resp.headers.get('Content-encoding', '') == 'deflate':
695 gz = io.BytesIO(self.deflate(resp.read()))
696 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
697 resp.msg = old_resp.msg
# (elided: `return resp`)
# HTTPS requests/responses share the HTTP processing.
700 https_request = http_request
701 https_response = http_response
704 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
705 def __init__(self, params, https_conn_class=None, *args, **kwargs):
706 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
# Allow callers to substitute a custom HTTPS connection class.
707 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
708 self._params = params
710 def https_open(self, req):
# (elided: `kwargs = {}` initializer)
# Forward SSL context / hostname checking only where the base
# handler actually stored them (version-dependent attributes).
712 if hasattr(self, '_context'): # python > 2.6
713 kwargs['context'] = self._context
714 if hasattr(self, '_check_hostname'): # python 3.x
715 kwargs['check_hostname'] = self._check_hostname
716 return self.do_open(functools.partial(
717 _create_http_connection, self, self._https_conn_class, True),
# (elided: `req, **kwargs)` closing the call)
721 def parse_iso8601(date_str, delimiter='T', timezone=None):
722 """ Return a UNIX timestamp from the given date """
# NOTE(review): the None guard and the `m = re.search(...)` assignment for
# the timezone suffix are elided from this view.
729 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
# No explicit timezone in the string: treat as UTC.
732 timezone = datetime.timedelta()
# Strip the matched timezone suffix before strptime parsing.
734 date_str = date_str[:-len(m.group(0))]
735 if not m.group('sign'):
736 timezone = datetime.timedelta()
# (elided: `else:`)
738 sign = 1 if m.group('sign') == '+' else -1
739 timezone = datetime.timedelta(
740 hours=sign * int(m.group('hours')),
741 minutes=sign * int(m.group('minutes')))
742 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
743 dt = datetime.datetime.strptime(date_str, date_format) - timezone
744 return calendar.timegm(dt.timetuple())
747 def unified_strdate(date_str, day_first=True):
748 """Return a string with the date in the format YYYYMMDD"""
# (elided: None guard and `upload_date = None` initializer)
# Commas are noise for all supported formats.
754 date_str = date_str.replace(',', ' ')
755 # %z (UTC offset) is only supported in python>=3.2
756 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
757 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
758 # Remove AM/PM + timezone
759 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
# NOTE(review): most of the format table is elided from this view;
# only a sample of the candidate strptime patterns is visible.
761 format_expressions = [
766 '%b %dst %Y %I:%M%p',
767 '%b %dnd %Y %I:%M%p',
768 '%b %dth %Y %I:%M%p',
774 '%Y-%m-%d %H:%M:%S.%f',
777 '%Y-%m-%dT%H:%M:%SZ',
778 '%Y-%m-%dT%H:%M:%S.%fZ',
779 '%Y-%m-%dT%H:%M:%S.%f0Z',
781 '%Y-%m-%dT%H:%M:%S.%f',
# day_first toggles between DD.MM.YYYY-style and MM/DD/YYYY-style
# expression groups (the lists themselves are elided here).
785 format_expressions.extend([
793 format_expressions.extend([
# Try each candidate format until one parses.
800 for expression in format_expressions:
802 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
# (elided: try/except ValueError around the line above)
805 if upload_date is None:
# Last resort: RFC 2822 parsing.
806 timetuple = email.utils.parsedate_tz(date_str)
808 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
# (elided: `return upload_date`)
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL's path; fall back to default_ext."""
    if url is None:
        return default_ext
    # Drop the query string, then take whatever follows the last dot.
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle filename: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad aproximation? months and years are converted to days
        # (30 / 365) since timedelta has no month/year support.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        # Not in YYYYMMDD form -- return unchanged.
        return date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            # Open start: earliest representable date.
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            # Open end: latest representable date.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
# NOTE(review): the `def platform_name():` header is elided from this view;
# the lines below are its body.
897 """ Returns the platform name as a compat_str """
898 res = platform.platform()
# platform.platform() may return bytes on some Python 2 setups.
899 if isinstance(res, bytes):
900 res = res.decode(preferredencoding())
902 assert isinstance(res, compat_str)
# (elided: `return res`)
906 def _windows_write_string(s, out):
907 """ Returns True if the string was written using special methods,
908 False if it has yet to be written out."""
909 # Adapted from http://stackoverflow.com/a/3259271/35070
# (elided: `import ctypes`)
912 import ctypes.wintypes
# (elided: WIN_OUTPUT_IDS mapping of fileno -> std handle id, and `try:`)
920 fileno = out.fileno()
921 except AttributeError:
922 # If the output stream doesn't have a fileno, it's virtual
# (elided: `return False`)
924 except io.UnsupportedOperation:
925 # Some strange Windows pseudo files?
# (elided: `return False`)
927 if fileno not in WIN_OUTPUT_IDS:
# (elided: `return False`)
# Resolve the Win32 console handle for this standard stream.
930 GetStdHandle = ctypes.WINFUNCTYPE(
931 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
932 (b"GetStdHandle", ctypes.windll.kernel32))
933 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
935 WriteConsoleW = ctypes.WINFUNCTYPE(
936 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
937 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
938 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
939 written = ctypes.wintypes.DWORD(0)
941 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
942 FILE_TYPE_CHAR = 0x0002
943 FILE_TYPE_REMOTE = 0x8000
944 GetConsoleMode = ctypes.WINFUNCTYPE(
945 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
946 ctypes.POINTER(ctypes.wintypes.DWORD))(
947 (b"GetConsoleMode", ctypes.windll.kernel32))
948 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
950 def not_a_console(handle):
951 if handle == INVALID_HANDLE_VALUE or handle is None:
# (elided: `return True`)
# A real console is a local character device with a console mode.
953 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
954 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
# (elided: `if not_a_console(h): return False`)
959 def next_nonbmp_pos(s):
# (elided: `try:`)
961 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
962 except StopIteration:
# (elided: `return len(s)`)
# (elided: `while s:` loop header)
# WriteConsoleW counts UTF-16 code units; stop before non-BMP chars
# and write them as a surrogate pair (count 2) separately.
966 count = min(next_nonbmp_pos(s), 1024)
# (elided: `ret = WriteConsoleW(` call opener)
969 h, s, count if count else 2, ctypes.byref(written), None)
# (elided: `if ret == 0:`)
971 raise OSError('Failed to write string')
972 if not count: # We just wrote a non-BMP character
973 assert written.value == 2
# (elided: `s = s[1:]` / `else:`)
976 assert written.value > 0
977 s = s[written.value:]
# (elided: `return True`)
981 def write_string(s, out=None, encoding=None):
# (elided: defaulting `out` to sys.stderr)
984 assert type(s) == compat_str
# On Windows consoles, prefer the WriteConsoleW fast path so Unicode
# renders correctly.
986 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
987 if _windows_write_string(s, out):
# (elided: `return`)
990 if ('b' in getattr(out, 'mode', '') or
991 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
992 byt = s.encode(encoding or preferredencoding(), 'ignore')
# (elided: `out.write(byt)`)
994 elif hasattr(out, 'buffer'):
# Text stream with an underlying binary buffer: encode explicitly.
995 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
996 byt = s.encode(enc, 'ignore')
997 out.buffer.write(byt)
# (elided: plain `else: out.write(s)` branch and `out.flush()`)
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: pack integer byte values into bytes."""
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)
1018 # Cross-platform file locking
1019 if sys.platform == 'win32':
1020 import ctypes.wintypes
# (elided: `import msvcrt`)
# OVERLAPPED structure required by LockFileEx/UnlockFileEx.
1023 class OVERLAPPED(ctypes.Structure):
# (elided: `_fields_ = [` opener)
1025 ('Internal', ctypes.wintypes.LPVOID),
1026 ('InternalHigh', ctypes.wintypes.LPVOID),
1027 ('Offset', ctypes.wintypes.DWORD),
1028 ('OffsetHigh', ctypes.wintypes.DWORD),
1029 ('hEvent', ctypes.wintypes.HANDLE),
# (elided: `]` closer)
1032 kernel32 = ctypes.windll.kernel32
1033 LockFileEx = kernel32.LockFileEx
1034 LockFileEx.argtypes = [
1035 ctypes.wintypes.HANDLE, # hFile
1036 ctypes.wintypes.DWORD, # dwFlags
1037 ctypes.wintypes.DWORD, # dwReserved
1038 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1039 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1040 ctypes.POINTER(OVERLAPPED) # Overlapped
# (elided: `]` closer)
1042 LockFileEx.restype = ctypes.wintypes.BOOL
1043 UnlockFileEx = kernel32.UnlockFileEx
1044 UnlockFileEx.argtypes = [
1045 ctypes.wintypes.HANDLE, # hFile
1046 ctypes.wintypes.DWORD, # dwReserved
1047 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1048 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1049 ctypes.POINTER(OVERLAPPED) # Overlapped
# (elided: `]` closer)
1051 UnlockFileEx.restype = ctypes.wintypes.BOOL
# Lock the whole file: byte range 0 .. 0x7fffffffffffffff.
1052 whole_low = 0xffffffff
1053 whole_high = 0x7fffffff
1055 def _lock_file(f, exclusive):
1056 overlapped = OVERLAPPED()
1057 overlapped.Offset = 0
1058 overlapped.OffsetHigh = 0
1059 overlapped.hEvent = 0
# Keep a reference on the file object so the OVERLAPPED buffer
# stays alive until _unlock_file.
1060 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1061 handle = msvcrt.get_osfhandle(f.fileno())
1062 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1063 whole_low, whole_high, f._lock_file_overlapped_p):
1064 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1066 def _unlock_file(f):
1067 assert f._lock_file_overlapped_p
1068 handle = msvcrt.get_osfhandle(f.fileno())
1069 if not UnlockFileEx(handle, 0,
1070 whole_low, whole_high, f._lock_file_overlapped_p):
1071 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
# (elided: `else:` branch header and `import fcntl` for POSIX systems)
1076 def _lock_file(f, exclusive):
1077 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1079 def _unlock_file(f):
1080 fcntl.flock(f, fcntl.LOCK_UN)
1083 class locked_file(object):
# File wrapper that takes a whole-file lock for the duration of a
# `with` block (shared for reads, exclusive for writes/appends).
1084 def __init__(self, filename, mode, encoding=None):
1085 assert mode in ['r', 'a', 'w']
1086 self.f = io.open(filename, mode, encoding=encoding)
# (elided: `self.mode = mode`)
1089 def __enter__(self):
1090 exclusive = self.mode != 'r'
# (elided: `try:`)
1092 _lock_file(self.f, exclusive)
# (elided: except: close and re-raise; `return self`)
1098 def __exit__(self, etype, value, traceback):
# (elided: `try:`)
1100 _unlock_file(self.f)
# (elided: `finally: self.f.close()` and the __iter__ delegation)
1107 def write(self, *args):
1108 return self.f.write(*args)
1110 def read(self, *args):
1111 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    return enc if enc is not None else 'utf-8'
def shell_quote(args):
    """Quote a list of arguments for display as a single shell command line."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return '%s#%s' % (url, sdata)
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: return (url, data) or (url, default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string (e.g. '1.00KiB')."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # log(0) is undefined; zero bytes is plain 'B'.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1161 def parse_filesize(s):
# (elided: None guard and the _UNIT_TABLE opener -- the full table of
# unit suffixes, orig lines 1167-1203, is elided from this view.)
1165 # The lower-case forms are of course incorrect and inofficial,
1166 # but we support those too
# Build an alternation of all known unit suffixes for the regex.
1204 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
# (elided: `m = re.match(` opener)
1206 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
# (elided: `if not m: return None`)
# Accept ',' as a decimal separator.
1210 num_str = m.group('num').replace(',', '.')
1211 mult = _UNIT_TABLE[m.group('unit')]
1212 return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month name.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Unknown abbreviation.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Only bare ampersands are escaped; existing entities and numeric
    # character references are left untouched.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
1242 def setproctitle(title):
1243 assert isinstance(title, compat_str)
# (elided: `try:`) -- best effort: silently skip when libc is missing.
1245 libc = ctypes.cdll.LoadLibrary("libc.so.6")
# (elided: `except OSError: return`)
1248 title_bytes = title.encode('utf-8')
1249 buf = ctypes.create_string_buffer(len(title_bytes))
1250 buf.value = title_bytes
# (elided: `try:`)
# 15 = PR_SET_NAME (Linux prctl operation).
1252 libc.prctl(15, buf, 0, 0, 0)
1253 except AttributeError:
1254 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the prefix *start* (unchanged if not present)."""
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s without the suffix *end* (unchanged if not present)."""
    if s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last component of the URL's path."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip('/').split('/')[-1]
class HEADRequest(compat_urllib_request.Request):
    """A Request that always uses the HEAD method."""
    def get_method(self):
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int (optionally via getattr and scaling); default for None."""
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce v to a string; return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Strip thousands separators and stray '+' signs.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float with invscale/scale applied; default for None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1304 def parse_duration(s):
# Parse human-readable durations ('1:23', '3 min 10 s', '2h30m', ...)
# into seconds (float when milliseconds present).
1305 if not isinstance(s, compat_basestring):
# (elided: `return None`, `s = s.strip()`, and the `m = re.match(` opener)
1313 (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
1314 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1316 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
1319 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1320 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1322 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1324 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
# (elided: regex close, `, s)`, and `if not m: return None`)
1329 if m.group('only_mins'):
1330 return float_or_none(m.group('only_mins'), invscale=60)
1331 if m.group('only_hours'):
1332 return float_or_none(m.group('only_hours'), invscale=60 * 60)
# (elided: `res = 0` and `if m.group('secs'):` guard)
1334 res += int(m.group('secs'))
1335 if m.group('mins_reversed'):
1336 res += int(m.group('mins_reversed')) * 60
# (elided: `if m.group('mins'):` guard)
1338 res += int(m.group('mins')) * 60
1339 if m.group('hours'):
1340 res += int(m.group('hours')) * 60 * 60
1341 if m.group('hours_reversed'):
1342 res += int(m.group('hours_reversed')) * 60 * 60
# (elided: `if m.group('days'):` guard)
1344 res += int(m.group('days')) * 24 * 60 * 60
# (elided: `if m.group('ms'):` guard and final `return res`)
1346 res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the filename's real extension.

    When expected_real_ext is given and does not match the actual
    extension, *ext* is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with *ext*.

    When expected_real_ext is given and does not match the actual
    extension, *ext* is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Not found / not executable.
        return False
    return exe
1375 def get_exe_version(exe, args=['--version'],
1376 version_re=None, unrecognized='present'):
1377 """ Returns the version of the specified executable,
1378 or False if the executable is not present """
# NOTE(review): mutable default `args=['--version']` - unmutated here,
# but a None sentinel would be safer.
# Run `exe args`, folding stderr into stdout (some tools print their
# version there).  The try/except for a missing binary is elided here.
1380 out, _ = subprocess.Popen(
1381 [encodeArgument(exe)] + args,
1382 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
# Popen output is bytes on Python 2; normalize to text before parsing.
1385 if isinstance(out, bytes): # Python 2.x
1386 out = out.decode('ascii', 'ignore')
1387 return detect_exe_version(out, version_re, unrecognized)
1390 def detect_exe_version(output, version_re=None, unrecognized='present'):
# Extract a version string from `output` using version_re; the handling
# of the match (returning group(1) or `unrecognized`) is elided in this
# excerpt.
1391 assert isinstance(output, compat_str)
1392 if version_re is None:
# Default pattern: the token following the word 'version'.
1393 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1394 m = re.search(version_re, output)
1401 class PagedList(object):
# Abstract base for paginated result lists; subclasses implement
# getslice().  The `def __len__(self):` line is elided in this excerpt -
# its body simply materializes the full slice and counts it.
1403 # This is only useful for tests
1404 return len(self.getslice())
1407 class OnDemandPagedList(PagedList):
# PagedList backed by a page-fetching callback; pages are requested
# lazily while getslice() walks through them.
1408 def __init__(self, pagefunc, pagesize):
# pagefunc(pagenum) -> iterable with the entries of that page.
1409 self._pagefunc = pagefunc
1410 self._pagesize = pagesize
# Return the entries [start:end) across page boundaries.
1412 def getslice(self, start=0, end=None):
1414 for pagenum in itertools.count(start // self._pagesize):
# Absolute indices delimiting the current page.
1415 firstid = pagenum * self._pagesize
1416 nextfirstid = pagenum * self._pagesize + self._pagesize
1417 if start >= nextfirstid:
# This page lies entirely before the requested slice; skip it.
1420 page_results = list(self._pagefunc(pagenum))
# In-page start offset: only non-zero on the page containing `start`.
1423 start % self._pagesize
1424 if firstid <= start < nextfirstid
# In-page end offset: only set on the page containing `end`.
1428 ((end - 1) % self._pagesize) + 1
1429 if (end is not None and firstid <= end <= nextfirstid)
1432 if startv != 0 or endv is not None:
1433 page_results = page_results[startv:endv]
1434 res.extend(page_results)
1436 # A little optimization - if current page is not "full", ie. does
1437 # not contain page_size videos then we can assume that this page
1438 # is the last one - there are no more ids on further pages -
1439 # i.e. no need to query again.
1440 if len(page_results) + startv < self._pagesize:
1443 # If we got the whole page, but the next page is not interesting,
1444 # break out early as well
1445 if end == nextfirstid:
1450 class InAdvancePagedList(PagedList):
# PagedList variant whose total page count is known up front.
1451 def __init__(self, pagefunc, pagecount, pagesize):
1452 self._pagefunc = pagefunc
1453 self._pagecount = pagecount
1454 self._pagesize = pagesize
1456 def getslice(self, start=0, end=None):
# First and (exclusive) last page numbers covering the requested range.
1458 start_page = start // self._pagesize
1460 self._pagecount if end is None else (end // self._pagesize + 1))
# Entries to drop from the first page, and how many are still wanted.
1461 skip_elems = start - start_page * self._pagesize
1462 only_more = None if end is None else end - start
1463 for pagenum in range(start_page, end_page):
1464 page = list(self._pagefunc(pagenum))
1466 page = page[skip_elems:]
1468 if only_more is not None:
1469 if len(page) < only_more:
1470 only_more -= len(page)
# Last partial page: trim to exactly the remaining count.
1472 page = page[:only_more]
def uppercase_escape(s):
    """Replace literal \\UXXXXXXXX escape sequences in *s* with the
    characters they denote."""
    unicode_escape = codecs.getdecoder('unicode_escape')

    def expand(m):
        # Decode just the matched escape; [0] is the decoded text.
        return unicode_escape(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
def lowercase_escape(s):
    """Replace literal \\uXXXX escape sequences in *s* with the characters
    they denote."""
    unicode_escape = codecs.getdecoder('unicode_escape')

    def expand(m):
        # Decode just the matched escape; [0] is the decoded text.
        return unicode_escape(m.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Reserved/unreserved characters that must survive unescaped.
    safe = b"%/;:@&=+$,!~*'()?#[]"
    if isinstance(s, compat_str) and sys.version_info < (3, 0):
        # Python 2's quote() chokes on unicode input; feed it UTF-8 bytes.
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe)
1502 def escape_url(url):
1503 """Escape URL as suggested by RFC 3986"""
1504 url_parsed = compat_urllib_parse_urlparse(url)
# Percent-escape each URL component individually so that the structural
# separators (/, ?, #) are preserved.
1505 return url_parsed._replace(
1506 path=escape_rfc3986(url_parsed.path),
1507 params=escape_rfc3986(url_parsed.params),
1508 query=escape_rfc3986(url_parsed.query),
1509 fragment=escape_rfc3986(url_parsed.fragment)
# NOTE(review): the close of this call (presumably `).geturl()` to turn
# the ParseResult back into a string) is elided from this excerpt.
# Feature probe (the surrounding try/except/else keywords are elided in
# this excerpt): exercise struct.pack with a text format string once to
# find out whether this interpreter accepts it.
1513 struct.pack('!I', 0)
1515 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
# Wrappers that transparently encode a unicode spec to ASCII bytes.
1516 def struct_pack(spec, *args):
1517 if isinstance(spec, compat_str):
1518 spec = spec.encode('ascii')
1519 return struct.pack(spec, *args)
1521 def struct_unpack(spec, *args):
1522 if isinstance(spec, compat_str):
1523 spec = spec.encode('ascii')
1524 return struct.unpack(spec, *args)
# Well-behaved builds: alias the stdlib functions directly.
1526 struct_pack = struct.pack
1527 struct_unpack = struct.unpack
1530 def read_batch_urls(batch_fd):
# Read one URL per line from a batch file object, normalizing each line.
# The opening `def` line of the inner fixup helper and its return
# statements are elided in this excerpt.
1532 if not isinstance(url, compat_str):
# Decode raw bytes (Python 2 file objects) as UTF-8.
1533 url = url.decode('utf-8', 'replace')
# Strip a UTF-8 byte-order mark that survived decoding.
1534 BOM_UTF8 = '\xef\xbb\xbf'
1535 if url.startswith(BOM_UTF8):
1536 url = url[len(BOM_UTF8):]
# Lines starting with these comment markers are rejected by fixup.
1538 if url.startswith(('#', ';', ']')):
# Closing the fd is the caller's file object's responsibility; we wrap
# it so it is closed even on error.
1542 with contextlib.closing(batch_fd) as fd:
# Keep only truthy results, i.e. skip blank and comment lines.
1543 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes, as required by
    urllib request bodies."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter is missing on very old interpreters; fall back to a
# findall-based walk there.  (The probe's `try:` line is reconstructed.)
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
# Interior of an XML-parsing helper (its `def` line is elided in this
# excerpt): parses the string `s` into an ElementTree element.
1557 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1558 def doctype(self, name, pubid, system):
1559 pass # Ignore doctypes
1561 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# Python 2.6's XML() does not accept the custom-parser keyword argument.
1562 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1563 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1564 # Fix up XML parser in Python 2.x
1565 if sys.version_info < (3, 0):
# Python 2 may hand back byte strings for text nodes; force unicode.
1566 for n in etree_iter(tree):
1567 if n.text is not None:
1568 if not isinstance(n.text, compat_str):
1569 n.text = n.text.decode('utf-8')
1582 def parse_age_limit(s):
# Parse an age limit such as '18' or '18+' into an int; values that do
# not match are looked up in the US_RATINGS table instead.  The
# None-guard for `s` is elided in this excerpt.
1585 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1586 return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper (`callback({...});` plus optional trailing
    // comments) and return the bare payload; non-JSONP input is
    returned unchanged."""
    jsonp_rex = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_rex, r'\1', code)
1594 def js_to_json(code):
# Convert JavaScript object notation into valid JSON.  The body of the
# per-token fixup helper is partially elided in this excerpt.
1597 if v in ('true', 'false', 'null'):
1599 if v.startswith('"'):
1601 if v.startswith("'"):
# Re-escape the contents of single-quoted strings for double-quoted JSON.
1603 v = re.sub(r"\\\\|\\'|\"", lambda m: {
# Tokenizer: double-quoted strings, single-quoted strings, and bare
# identifiers/keywords - each alternative is rewritten by the fixup above.
1610 res = re.sub(r'''(?x)
1611 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1612 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1613 [a-zA-Z_][.a-zA-Z_0-9]*
# JSON forbids trailing commas before ] or }; strip them.
1615 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1619 def qualities(quality_ids):
1620 """ Get a numeric quality value out of a list of possible values """
# Returns a callable mapping a quality id to its position in quality_ids
# (higher index = better quality).  The closure's `def` line and its
# handling of unknown ids are elided in this excerpt.
1623 return quality_ids.index(qid)
# Default output filename template: '<title>-<id>.<ext>'.
1629 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1632 def limit_length(s, length):
1633 """ Add ellipses to overly long strings """
# Truncate so that the result, ellipses included, fits within `length`.
# The None/short-string fast paths and the ELLIPSES constant definition
# are elided in this excerpt.
1638 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string like '2015.01.23-1' into a tuple of ints,
    treating '.' and '-' alike as separators."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
1646 def is_outdated_version(version, limit, assume_new=True):
# Missing/empty version string: answer according to the assume_new
# policy (the guard's condition line is elided in this excerpt).
1648 return not assume_new
# Compare dotted version tuples; `limit` is the oldest acceptable one.
1650 return version_tuple(version) < version_tuple(limit)
# An unparsable version also falls back to the assume_new policy (the
# surrounding try/except lines are elided in this excerpt).
1652 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Self-updating works when running from a zip bundle...
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    # ...or from a frozen (py2exe-style) executable.
    return hasattr(sys, 'frozen')
def args_to_str(args):
    """Get a short, shell-quoted string representation for a subprocess
    command (for display purposes)."""
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
1667 def mimetype2ext(mt):
# Map a MIME type to a file extension based on its subtype part.
1668 _, _, res = mt.rpartition('/')
# Special-case table (its remaining entries and the `.get(res, res)`
# fallback, which passes unknown subtypes through unchanged, are elided
# in this excerpt).
1672 'x-mp4-fragmented': 'mp4',
1677 def urlhandle_detect_ext(url_handle):
# Guess the file extension for a response object: prefer the filename in
# Content-Disposition, fall back to mapping the Content-Type.  The
# try/if guards around these steps are elided in this excerpt.
1680 getheader = lambda h: url_handle.headers[h]
1681 except AttributeError: # Python < 3
# Python 2 response objects expose headers via info().getheader().
1682 getheader = url_handle.info().getheader
1684 cd = getheader('Content-Disposition')
1686 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1688 e = determine_ext(m.group('filename'), default_ext=None)
1692 return mimetype2ext(getheader('Content-Type'))
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:
        # No limit set by the user: nothing is blocked.
        return False
    if content_limit is None:
        # Content available for everyone.
        return False
    return age_limit < content_limit
1705 def is_html(first_bytes):
1706 """ Detect whether a file contains HTML by examining its first bytes. """
# Known byte-order marks and the encoding each implies (the list opener
# and the loop's break/else lines are elided in this excerpt).
1709 (b'\xef\xbb\xbf', 'utf-8'),
1710 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1711 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1712 (b'\xff\xfe', 'utf-16-le'),
1713 (b'\xfe\xff', 'utf-16-be'),
1715 for bom, enc in BOMS:
1716 if first_bytes.startswith(bom):
# Strip the BOM and decode with the matching encoding.
1717 s = first_bytes[len(bom):].decode(enc, 'replace')
# No BOM found: assume UTF-8.
1720 s = first_bytes.decode('utf-8', 'replace')
# HTML-ish content starts with '<' after optional whitespace.
1722 return re.match(r'^\s*<', s)
1725 def determine_protocol(info_dict):
# An explicit protocol in the info dict wins (its `return` line is
# elided in this excerpt).
1726 protocol = info_dict.get('protocol')
1727 if protocol is not None:
1730 url = info_dict['url']
# Otherwise infer the protocol from the URL prefix (the return values of
# these branches are elided in this excerpt).
1731 if url.startswith('rtmp'):
1733 elif url.startswith('mms'):
1735 elif url.startswith('rtsp'):
# Extension-based detection (m3u8/f4m handling elided here).
1738 ext = determine_ext(url)
# Fall back to the plain URL scheme (http/https/ftp/...).
1744 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Width of each column = its longest cell (header included).
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*table)]
    # Left-align every column but the last, padded to width + 1 spaces.
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in table)
1755 def _match_one(filter_part, dct):
# Evaluate one filter expression (e.g. 'duration>60', 'uploader=x')
# against dct.  Binary comparison operators are tried first; parts of
# the operator table and several guard lines are elided in this excerpt.
1756 COMPARISON_OPERATORS = {
1764 operator_rex = re.compile(r'''(?x)\s*
1766 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
# Right-hand side: a number with optional SI/binary size suffix, or a
# bare string value.
1768 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1769 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1772 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1773 m = operator_rex.search(filter_part)
1775 op = COMPARISON_OPERATORS[m.group('op')]
1776 if m.group('strval') is not None:
# String values only support equality/inequality.
1777 if m.group('op') not in ('=', '!='):
1779 'Operator %s does not support string values!' % m.group('op'))
1780 comparison_value = m.group('strval')
# Numeric values: try a plain int first, then filesize notation
# ('500k', '2MiB'), then the same with an implied trailing 'B'.
1783 comparison_value = int(m.group('intval'))
1785 comparison_value = parse_filesize(m.group('intval'))
1786 if comparison_value is None:
1787 comparison_value = parse_filesize(m.group('intval') + 'B')
1788 if comparison_value is None:
1790 'Invalid integer value %r in filter part %r' % (
1791 m.group('intval'), filter_part))
1792 actual_value = dct.get(m.group('key'))
# Missing key: the trailing '?' ("none inclusive") marker decides.
1793 if actual_value is None:
1794 return m.group('none_inclusive')
1795 return op(actual_value, comparison_value)
# Unary presence operators: 'key' (is set) and '!key' (is unset).
1798 '': lambda v: v is not None,
1799 '!': lambda v: v is None,
1801 operator_rex = re.compile(r'''(?x)\s*
1802 (?P<op>%s)\s*(?P<key>[a-z_]+)
1804 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1805 m = operator_rex.search(filter_part)
1807 op = UNARY_OPERATORS[m.group('op')]
1808 actual_value = dct.get(m.group('key'))
1809 return op(actual_value)
# Neither operator form matched the filter part.
1811 raise ValueError('Invalid filter part %r' % filter_part)
1814 def match_str(filter_str, dct):
1815 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
# '&'-separated parts are each evaluated with _match_one and combined
# (a conjunction, per the '&' separator; the combining call's opening
# line is elided in this excerpt).
1818 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
1821 def match_filter_func(filter_str):
# Build a --match-filter callback: per the visible branch, a passing
# video yields no message while a failing one yields a human-readable
# skip message (the None-return and the final `return _match_func` are
# elided in this excerpt).
1822 def _match_func(info_dict):
1823 if match_str(filter_str, info_dict):
1826 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1827 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1831 def parse_dfxp_time_expr(time_expr):
# Parse a TTML/DFXP time expression into seconds.  The empty-input guard
# and the `if mobj:` lines around each return are elided in this excerpt.
# Plain offset form: '12.345' or '12.345s'.
1835 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
1837 return float(mobj.group('time_offset'))
# Clock form: 'HH:MM:SS[.mmm]'.
1839 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
1841 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode: HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates the float components, matching integer timecode parts.
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
1848 def dfxp2srt(dfxp_data):
# Convert DFXP/TTML subtitle markup into SRT text.
1849 _x = functools.partial(xpath_with_ns, ns_map={
1850 'ttml': 'http://www.w3.org/ns/ttml',
1851 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
# Flatten a paragraph node (with nested <span>/<br>) into plain text.
1854 def parse_node(node):
1855 str_or_empty = functools.partial(str_or_none, default='')
1857 out = str_or_empty(node.text)
1860 if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
# <br> becomes a newline, followed by any trailing text.
1861 out += '\n' + str_or_empty(child.tail)
1862 elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
1863 out += str_or_empty(parse_node(child))
# Unknown child elements are serialized verbatim.
1865 out += str_or_empty(xml.etree.ElementTree.tostring(child))
1869 dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
# Paragraphs may live in either TTML namespace or in none at all.
1871 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
1874 raise ValueError('Invalid dfxp/TTML subtitle')
# Number the cues from 1, as SRT requires.
1876 for para, index in zip(paras, itertools.count(1)):
1877 begin_time = parse_dfxp_time_expr(para.attrib['begin'])
1878 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
# Without an explicit end, derive it from the duration attribute.
1880 end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
# SRT cue: index line, 'start --> end' timecodes, then the text.
1881 out.append('%d\n%s --> %s\n%s\n\n' % (
1883 srt_subtitles_timecode(begin_time),
1884 srt_subtitles_timecode(end_time),
1890 class ISO639Utils(object):
1891 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
# The _lang_map dict (ISO 639-1 two-letter -> ISO 639-2/T three-letter
# codes) is elided in this excerpt, as are the @classmethod decorators
# on the two helpers below.
2080 def short2long(cls, code):
2081 """Convert language code from ISO 639-1 to ISO 639-2/T"""
# Only the first two characters are significant ('en-US' -> 'en').
2082 return cls._lang_map.get(code[:2])
2085 def long2short(cls, code):
2086 """Convert language code from ISO 639-2/T to ISO 639-1"""
# Reverse lookup: linear scan of the map for the matching long code
# (the `return short_name` line is elided in this excerpt).
2087 for short_name, long_name in cls._lang_map.items():
2088 if long_name == code:
2092 class ISO3166Utils(object):
2093 # From http://data.okfn.org/data/core/country-list
# Mapping of ISO 3166-1 alpha-2 codes to full English country names.
# The dict opener and most of its entries are elided in this excerpt.
2095 'AF': 'Afghanistan',
# NOTE(review): mojibake below - should read 'Åland Islands' (UTF-8
# bytes mis-decoded as Latin-1); fix the encoding of this literal.
2096 'AX': 'Ã…land Islands',
2099 'AS': 'American Samoa',
2104 'AG': 'Antigua and Barbuda',
2121 'BO': 'Bolivia, Plurinational State of',
2122 'BQ': 'Bonaire, Sint Eustatius and Saba',
2123 'BA': 'Bosnia and Herzegovina',
2125 'BV': 'Bouvet Island',
2127 'IO': 'British Indian Ocean Territory',
2128 'BN': 'Brunei Darussalam',
2130 'BF': 'Burkina Faso',
2136 'KY': 'Cayman Islands',
2137 'CF': 'Central African Republic',
2141 'CX': 'Christmas Island',
2142 'CC': 'Cocos (Keeling) Islands',
2146 'CD': 'Congo, the Democratic Republic of the',
2147 'CK': 'Cook Islands',
2149 'CI': 'Côte d\'Ivoire',
2154 'CZ': 'Czech Republic',
2158 'DO': 'Dominican Republic',
2161 'SV': 'El Salvador',
2162 'GQ': 'Equatorial Guinea',
2166 'FK': 'Falkland Islands (Malvinas)',
2167 'FO': 'Faroe Islands',
2171 'GF': 'French Guiana',
2172 'PF': 'French Polynesia',
2173 'TF': 'French Southern Territories',
2188 'GW': 'Guinea-Bissau',
2191 'HM': 'Heard Island and McDonald Islands',
2192 'VA': 'Holy See (Vatican City State)',
2199 'IR': 'Iran, Islamic Republic of',
2202 'IM': 'Isle of Man',
2212 'KP': 'Korea, Democratic People\'s Republic of',
2213 'KR': 'Korea, Republic of',
2216 'LA': 'Lao People\'s Democratic Republic',
2222 'LI': 'Liechtenstein',
2226 'MK': 'Macedonia, the Former Yugoslav Republic of',
2233 'MH': 'Marshall Islands',
2239 'FM': 'Micronesia, Federated States of',
2240 'MD': 'Moldova, Republic of',
2251 'NL': 'Netherlands',
2252 'NC': 'New Caledonia',
2253 'NZ': 'New Zealand',
2258 'NF': 'Norfolk Island',
2259 'MP': 'Northern Mariana Islands',
2264 'PS': 'Palestine, State of',
2266 'PG': 'Papua New Guinea',
2269 'PH': 'Philippines',
2273 'PR': 'Puerto Rico',
2277 'RU': 'Russian Federation',
2279 'BL': 'Saint Barthélemy',
2280 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2281 'KN': 'Saint Kitts and Nevis',
2282 'LC': 'Saint Lucia',
2283 'MF': 'Saint Martin (French part)',
2284 'PM': 'Saint Pierre and Miquelon',
2285 'VC': 'Saint Vincent and the Grenadines',
2288 'ST': 'Sao Tome and Principe',
2289 'SA': 'Saudi Arabia',
2293 'SL': 'Sierra Leone',
2295 'SX': 'Sint Maarten (Dutch part)',
2298 'SB': 'Solomon Islands',
2300 'ZA': 'South Africa',
2301 'GS': 'South Georgia and the South Sandwich Islands',
2302 'SS': 'South Sudan',
2307 'SJ': 'Svalbard and Jan Mayen',
2310 'CH': 'Switzerland',
2311 'SY': 'Syrian Arab Republic',
2312 'TW': 'Taiwan, Province of China',
2314 'TZ': 'Tanzania, United Republic of',
2316 'TL': 'Timor-Leste',
2320 'TT': 'Trinidad and Tobago',
2323 'TM': 'Turkmenistan',
2324 'TC': 'Turks and Caicos Islands',
2328 'AE': 'United Arab Emirates',
2329 'GB': 'United Kingdom',
2330 'US': 'United States',
2331 'UM': 'United States Minor Outlying Islands',
2335 'VE': 'Venezuela, Bolivarian Republic of',
2337 'VG': 'Virgin Islands, British',
2338 'VI': 'Virgin Islands, U.S.',
2339 'WF': 'Wallis and Futuna',
2340 'EH': 'Western Sahara',
# The @classmethod decorator for short2full is elided in this excerpt.
2347 def short2full(cls, code):
2348 """Convert an ISO 3166-2 country code to the corresponding full name"""
# Lookup is case-insensitive; unknown codes yield None.
2349 return cls._country_map.get(code.upper())
2352 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
# ProxyHandler variant that lets individual requests override the proxy
# via a private 'Ytdl-request-proxy' header.
2353 def __init__(self, proxies=None):
2354 # Set default handlers
# NOTE(review): `type` shadows the builtin here; binding type/meth as
# lambda default arguments avoids the late-binding closure pitfall.
2355 for type in ('http', 'https'):
2356 setattr(self, '%s_open' % type,
2357 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2358 meth(r, proxy, type))
2359 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2361 def proxy_open(self, req, proxy, type):
# Per-request override: consume the private header if present (the
# assignment of req_proxy into `proxy` is elided in this excerpt).
2362 req_proxy = req.headers.get('Ytdl-request-proxy')
2363 if req_proxy is not None:
2365 del req.headers['Ytdl-request-proxy']
# Sentinel meaning "explicitly no proxy for this scheme".
2367 if proxy == '__noproxy__':
2368 return None # No Proxy
2369 return compat_urllib_request.ProxyHandler.proxy_open(
2370 self, req, proxy, type)