2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
42 compat_socket_create_connection,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
53 # This is not clearly defined otherwise
54 compiled_regex_type = type(re.compile(''))
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
67 ENGLISH_MONTH_NAMES = [
68 'January', 'February', 'March', 'April', 'May', 'June',
69 'July', 'August', 'September', 'October', 'November', 'December']
72 def preferredencoding():
73 """Get preferred encoding.
75 Returns the best encoding scheme for the system, based on
76 locale.getpreferredencoding() and some further tweaks.
79 pref = locale.getpreferredencoding()
87 def write_json_file(obj, fn):
88 """ Encode obj as JSON and write it to fn, atomically if possible """
90 fn = encodeFilename(fn)
91 if sys.version_info < (3, 0) and sys.platform != 'win32':
92 encoding = get_filesystem_encoding()
93 # os.path.basename returns a bytes object, but NamedTemporaryFile
94 # will fail if the filename contains non ascii characters unless we
95 # use a unicode object
96 path_basename = lambda f: os.path.basename(fn).decode(encoding)
97 # the same for os.path.dirname
98 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
100 path_basename = os.path.basename
101 path_dirname = os.path.dirname
105 'prefix': path_basename(fn) + '.',
106 'dir': path_dirname(fn),
110 # In Python 2.x, json.dump expects a bytestream.
111 # In Python 3.x, it writes to a character stream
112 if sys.version_info < (3, 0):
120 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
125 if sys.platform == 'win32':
126 # Need to remove existing file on Windows, else os.rename raises
127 # WindowsError or FileExistsError.
132 os.rename(tf.name, fn)
141 if sys.version_info >= (2, 7):
142 def find_xpath_attr(node, xpath, key, val=None):
143 """ Find the xpath xpath[@key=val] """
144 assert re.match(r'^[a-zA-Z-]+$', key)
146 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
147 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
148 return node.find(expr)
150 def find_xpath_attr(node, xpath, key, val=None):
151 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
152 # .//node does not match if a node is a direct child of . !
153 if isinstance(xpath, compat_str):
154 xpath = xpath.encode('ascii')
156 for f in node.findall(xpath):
157 if key not in f.attrib:
159 if val is None or f.attrib.get(key) == val:
163 # On python2.6 the xml.etree.ElementTree.Element methods don't support
164 # the namespace parameter
167 def xpath_with_ns(path, ns_map):
168 components = [c.split(':') for c in path.split('/')]
172 replaced.append(c[0])
175 replaced.append('{%s}%s' % (ns_map[ns], tag))
176 return '/'.join(replaced)
179 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
180 if sys.version_info < (2, 7): # Crazy 2.6
181 xpath = xpath.encode('ascii')
184 if n is None or n.text is None:
185 if default is not NO_DEFAULT:
188 name = xpath if name is None else name
189 raise ExtractorError('Could not find XML element %s' % name)
195 def get_element_by_id(id, html):
196 """Return the content of the tag with the specified ID in the passed HTML document"""
# Thin wrapper: delegates to the attribute-based lookup with attribute="id".
# `id` shadows the builtin, but it is the public parameter name and must stay.
197 return get_element_by_attribute("id", id, html)
200 def get_element_by_attribute(attribute, value, html):
201 """Return the content of the tag with the specified attribute in the passed HTML document"""
203 m = re.search(r'''(?xs)
205 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
207 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
211 ''' % (re.escape(attribute), re.escape(value)), html)
215 res = m.group('content')
217 if res.startswith('"') or res.startswith("'"):
220 return unescapeHTML(res)
223 def clean_html(html):
224 """Clean an HTML snippet into a readable string"""
# NOTE(review): the listing elides several lines of this function (the body of
# the None branch and the trailing strip/return are not visible); the comments
# below annotate only the visible statements.
226 if html is None: # Convenience for sanitizing descriptions etc.
# Collapse literal newlines so that only the markers handled below produce
# line breaks in the output.
230 html = html.replace('\n', ' ')
# <br> (any spacing / self-closing form) and </p><p> boundaries become real newlines.
231 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
232 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
# Strip every remaining tag; non-greedy so adjacent tags are removed one by one.
234 html = re.sub('<.*?>', '', html)
235 # Replace html entities
236 html = unescapeHTML(html)
240 def sanitize_open(filename, open_mode):
241 """Try to open the given filename, and slightly tweak it if this fails.
243 Attempts to open the given filename. If this fails, it tries to change
244 the filename slightly, step by step, until it's either able to open it
245 or it fails and raises a final exception, like the standard open()
248 It returns the tuple (stream, definitive_file_name).
252 if sys.platform == 'win32':
254 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
255 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
256 stream = open(encodeFilename(filename), open_mode)
257 return (stream, filename)
258 except (IOError, OSError) as err:
259 if err.errno in (errno.EACCES,):
262 # In case of error, try to remove win32 forbidden chars
263 alt_filename = sanitize_path(filename)
264 if alt_filename == filename:
267 # An exception here should be caught in the caller
268 stream = open(encodeFilename(alt_filename), open_mode)
269 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the POSIX timestamp as an int/float, or None when the string
    cannot be parsed. (The visible fragment lacked both the None
    initialization and the return statement, so it always returned None.)
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        # mktime_tz honours the parsed UTC offset, so the result is
        # independent of the local timezone.
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
281 def sanitize_filename(s, restricted=False, is_id=False):
282 """Sanitizes a string so it could be used as part of a filename.
283 If restricted is set, use a stricter subset of allowed characters.
284 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
286 def replace_insane(char):
287 if char == '?' or ord(char) < 32 or ord(char) == 127:
290 return '' if restricted else '\''
292 return '_-' if restricted else ' -'
293 elif char in '\\/|*<>':
295 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
297 if restricted and ord(char) > 127:
302 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
303 result = ''.join(map(replace_insane, s))
305 while '__' in result:
306 result = result.replace('__', '_')
307 result = result.strip('_')
308 # Common case of "Foreign band name - English song title"
309 if restricted and result.startswith('-_'):
311 if result.startswith('-'):
312 result = '_' + result[len('-'):]
313 result = result.lstrip('.')
319 def sanitize_path(s):
320 """Sanitizes and normalizes path on Windows"""
321 if sys.platform != 'win32':
323 drive_or_unc, _ = os.path.splitdrive(s)
324 if sys.version_info < (2, 7) and not drive_or_unc:
325 drive_or_unc, _ = os.path.splitunc(s)
326 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
330 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
331 for path_part in norm_path]
333 sanitized_path.insert(0, drive_or_unc + os.path.sep)
334 return os.path.join(*sanitized_path)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Preserves first-seen order, unlike set(); the listing elided the body,
    # restored here. O(n^2) membership test is acceptable for the small
    # inputs (format lists, URL lists) this is used on.
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
346 def _htmlentity_transform(entity):
347 """Transforms an HTML entity to a character."""
348 # Known non-numeric HTML entity
349 if entity in compat_html_entities.name2codepoint:
350 return compat_chr(compat_html_entities.name2codepoint[entity])
352 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
354 numstr = mobj.group(1)
355 if numstr.startswith('x'):
357 numstr = '0%s' % numstr
360 return compat_chr(int(numstr, base))
362 # Unknown entity in name, return its literal representation
363 return ('&%s;' % entity)
369 assert type(s) == compat_str
372 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
375 def get_subprocess_encoding():
376 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
377 # For subprocess calls, encode with locale encoding
378 # Refer to http://stackoverflow.com/a/9951851/35070
379 encoding = preferredencoding()
381 encoding = sys.getfilesystemencoding()
387 def encodeFilename(s, for_subprocess=False):
389 @param s The name of the file
392 assert type(s) == compat_str
394 # Python 3 has a Unicode API
395 if sys.version_info >= (3, 0):
398 # Pass '' directly to use Unicode APIs on Windows 2000 and up
399 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
400 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
401 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
404 return s.encode(get_subprocess_encoding(), 'ignore')
407 def decodeFilename(b, for_subprocess=False):
409 if sys.version_info >= (3, 0):
412 if not isinstance(b, bytes):
415 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument like a filename destined for a subprocess.

    Accepts legacy byte strings from old call sites and decodes them as
    ASCII before encoding for the subprocess.
    """
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings; decode so encodeFilename
        # always receives text. (A stricter assert is deferred until all
        # post processors are fixed.)
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    """Inverse of encodeArgument: decode a subprocess argument to text."""
    return decodeFilename(b, for_subprocess=True)
def decodeOption(optval):
    """Decode a command-line option value to text using the locale encoding.

    None passes through unchanged; anything else must end up a compat_str.
    (The None guard and the final return were elided in the listing; without
    them the function crashed on None and returned nothing.)
    """
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or plain seconds.

    The branch conditions and the plain-seconds fallback were elided in the
    listing (the two format expressions were unreachable as shown).
    """
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
450 def make_HTTPS_handler(params, **kwargs):
451 opts_no_check_certificate = params.get('nocheckcertificate', False)
452 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
453 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
454 if opts_no_check_certificate:
455 context.check_hostname = False
456 context.verify_mode = ssl.CERT_NONE
458 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
461 # (create_default_context present but HTTPSHandler has no context=)
464 if sys.version_info < (3, 2):
465 return YoutubeDLHTTPSHandler(params, **kwargs)
467 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
468 context.verify_mode = (ssl.CERT_NONE
469 if opts_no_check_certificate
470 else ssl.CERT_REQUIRED)
471 context.set_default_verify_paths()
472 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Return the standard footer asking users to file a bug report.

    Chooses the update hint based on whether this build can self-update.
    (The `else:` line and the final return were elided in the listing.)
    """
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg
486 class ExtractorError(Exception):
487 """Error during info extraction."""
# NOTE(review): the listing elides several lines of __init__ (the expected/
# network-error adjustment, the cause/traceback attribute assignments);
# comments below cover only the visible statements.
489 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
490 """ tb, if given, is the original traceback (so that it can be printed out).
491 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
# Network failures are treated as "expected": not a youtube-dl bug.
494 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
# Prefix the video id so the user can tell which download failed.
496 if video_id is not None:
497 msg = video_id + ': ' + msg
499 msg += ' (caused by %r)' % cause
# Unexpected errors get the bug-report footer appended.
501 msg += bug_reports_message()
502 super(ExtractorError, self).__init__(msg)
505 self.exc_info = sys.exc_info() # preserve original exception
507 self.video_id = video_id
509 def format_traceback(self):
# Render the stored traceback, if any, for --verbose output.
510 if self.traceback is None:
# NOTE(review): the `return ''` for the None case is elided in the listing.
512 return ''.join(traceback.format_tb(self.traceback))
515 class UnsupportedError(ExtractorError):
# Raised by the generic extractor when no extractor can handle the URL;
# marked expected=True so no bug-report footer is appended.
516 def __init__(self, url):
517 super(UnsupportedError, self).__init__(
518 'Unsupported URL: %s' % url, expected=True)
522 class RegexNotFoundError(ExtractorError):
523 """Error when a regex didn't match"""
# Behaves exactly like ExtractorError; the subclass only narrows the type
# so callers can catch regex-search failures specifically.
527 class DownloadError(Exception):
528 """Download Error exception.
530 This exception may be thrown by FileDownloader objects if they are not
531 configured to continue on errors. They will contain the appropriate
535 def __init__(self, msg, exc_info=None):
536 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
537 super(DownloadError, self).__init__(msg)
# Kept so callers can inspect/re-raise the underlying error.
538 self.exc_info = exc_info
541 class SameFileError(Exception):
542 """Same File exception.
544 This exception will be thrown by FileDownloader objects if they detect
545 multiple files would have to be downloaded to the same file on disk.
550 class PostProcessingError(Exception):
551 """Post Processing exception.
553 This exception may be raised by PostProcessor's .run() method to
554 indicate an error in the postprocessing task.
557 def __init__(self, msg):
561 class MaxDownloadsReached(Exception):
562 """ --max-downloads limit has been reached. """
566 class UnavailableVideoError(Exception):
567 """Unavailable Format exception.
569 This exception will be thrown when a video is requested
570 in a format that is not available for that video.
575 class ContentTooShortError(Exception):
576 """Content Too Short exception.
578 This exception may be raised by FileDownloader objects when a file they
579 download is too small for what the server announced first, indicating
580 the connection was probably interrupted.
583 def __init__(self, downloaded, expected):
# downloaded / expected: sizes of the partial download vs the announced
# Content-Length — presumably both in bytes (TODO confirm against callers).
585 self.downloaded = downloaded
586 self.expected = expected
589 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
590 hc = http_class(*args, **kwargs)
591 source_address = ydl_handler._params.get('source_address')
592 if source_address is not None:
593 sa = (source_address, 0)
594 if hasattr(hc, 'source_address'): # Python 2.7+
595 hc.source_address = sa
597 def _hc_connect(self, *args, **kwargs):
598 sock = compat_socket_create_connection(
599 (self.host, self.port), self.timeout, sa)
601 self.sock = ssl.wrap_socket(
602 sock, self.key_file, self.cert_file,
603 ssl_version=ssl.PROTOCOL_TLSv1)
606 hc.connect = functools.partial(_hc_connect, hc)
611 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
612 """Handler for HTTP requests and responses.
614 This class, when installed with an OpenerDirector, automatically adds
615 the standard headers to every HTTP request and handles gzipped and
616 deflated responses from web servers. If compression is to be avoided in
617 a particular request, the original request in the program code only has
618 to include the HTTP header "Youtubedl-No-Compression", which will be
619 removed before making the real request.
621 Part of this code was copied from:
623 http://techknack.net/python-urllib2-handlers/
625 Andrew Rowls, the author of that code, agreed to release it to the
629 def __init__(self, params, *args, **kwargs):
630 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
631 self._params = params
633 def http_open(self, req):
634 return self.do_open(functools.partial(
635 _create_http_connection, self, compat_http_client.HTTPConnection, False),
641 return zlib.decompress(data, -zlib.MAX_WBITS)
643 return zlib.decompress(data)
646 def addinfourl_wrapper(stream, headers, url, code):
647 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
648 return compat_urllib_request.addinfourl(stream, headers, url, code)
649 ret = compat_urllib_request.addinfourl(stream, headers, url)
653 def http_request(self, req):
654 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
655 # always respected by websites, some tend to give out URLs with non percent-encoded
656 # non-ASCII characters (see telemb.py, ard.py [#3412])
657 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
658 # To work around aforementioned issue we will replace request's original URL with
659 # percent-encoded one
660 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
661 # the code of this workaround has been moved here from YoutubeDL.urlopen()
662 url = req.get_full_url()
663 url_escaped = escape_url(url)
665 # Substitute URL if any change after escaping
666 if url != url_escaped:
667 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
669 url_escaped, data=req.data, headers=req.headers,
670 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
671 new_req.timeout = req.timeout
674 for h, v in std_headers.items():
675 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
676 # The dict keys are capitalized because of this bug by urllib
677 if h.capitalize() not in req.headers:
679 if 'Youtubedl-no-compression' in req.headers:
680 if 'Accept-encoding' in req.headers:
681 del req.headers['Accept-encoding']
682 del req.headers['Youtubedl-no-compression']
684 if sys.version_info < (2, 7) and '#' in req.get_full_url():
685 # Python 2.6 is brain-dead when it comes to fragments
686 req._Request__original = req._Request__original.partition('#')[0]
687 req._Request__r_type = req._Request__r_type.partition('#')[0]
691 def http_response(self, req, resp):
694 if resp.headers.get('Content-encoding', '') == 'gzip':
695 content = resp.read()
696 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
698 uncompressed = io.BytesIO(gz.read())
699 except IOError as original_ioerror:
700 # There may be junk add the end of the file
701 # See http://stackoverflow.com/q/4928560/35070 for details
702 for i in range(1, 1024):
704 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
705 uncompressed = io.BytesIO(gz.read())
710 raise original_ioerror
711 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
712 resp.msg = old_resp.msg
714 if resp.headers.get('Content-encoding', '') == 'deflate':
715 gz = io.BytesIO(self.deflate(resp.read()))
716 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
717 resp.msg = old_resp.msg
720 https_request = http_request
721 https_response = http_response
724 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
725 def __init__(self, params, https_conn_class=None, *args, **kwargs):
726 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
727 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
728 self._params = params
730 def https_open(self, req):
732 if hasattr(self, '_context'): # python > 2.6
733 kwargs['context'] = self._context
734 if hasattr(self, '_check_hostname'): # python 3.x
735 kwargs['check_hostname'] = self._check_hostname
736 return self.do_open(functools.partial(
737 _create_http_connection, self, self._https_conn_class, True),
741 def parse_iso8601(date_str, delimiter='T', timezone=None):
742 """ Return a UNIX timestamp from the given date """
749 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
752 timezone = datetime.timedelta()
754 date_str = date_str[:-len(m.group(0))]
755 if not m.group('sign'):
756 timezone = datetime.timedelta()
758 sign = 1 if m.group('sign') == '+' else -1
759 timezone = datetime.timedelta(
760 hours=sign * int(m.group('hours')),
761 minutes=sign * int(m.group('minutes')))
762 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
763 dt = datetime.datetime.strptime(date_str, date_format) - timezone
764 return calendar.timegm(dt.timetuple())
767 def unified_strdate(date_str, day_first=True):
768 """Return a string with the date in the format YYYYMMDD"""
774 date_str = date_str.replace(',', ' ')
775 # %z (UTC offset) is only supported in python>=3.2
776 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
777 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
778 # Remove AM/PM + timezone
779 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
781 format_expressions = [
786 '%b %dst %Y %I:%M%p',
787 '%b %dnd %Y %I:%M%p',
788 '%b %dth %Y %I:%M%p',
794 '%Y-%m-%d %H:%M:%S.%f',
797 '%Y-%m-%dT%H:%M:%SZ',
798 '%Y-%m-%dT%H:%M:%S.%fZ',
799 '%Y-%m-%dT%H:%M:%S.%f0Z',
801 '%Y-%m-%dT%H:%M:%S.%f',
805 format_expressions.extend([
813 format_expressions.extend([
820 for expression in format_expressions:
822 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
825 if upload_date is None:
826 timetuple = email.utils.parsedate_tz(date_str)
828 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
832 def determine_ext(url, default_ext='unknown_video'):
835 guess = url.partition('?')[0].rpartition('.')[2]
836 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle filename: media stem + '.<lang>.<format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
846 def date_from_str(date_str):
848 Return a datetime object from a string in the format YYYYMMDD or
849 (now|today)[+-][0-9](day|week|month|year)(s)?"""
850 today = datetime.date.today()
851 if date_str in ('now', 'today'):
853 if date_str == 'yesterday':
854 return today - datetime.timedelta(days=1)
855 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
856 if match is not None:
857 sign = match.group('sign')
858 time = int(match.group('time'))
861 unit = match.group('unit')
862 # A bad aproximation?
870 delta = datetime.timedelta(**{unit: time})
872 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that do not look like YYYYMMDD are returned unchanged (the
    fallthrough return was elided in the listing, making the function
    return None for them).
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
885 class DateRange(object):
886 """Represents a time interval between two dates"""
888 def __init__(self, start=None, end=None):
889 """start and end must be strings in the format accepted by date"""
890 if start is not None:
891 self.start = date_from_str(start)
893 self.start = datetime.datetime.min.date()
895 self.end = date_from_str(end)
897 self.end = datetime.datetime.max.date()
898 if self.start > self.end:
899 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
903 """Returns a range that only contains the given day"""
906 def __contains__(self, date):
907 """Check if the date is in the range"""
908 if not isinstance(date, datetime.date):
909 date = date_from_str(date)
910 return self.start <= date <= self.end
913 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
917 """ Returns the platform name as a compat_str """
918 res = platform.platform()
919 if isinstance(res, bytes):
920 res = res.decode(preferredencoding())
922 assert isinstance(res, compat_str)
926 def _windows_write_string(s, out):
927 """ Returns True if the string was written using special methods,
928 False if it has yet to be written out."""
929 # Adapted from http://stackoverflow.com/a/3259271/35070
932 import ctypes.wintypes
940 fileno = out.fileno()
941 except AttributeError:
942 # If the output stream doesn't have a fileno, it's virtual
944 except io.UnsupportedOperation:
945 # Some strange Windows pseudo files?
947 if fileno not in WIN_OUTPUT_IDS:
950 GetStdHandle = ctypes.WINFUNCTYPE(
951 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
952 (b"GetStdHandle", ctypes.windll.kernel32))
953 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
955 WriteConsoleW = ctypes.WINFUNCTYPE(
956 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
957 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
958 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
959 written = ctypes.wintypes.DWORD(0)
961 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
962 FILE_TYPE_CHAR = 0x0002
963 FILE_TYPE_REMOTE = 0x8000
964 GetConsoleMode = ctypes.WINFUNCTYPE(
965 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
966 ctypes.POINTER(ctypes.wintypes.DWORD))(
967 (b"GetConsoleMode", ctypes.windll.kernel32))
968 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
970 def not_a_console(handle):
971 if handle == INVALID_HANDLE_VALUE or handle is None:
973 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
974 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
979 def next_nonbmp_pos(s):
981 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
982 except StopIteration:
986 count = min(next_nonbmp_pos(s), 1024)
989 h, s, count if count else 2, ctypes.byref(written), None)
991 raise OSError('Failed to write string')
992 if not count: # We just wrote a non-BMP character
993 assert written.value == 2
996 assert written.value > 0
997 s = s[written.value:]
1001 def write_string(s, out=None, encoding=None):
1004 assert type(s) == compat_str
1006 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1007 if _windows_write_string(s, out):
1010 if ('b' in getattr(out, 'mode', '') or
1011 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1012 byt = s.encode(encoding or preferredencoding(), 'ignore')
1014 elif hasattr(out, 'buffer'):
1015 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1016 byt = s.encode(enc, 'ignore')
1017 out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a bytes (py3) or str (py2) sequence to a list of byte values.

    Empty input yields [] — the guard was elided in the listing, so the
    visible code would IndexError on b''.
    """
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: pack a list of byte values into bytes.

    Empty input yields b'' — struct_pack('0B') handling differs across
    versions, so the guard (elided in the listing) is restored.
    """
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)
1038 # Cross-platform file locking
1039 if sys.platform == 'win32':
1040 import ctypes.wintypes
1043 class OVERLAPPED(ctypes.Structure):
1045 ('Internal', ctypes.wintypes.LPVOID),
1046 ('InternalHigh', ctypes.wintypes.LPVOID),
1047 ('Offset', ctypes.wintypes.DWORD),
1048 ('OffsetHigh', ctypes.wintypes.DWORD),
1049 ('hEvent', ctypes.wintypes.HANDLE),
1052 kernel32 = ctypes.windll.kernel32
1053 LockFileEx = kernel32.LockFileEx
1054 LockFileEx.argtypes = [
1055 ctypes.wintypes.HANDLE, # hFile
1056 ctypes.wintypes.DWORD, # dwFlags
1057 ctypes.wintypes.DWORD, # dwReserved
1058 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1059 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1060 ctypes.POINTER(OVERLAPPED) # Overlapped
1062 LockFileEx.restype = ctypes.wintypes.BOOL
1063 UnlockFileEx = kernel32.UnlockFileEx
1064 UnlockFileEx.argtypes = [
1065 ctypes.wintypes.HANDLE, # hFile
1066 ctypes.wintypes.DWORD, # dwReserved
1067 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1068 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1069 ctypes.POINTER(OVERLAPPED) # Overlapped
1071 UnlockFileEx.restype = ctypes.wintypes.BOOL
1072 whole_low = 0xffffffff
1073 whole_high = 0x7fffffff
1075 def _lock_file(f, exclusive):
1076 overlapped = OVERLAPPED()
1077 overlapped.Offset = 0
1078 overlapped.OffsetHigh = 0
1079 overlapped.hEvent = 0
1080 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1081 handle = msvcrt.get_osfhandle(f.fileno())
1082 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1083 whole_low, whole_high, f._lock_file_overlapped_p):
1084 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1086 def _unlock_file(f):
1087 assert f._lock_file_overlapped_p
1088 handle = msvcrt.get_osfhandle(f.fileno())
1089 if not UnlockFileEx(handle, 0,
1090 whole_low, whole_high, f._lock_file_overlapped_p):
1091 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
def _lock_file(f, exclusive):
    """Take a POSIX advisory lock on f: exclusive for writers, shared for readers."""
    if exclusive:
        op = fcntl.LOCK_EX
    else:
        op = fcntl.LOCK_SH
    fcntl.flock(f, op)
def _unlock_file(f):
    """Release any advisory lock held on the open file f."""
    fcntl.flock(f, fcntl.LOCK_UN)
# Context manager wrapping io.open with a cross-platform advisory file lock.
# NOTE(review): the listing elides some lines (e.g. the attribute assignment
# __enter__ relies on for self.mode, and the close calls in __enter__/__exit__
# error paths); comments cover only the visible statements.
1103 class locked_file(object):
1104 def __init__(self, filename, mode, encoding=None):
# Only whole-file read/append/write modes are supported.
1105 assert mode in ['r', 'a', 'w']
1106 self.f = io.open(filename, mode, encoding=encoding)
1109 def __enter__(self):
# Readers take a shared lock; any writing mode takes an exclusive one.
1110 exclusive = self.mode != 'r'
1112 _lock_file(self.f, exclusive)
1118 def __exit__(self, etype, value, traceback):
1120 _unlock_file(self.f)
# Thin pass-throughs to the underlying file object.
1127 def write(self, *args):
1128 return self.f.write(*args)
1130 def read(self, *args):
1131 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when Python reports None."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
1139 def shell_quote(args):
1141 encoding = get_filesystem_encoding()
1143 if isinstance(a, bytes):
1144 # We may get a filename encoded with 'encodeFilename'
1145 a = a.decode(encoding)
1146 quoted_args.append(pipes.quote(a))
1147 return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload is JSON-encoded, then urlencoded into the fragment so it
    # survives as an opaque suffix; unsmuggle_url reverses this.
    smuggled = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, smuggled)
def unsmuggle_url(smug_url, default=None):
    """Extract data hidden in a URL fragment by smuggle_url.

    Returns (url, data); (url, default) when the URL carries no smuggled
    payload. (The final return was elided in the listing, so the smuggled
    branch returned None.)
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
1167 def format_bytes(bytes):
1170 if type(bytes) is str:
1171 bytes = float(bytes)
1175 exponent = int(math.log(bytes, 1024.0))
1176 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1177 converted = float(bytes) / float(1024 ** exponent)
1178 return '%.2f%s' % (converted, suffix)
1181 def parse_filesize(s):
1185 # The lower-case forms are of course incorrect and inofficial,
1186 # but we support those too
1224 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1226 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1230 num_str = m.group('num').replace(',', '.')
1231 mult = _UNIT_TABLE[m.group('unit')]
1232 return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    # Unknown names yield None rather than raising; the try/except was
    # elided in the listing.
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation (first three letters, e.g. 'Jan') """
    # Unknown abbreviations yield None rather than raising; the try/except
    # was elided in the listing.
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Negative lookahead leaves well-formed entities (&amp; &lt; &gt; &apos;
    # &quot; and numeric/hex character references) untouched.
    # (The replacement argument and return were elided in the listing; the
    # docstring's '&amp;' had also been garbled to a bare '&'.)
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
1262 def setproctitle(title):
1263 assert isinstance(title, compat_str)
1265 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1268 title_bytes = title.encode('utf-8')
1269 buf = ctypes.create_string_buffer(len(title_bytes))
1270 buf.value = title_bytes
1272 libc.prctl(15, buf, 0, 0, 0)
1273 except AttributeError:
1274 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the prefix `start`; s unchanged when it is absent.

    The fallthrough `return s` was elided in the listing, so non-matching
    input returned None.
    """
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s without the suffix `end`; s unchanged when it is absent.

    Guards against end == '' (s.endswith('') is always True and s[:-0]
    would wipe the whole string); non-matching input returns s unchanged.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path component of a URL (query and fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A Request subclass that issues an HTTP HEAD instead of GET.

    The `return 'HEAD'` body was elided in the listing, which would have
    made get_method return None.
    """
    def get_method(self):
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int, scaled by invscale/scale; `default` when v is None/''.

    get_attr, if given, first replaces v with getattr(v, get_attr, None).
    (The get_attr and empty-string guard lines were elided in the listing.)
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce v to compat_str, or return `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Strips thousands separators (',' and '.') and '+' before converting.
    # (The None guard and final return were elided in the listing.)
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float scaled by invscale/scale; `default` when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a human-readable duration ("3 min", "1:02:03", "4.5s", ...) into seconds.

    NOTE(review): this copy looks truncated — the verbose-regex literal's
    opening/closing, the re.match call, the `res = 0` initialisation and the
    final return are not visible here. Verify against the full source.
    """
    if not isinstance(s, compat_basestring):
    (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
    (?P<only_hours>[0-9.]+)\s*(?:hours?)|
    \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
    (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # "X minutes" / "X hours" forms return immediately, scaled to seconds.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    res += int(m.group('days')) * 24 * 60 * 60
    res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the real extension: a.mp4 -> a.ext.mp4.

    If expected_real_ext is given and the actual extension differs, `ext`
    is appended after the whole filename instead (a.mp4 -> a.mp4.ext).
    """
    name, real_ext = os.path.splitext(filename)
    # fix: the `return (` line was missing, making this a no-op expression
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's extension with `ext` (a.mp4 -> a.ext).

    If expected_real_ext is given and the actual extension differs, `ext`
    is appended to the untouched filename instead (a.mp4 -> a.mp4.ext).
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)  # fix: this final argument was missing, leaving the call unterminated
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        # `args` is only read, never mutated, so the mutable default is safe here.
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False  # not found / not executable
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False  # executable missing or not runnable
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a tool's --version output.

    Falls back to `unrecognized` when no version-like token is found.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    """Base class for lazily paged sequences; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily via pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the elements in [start, end) by querying only the pages needed."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset inside this page where the requested slice begins.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset inside this page where the requested slice ends.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the elements in [start, end) across the known pages."""
        res = []
        start_page = start // self._pagesize
        end_page = min(
            self._pagecount if end is None else (end // self._pagesize + 1),
            self._pagecount)
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its head trimmed.
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences in s; other text is untouched."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences in s; other text is untouched."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() needs a byte string; Python 3 accepts str directly.
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_bytes:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component separately, then reassemble the URL.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Probe whether struct accepts a text format spec; define wrappers otherwise.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a file-like object, one per line.

    Decodes bytes lines as UTF-8, strips a UTF-8 BOM and whitespace, and
    drops comment lines starting with '#', ';' or ']'. Closes batch_fd.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter was added in Python 2.7 / ElementTree 1.3; fall back to
# findall on ancient interpreters.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse an XML string into an ElementTree element, ignoring doctypes.

    NOTE(review): the enclosing `def parse_xml(s):` and trailing return were
    not visible in this copy; restored from the surrounding structure.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int.

    Falls back to the US_RATINGS table for named ratings; None in, None out.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback(...);) and return the inner payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into (mostly) valid JSON.

    Quotes bare identifiers, converts single-quoted strings to double-quoted
    ones, and removes trailing commas before ']' / '}'.
    """
    def fix_kv(m):
        v = m.group(0)
        # JSON keywords pass through untouched.
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            v = v[1:-1]
            # Re-escape for double-quoted context.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas, which JSON forbids.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1  # unknown quality sorts below all known ones
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the result, including the ellipses, fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when `version` is older than `limit`.

    Empty or unparsable versions yield `not assume_new` (i.e. by default
    they are treated as new enough).
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    loader = globals().get('__loader__')
    # Updatable when running from a zip bundle or a frozen executable.
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def mimetype2ext(mt):
    """Map a MIME type to a file extension; defaults to the subtype itself."""
    _, _, res = mt.rpartition('/')

    # NOTE(review): only the 'x-mp4-fragmented' mapping is visible in this
    # copy; additional subtype mappings may have been elided — verify.
    return {
        'x-mp4-fragmented': 'mp4',
    }.get(res, res)
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for an opened URL handle.

    Prefers the filename in Content-Disposition; falls back to mapping the
    Content-Type MIME type.
    """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False  # fix: this early return was missing
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Known byte-order marks, longest first so prefixes don't shadow them.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for a format dict.

    Uses an explicit 'protocol' key when present, then the URL scheme
    prefix, then the file extension (m3u8/f4m), then the parsed scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column = widest cell in that column.
    widths = [max(len(compat_str(cell)) for cell in col) for col in zip(*rows)]
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    lines = []
    for row in rows:
        lines.append(fmt % tuple(row))
    return '\n'.join(lines)
1775 def _match_one(filter_part, dct):
1776 COMPARISON_OPERATORS = {
1784 operator_rex = re.compile(r'''(?x)\s*
1786 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1788 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1789 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1792 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1793 m = operator_rex.search(filter_part)
1795 op = COMPARISON_OPERATORS[m.group('op')]
1796 if m.group('strval') is not None:
1797 if m.group('op') not in ('=', '!='):
1799 'Operator %s does not support string values!' % m.group('op'))
1800 comparison_value = m.group('strval')
1803 comparison_value = int(m.group('intval'))
1805 comparison_value = parse_filesize(m.group('intval'))
1806 if comparison_value is None:
1807 comparison_value = parse_filesize(m.group('intval') + 'B')
1808 if comparison_value is None:
1810 'Invalid integer value %r in filter part %r' % (
1811 m.group('intval'), filter_part))
1812 actual_value = dct.get(m.group('key'))
1813 if actual_value is None:
1814 return m.group('none_inclusive')
1815 return op(actual_value, comparison_value)
1818 '': lambda v: v is not None,
1819 '!': lambda v: v is None,
1821 operator_rex = re.compile(r'''(?x)\s*
1822 (?P<op>%s)\s*(?P<key>[a-z_]+)
1824 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1825 m = operator_rex.search(filter_part)
1827 op = UNARY_OPERATORS[m.group('op')]
1828 actual_value = dct.get(m.group('key'))
1829 return op(actual_value)
1831 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # '&' joins sub-filters; all of them must pass.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: None when info_dict passes the filter,
    otherwise a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.5s' or 'HH:MM:SS[.f]') to seconds.

    Returns None for empty/unrecognized input.
    """
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def srt_subtitles_timecode(seconds):
    """Format a (non-negative) duration in seconds as an SRT timecode HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle XML (a string) into SRT-formatted text.

    NOTE(review): several scaffolding lines (the ns_map closer, the child
    loop header, list initialisation and returns) are not visible in this
    copy — verify against the full source before relying on this reading.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',

    def parse_node(node):
        # Treat missing .text as '' so the concatenations never see None.
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))

    # Cues may live under either TTML namespace or be un-namespaced.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
            # No explicit end: derive it from begin + dur.
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
class ISO639Utils(object):
    """Convert between ISO 639-1 (2-letter) and ISO 639-2/T (3-letter) codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the `_lang_map` table appears to have been elided from
    # this copy of the file — restore it from the source dataset above.

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
class ISO3166Utils(object):
    """Map ISO 3166-1 alpha-2 country codes to their full country names."""
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): this copy of the table appears abridged; verify the
    # complete mapping against the source dataset above.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',  # fixed mojibake ('Ã…land' -> 'Åland')
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler honoring a per-request 'Ytdl-request-proxy' header.

    The header value overrides the handler-level proxy; the sentinel value
    '__noproxy__' disables proxying entirely for that request.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # fix: the override was read and deleted but never applied
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)