2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 if type(c) is int: return c
199 # This is not clearly defined otherwise
200 compiled_regex_type = type(re.compile(''))
203 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
204 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
205 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
206 'Accept-Encoding': 'gzip, deflate',
207 'Accept-Language': 'en-us,en;q=0.5',
210 def preferredencoding():
211 """Get preferred encoding.
213 Returns the best encoding scheme for the system, based on
214 locale.getpreferredencoding() and some further tweaks.
217 pref = locale.getpreferredencoding()
224 if sys.version_info < (3,0):
226 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
229 assert type(s) == type(u'')
232 # In Python 2.x, json.dump expects a bytestream.
233 # In Python 3.x, it writes to a character stream
234 if sys.version_info < (3,0):
235 def write_json_file(obj, fn):
236 with open(fn, 'wb') as f:
239 def write_json_file(obj, fn):
240 with open(fn, 'w', encoding='utf-8') as f:
243 if sys.version_info >= (2,7):
244 def find_xpath_attr(node, xpath, key, val):
245 """ Find the xpath xpath[@key=val] """
246 assert re.match(r'^[a-zA-Z-]+$', key)
247 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
248 expr = xpath + u"[@%s='%s']" % (key, val)
249 return node.find(expr)
251 def find_xpath_attr(node, xpath, key, val):
252 for f in node.findall(xpath):
253 if f.attrib.get(key) == val:
257 # On python2.6 the xml.etree.ElementTree.Element methods don't support
258 # the namespace parameter
259 def xpath_with_ns(path, ns_map):
260 components = [c.split(':') for c in path.split('/')]
264 replaced.append(c[0])
267 replaced.append('{%s}%s' % (ns_map[ns], tag))
268 return '/'.join(replaced)
270 def htmlentity_transform(matchobj):
271 """Transforms an HTML entity to a character.
273 This function receives a match object and is intended to be used with
274 the re.sub() function.
276 entity = matchobj.group(1)
278 # Known non-numeric HTML entity
279 if entity in compat_html_entities.name2codepoint:
280 return compat_chr(compat_html_entities.name2codepoint[entity])
282 mobj = re.match(u'(?u)#(x?\\d+)', entity)
284 numstr = mobj.group(1)
285 if numstr.startswith(u'x'):
287 numstr = u'0%s' % numstr
290 return compat_chr(int(numstr, base))
292 # Unknown entity in name, return its literal representation
293 return (u'&%s;' % entity)
295 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
296 class BaseHTMLParser(compat_html_parser.HTMLParser):
298 compat_html_parser.HTMLParser.__init__(self)
301 def loads(self, html):
306 class AttrParser(BaseHTMLParser):
307 """Modified HTMLParser that isolates a tag with the specified attribute"""
308 def __init__(self, attribute, value):
309 self.attribute = attribute
314 self.watch_startpos = False
316 BaseHTMLParser.__init__(self)
318 def error(self, message):
319 if self.error_count > 10 or self.started:
320 raise compat_html_parser.HTMLParseError(message, self.getpos())
321 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
322 self.error_count += 1
325 def handle_starttag(self, tag, attrs):
328 self.find_startpos(None)
329 if self.attribute in attrs and attrs[self.attribute] == self.value:
332 self.watch_startpos = True
334 if not tag in self.depth: self.depth[tag] = 0
337 def handle_endtag(self, tag):
339 if tag in self.depth: self.depth[tag] -= 1
340 if self.depth[self.result[0]] == 0:
342 self.result.append(self.getpos())
344 def find_startpos(self, x):
345 """Needed to put the start position of the result (self.result[1])
346 after the opening tag with the requested id"""
347 if self.watch_startpos:
348 self.watch_startpos = False
349 self.result.append(self.getpos())
350 handle_entityref = handle_charref = handle_data = handle_comment = \
351 handle_decl = handle_pi = unknown_decl = find_startpos
353 def get_result(self):
354 if self.result is None:
356 if len(self.result) != 3:
358 lines = self.html.split('\n')
359 lines = lines[self.result[1][0]-1:self.result[2][0]]
360 lines[0] = lines[0][self.result[1][1]:]
362 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
363 lines[-1] = lines[-1][:self.result[2][1]]
364 return '\n'.join(lines).strip()
365 # Hack for https://github.com/rg3/youtube-dl/issues/662
366 if sys.version_info < (2, 7, 3):
367 AttrParser.parse_endtag = (lambda self, i:
368 i + len("</scr'+'ipt>")
369 if self.rawdata[i:].startswith("</scr'+'ipt>")
370 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given ID in an HTML document."""
    # Thin convenience wrapper: an ID lookup is just an attribute lookup on "id".
    return get_element_by_attribute("id", id, html)
376 def get_element_by_attribute(attribute, value, html):
377 """Return the content of the tag with the specified attribute in the passed HTML document"""
378 parser = AttrParser(attribute, value)
381 except compat_html_parser.HTMLParseError:
383 return parser.get_result()
385 class MetaParser(BaseHTMLParser):
387 Modified HTMLParser that isolates a meta tag with the specified name
390 def __init__(self, name):
391 BaseHTMLParser.__init__(self)
396 def handle_starttag(self, tag, attrs):
400 if attrs.get('name') == self.name:
401 self.result = attrs.get('content')
403 def get_result(self):
406 def get_meta_content(name, html):
408 Return the content attribute from the meta tag with the given name attribute.
410 parser = MetaParser(name)
413 except compat_html_parser.HTMLParseError:
415 return parser.get_result()
418 def clean_html(html):
419 """Clean an HTML snippet into a readable string"""
421 html = html.replace('\n', ' ')
422 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
423 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
425 html = re.sub('<.*?>', '', html)
426 # Replace html entities
427 html = unescapeHTML(html)
431 def sanitize_open(filename, open_mode):
432 """Try to open the given filename, and slightly tweak it if this fails.
434 Attempts to open the given filename. If this fails, it tries to change
435 the filename slightly, step by step, until it's either able to open it
436 or it fails and raises a final exception, like the standard open()
439 It returns the tuple (stream, definitive_file_name).
443 if sys.platform == 'win32':
445 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
446 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
447 stream = open(encodeFilename(filename), open_mode)
448 return (stream, filename)
449 except (IOError, OSError) as err:
450 if err.errno in (errno.EACCES,):
453 # In case of error, try to remove win32 forbidden chars
454 alt_filename = os.path.join(
455 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
456 for path_part in os.path.split(filename)
458 if alt_filename == filename:
461 # An exception here should be caught in the caller
462 stream = open(encodeFilename(filename), open_mode)
463 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    # Fix: initialize the fallback so unparseable input yields None
    # instead of raising NameError on the final return.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
474 def sanitize_filename(s, restricted=False, is_id=False):
475 """Sanitizes a string so it could be used as part of a filename.
476 If restricted is set, use a stricter subset of allowed characters.
477 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
479 def replace_insane(char):
480 if char == '?' or ord(char) < 32 or ord(char) == 127:
483 return '' if restricted else '\''
485 return '_-' if restricted else ' -'
486 elif char in '\\/|*<>':
488 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
490 if restricted and ord(char) > 127:
494 result = u''.join(map(replace_insane, s))
496 while '__' in result:
497 result = result.replace('__', '_')
498 result = result.strip('_')
499 # Common case of "Foreign band name - English song title"
500 if restricted and result.startswith('-_'):
506 def orderedSet(iterable):
507 """ Remove all duplicates from the input iterable """
518 assert type(s) == compat_str
520 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
524 def encodeFilename(s, for_subprocess=False):
526 @param s The name of the file
529 assert type(s) == compat_str
531 # Python 3 has a Unicode API
532 if sys.version_info >= (3, 0):
535 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
536 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
537 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
538 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
539 if not for_subprocess:
542 # For subprocess calls, encode with locale encoding
543 # Refer to http://stackoverflow.com/a/9951851/35070
544 encoding = preferredencoding()
546 encoding = sys.getfilesystemencoding()
549 return s.encode(encoding, 'ignore')
552 def encodeArgument(s):
553 if not isinstance(s, compat_str):
554 # Legacy code that uses byte strings
555 # Uncomment the following line after fixing all post processors
556 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
557 s = s.decode('ascii')
558 return encodeFilename(s, True)
561 def decodeOption(optval):
564 if isinstance(optval, bytes):
565 optval = optval.decode(preferredencoding())
567 assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        # Durations of a minute or less are printed as a bare number.
        return '%d' % secs
579 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
580 if sys.version_info < (3, 2):
583 class HTTPSConnectionV3(httplib.HTTPSConnection):
584 def __init__(self, *args, **kwargs):
585 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
588 sock = socket.create_connection((self.host, self.port), self.timeout)
589 if getattr(self, '_tunnel_host', False):
593 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
595 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
597 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
598 def https_open(self, req):
599 return self.do_open(HTTPSConnectionV3, req)
600 return HTTPSHandlerV3(**kwargs)
602 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
603 context.verify_mode = (ssl.CERT_NONE
604 if opts_no_check_certificate
605 else ssl.CERT_REQUIRED)
606 context.set_default_verify_paths()
608 context.load_default_certs()
609 except AttributeError:
611 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
613 class ExtractorError(Exception):
614 """Error during info extraction."""
615 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
616 """ tb, if given, is the original traceback (so that it can be printed out).
617 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
620 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
622 if video_id is not None:
623 msg = video_id + ': ' + msg
625 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
626 super(ExtractorError, self).__init__(msg)
629 self.exc_info = sys.exc_info() # preserve original exception
631 self.video_id = video_id
633 def format_traceback(self):
634 if self.traceback is None:
636 return u''.join(traceback.format_tb(self.traceback))
639 class RegexNotFoundError(ExtractorError):
640 """Error when a regex didn't match"""
644 class DownloadError(Exception):
645 """Download Error exception.
647 This exception may be thrown by FileDownloader objects if they are not
648 configured to continue on errors. They will contain the appropriate
651 def __init__(self, msg, exc_info=None):
652 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
653 super(DownloadError, self).__init__(msg)
654 self.exc_info = exc_info
657 class SameFileError(Exception):
658 """Same File exception.
660 This exception will be thrown by FileDownloader objects if they detect
661 multiple files would have to be downloaded to the same file on disk.
666 class PostProcessingError(Exception):
667 """Post Processing exception.
669 This exception may be raised by PostProcessor's .run() method to
670 indicate an error in the postprocessing task.
672 def __init__(self, msg):
675 class MaxDownloadsReached(Exception):
676 """ --max-downloads limit has been reached. """
680 class UnavailableVideoError(Exception):
681 """Unavailable Format exception.
683 This exception will be thrown when a video is requested
684 in a format that is not available for that video.
689 class ContentTooShortError(Exception):
690 """Content Too Short exception.
692 This exception may be raised by FileDownloader objects when a file they
693 download is too small for what the server announced first, indicating
694 the connection was probably interrupted.
700 def __init__(self, downloaded, expected):
701 self.downloaded = downloaded
702 self.expected = expected
704 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
705 """Handler for HTTP requests and responses.
707 This class, when installed with an OpenerDirector, automatically adds
708 the standard headers to every HTTP request and handles gzipped and
709 deflated responses from web servers. If compression is to be avoided in
710 a particular request, the original request in the program code only has
711 to include the HTTP header "Youtubedl-No-Compression", which will be
712 removed before making the real request.
714 Part of this code was copied from:
716 http://techknack.net/python-urllib2-handlers/
718 Andrew Rowls, the author of that code, agreed to release it to the
725 return zlib.decompress(data, -zlib.MAX_WBITS)
727 return zlib.decompress(data)
730 def addinfourl_wrapper(stream, headers, url, code):
731 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
732 return compat_urllib_request.addinfourl(stream, headers, url, code)
733 ret = compat_urllib_request.addinfourl(stream, headers, url)
737 def http_request(self, req):
738 for h,v in std_headers.items():
742 if 'Youtubedl-no-compression' in req.headers:
743 if 'Accept-encoding' in req.headers:
744 del req.headers['Accept-encoding']
745 del req.headers['Youtubedl-no-compression']
746 if 'Youtubedl-user-agent' in req.headers:
747 if 'User-agent' in req.headers:
748 del req.headers['User-agent']
749 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
750 del req.headers['Youtubedl-user-agent']
755 def http_response(self, req, resp):
758 if resp.headers.get('Content-encoding', '') == 'gzip':
759 content = resp.read()
760 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
762 uncompressed = io.BytesIO(gz.read())
763 except IOError as original_ioerror:
764 # There may be junk add the end of the file
765 # See http://stackoverflow.com/q/4928560/35070 for details
766 for i in range(1, 1024):
768 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
769 uncompressed = io.BytesIO(gz.read())
774 raise original_ioerror
775 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
776 resp.msg = old_resp.msg
778 if resp.headers.get('Content-encoding', '') == 'deflate':
779 gz = io.BytesIO(self.deflate(resp.read()))
780 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
781 resp.msg = old_resp.msg
784 https_request = http_request
785 https_response = http_response
788 def parse_iso8601(date_str, delimiter='T'):
789 """ Return a UNIX timestamp from the given date """
795 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
798 timezone = datetime.timedelta()
800 date_str = date_str[:-len(m.group(0))]
801 if not m.group('sign'):
802 timezone = datetime.timedelta()
804 sign = 1 if m.group('sign') == '+' else -1
805 timezone = datetime.timedelta(
806 hours=sign * int(m.group('hours')),
807 minutes=sign * int(m.group('minutes')))
808 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
809 dt = datetime.datetime.strptime(date_str, date_format) - timezone
810 return calendar.timegm(dt.timetuple())
813 def unified_strdate(date_str):
814 """Return a string with the date in the format YYYYMMDD"""
821 date_str = date_str.replace(',', ' ')
822 # %z (UTC offset) is only supported in python>=3.2
823 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
824 format_expressions = [
829 '%b %dst %Y %I:%M%p',
830 '%b %dnd %Y %I:%M%p',
831 '%b %dth %Y %I:%M%p',
839 '%Y-%m-%dT%H:%M:%SZ',
840 '%Y-%m-%dT%H:%M:%S.%fZ',
841 '%Y-%m-%dT%H:%M:%S.%f0Z',
843 '%Y-%m-%dT%H:%M:%S.%f',
846 for expression in format_expressions:
848 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
851 if upload_date is None:
852 timetuple = email.utils.parsedate_tz(date_str)
854 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess a file extension from a URL, ignoring any query string.

    Returns default_ext when no plausible (alphanumeric) extension is found.
    """
    # Strip the query string, then take whatever follows the last dot.
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        # Fix: fall back to the default instead of returning None when the
        # candidate contains path separators or other non-alphanumerics.
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name <base>.<lang>.<format> for a media file."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
869 def date_from_str(date_str):
871 Return a datetime object from a string in the format YYYYMMDD or
872 (now|today)[+-][0-9](day|week|month|year)(s)?"""
873 today = datetime.date.today()
874 if date_str == 'now'or date_str == 'today':
876 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
877 if match is not None:
878 sign = match.group('sign')
879 time = int(match.group('time'))
882 unit = match.group('unit')
891 delta = datetime.timedelta(**{unit: time})
893 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Input that is not in 'YYYYMMDD' form is returned unchanged.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        # Fix: pass non-matching input through instead of returning None.
        return date_str
904 class DateRange(object):
905 """Represents a time interval between two dates"""
906 def __init__(self, start=None, end=None):
907 """start and end must be strings in the format accepted by date"""
908 if start is not None:
909 self.start = date_from_str(start)
911 self.start = datetime.datetime.min.date()
913 self.end = date_from_str(end)
915 self.end = datetime.datetime.max.date()
916 if self.start > self.end:
917 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
920 """Returns a range that only contains the given day"""
922 def __contains__(self, date):
923 """Check if the date is in the range"""
924 if not isinstance(date, datetime.date):
925 date = date_from_str(date)
926 return self.start <= date <= self.end
928 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
932 """ Returns the platform name as a compat_str """
933 res = platform.platform()
934 if isinstance(res, bytes):
935 res = res.decode(preferredencoding())
937 assert isinstance(res, compat_str)
941 def _windows_write_string(s, out):
942 """ Returns True if the string was written using special methods,
943 False if it has yet to be written out."""
944 # Adapted from http://stackoverflow.com/a/3259271/35070
947 import ctypes.wintypes
955 fileno = out.fileno()
956 except AttributeError:
957 # If the output stream doesn't have a fileno, it's virtual
959 if fileno not in WIN_OUTPUT_IDS:
962 GetStdHandle = ctypes.WINFUNCTYPE(
963 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
964 ("GetStdHandle", ctypes.windll.kernel32))
965 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
967 WriteConsoleW = ctypes.WINFUNCTYPE(
968 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
969 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
970 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
971 written = ctypes.wintypes.DWORD(0)
973 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
974 FILE_TYPE_CHAR = 0x0002
975 FILE_TYPE_REMOTE = 0x8000
976 GetConsoleMode = ctypes.WINFUNCTYPE(
977 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
978 ctypes.POINTER(ctypes.wintypes.DWORD))(
979 ("GetConsoleMode", ctypes.windll.kernel32))
980 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
982 def not_a_console(handle):
983 if handle == INVALID_HANDLE_VALUE or handle is None:
985 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
986 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
991 def next_nonbmp_pos(s):
993 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
994 except StopIteration:
998 count = min(next_nonbmp_pos(s), 1024)
1000 ret = WriteConsoleW(
1001 h, s, count if count else 2, ctypes.byref(written), None)
1003 raise OSError('Failed to write string')
1004 if not count: # We just wrote a non-BMP character
1005 assert written.value == 2
1008 assert written.value > 0
1009 s = s[written.value:]
1013 def write_string(s, out=None, encoding=None):
1016 assert type(s) == compat_str
1018 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1019 if _windows_write_string(s, out):
1022 if ('b' in getattr(out, 'mode', '') or
1023 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1024 byt = s.encode(encoding or preferredencoding(), 'ignore')
1026 elif hasattr(out, 'buffer'):
1027 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1028 byt = s.encode(enc, 'ignore')
1029 out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    # Fix: guard empty/None input before indexing bs[0].
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints already
        return list(bs)
    else:
        # Python 2: indexing a str yields one-character strings.
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Turn a list of integer byte values back into a byte string."""
    # Fix: handle the empty list explicitly so both branches stay simple.
    if not xs:
        return b''
    if isinstance(chr(0), bytes):  # Python 2: chr() yields a bytes object
        return ''.join([chr(x) for x in xs])
    else:
        return bytes(xs)
def get_cachedir(params=None):
    """Return the youtube-dl cache directory.

    Honors the 'cachedir' entry of the (optional) params dict; otherwise
    falls back to $XDG_CACHE_HOME/youtube-dl (default ~/.cache/youtube-dl).
    """
    # Fix: avoid a mutable default argument; None means "no overrides".
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1059 # Cross-platform file locking
1060 if sys.platform == 'win32':
1061 import ctypes.wintypes
1064 class OVERLAPPED(ctypes.Structure):
1066 ('Internal', ctypes.wintypes.LPVOID),
1067 ('InternalHigh', ctypes.wintypes.LPVOID),
1068 ('Offset', ctypes.wintypes.DWORD),
1069 ('OffsetHigh', ctypes.wintypes.DWORD),
1070 ('hEvent', ctypes.wintypes.HANDLE),
1073 kernel32 = ctypes.windll.kernel32
1074 LockFileEx = kernel32.LockFileEx
1075 LockFileEx.argtypes = [
1076 ctypes.wintypes.HANDLE, # hFile
1077 ctypes.wintypes.DWORD, # dwFlags
1078 ctypes.wintypes.DWORD, # dwReserved
1079 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1080 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1081 ctypes.POINTER(OVERLAPPED) # Overlapped
1083 LockFileEx.restype = ctypes.wintypes.BOOL
1084 UnlockFileEx = kernel32.UnlockFileEx
1085 UnlockFileEx.argtypes = [
1086 ctypes.wintypes.HANDLE, # hFile
1087 ctypes.wintypes.DWORD, # dwReserved
1088 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1089 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1090 ctypes.POINTER(OVERLAPPED) # Overlapped
1092 UnlockFileEx.restype = ctypes.wintypes.BOOL
1093 whole_low = 0xffffffff
1094 whole_high = 0x7fffffff
1096 def _lock_file(f, exclusive):
1097 overlapped = OVERLAPPED()
1098 overlapped.Offset = 0
1099 overlapped.OffsetHigh = 0
1100 overlapped.hEvent = 0
1101 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1102 handle = msvcrt.get_osfhandle(f.fileno())
1103 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1104 whole_low, whole_high, f._lock_file_overlapped_p):
1105 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1107 def _unlock_file(f):
1108 assert f._lock_file_overlapped_p
1109 handle = msvcrt.get_osfhandle(f.fileno())
1110 if not UnlockFileEx(handle, 0,
1111 whole_low, whole_high, f._lock_file_overlapped_p):
1112 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1117 def _lock_file(f, exclusive):
1118 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1120 def _unlock_file(f):
1121 fcntl.lockf(f, fcntl.LOCK_UN)
1124 class locked_file(object):
1125 def __init__(self, filename, mode, encoding=None):
1126 assert mode in ['r', 'a', 'w']
1127 self.f = io.open(filename, mode, encoding=encoding)
1130 def __enter__(self):
1131 exclusive = self.mode != 'r'
1133 _lock_file(self.f, exclusive)
1139 def __exit__(self, etype, value, traceback):
1141 _unlock_file(self.f)
1148 def write(self, *args):
1149 return self.f.write(*args)
1151 def read(self, *args):
1152 return self.f.read(*args)
1155 def shell_quote(args):
1157 encoding = sys.getfilesystemencoding()
1158 if encoding is None:
1161 if isinstance(a, bytes):
1162 # We may get a filename encoded with 'encodeFilename'
1163 a = a.decode(encoding)
1164 quoted_args.append(pipes.quote(a))
1165 return u' '.join(quoted_args)
1168 def takewhile_inclusive(pred, seq):
1169 """ Like itertools.takewhile, but include the latest evaluated element
1170 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # JSON-encode the payload, then URL-encode it into the fragment.
    payload = json.dumps(data)
    sdata = compat_urllib_parse.urlencode({u'__youtubedl_smuggle': payload})
    return u'%s#%s' % (url, sdata)
def unsmuggle_url(smug_url, default=None):
    """Extract data hidden in a URL by smuggle_url().

    Returns (url, data); when nothing was smuggled, returns
    (smug_url, default) unchanged.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    # Fix: return the decoded payload; the smuggled branch previously
    # fell off the end of the function.
    return url, data
1194 def format_bytes(bytes):
1197 if type(bytes) is str:
1198 bytes = float(bytes)
1202 exponent = int(math.log(bytes, 1024.0))
1203 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1204 converted = float(bytes) / float(1024 ** exponent)
1205 return u'%.2f%s' % (converted, suffix)
1208 def get_term_width():
1209 columns = os.environ.get('COLUMNS', None)
1214 sp = subprocess.Popen(
1216 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1217 out, err = sp.communicate()
1218 return int(out.split()[1])
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    try:
        return ENGLISH_NAMES.index(name) + 1
    except ValueError:
        # Fix: unknown month names signal failure with None instead of
        # letting ValueError escape to the caller.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Negative lookahead leaves already-valid entities (named, hex and
    # decimal character references) untouched.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        u'&amp;',
        xml_str)
1244 def setproctitle(title):
1245 assert isinstance(title, compat_str)
1247 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1250 title_bytes = title.encode('utf-8')
1251 buf = ctypes.create_string_buffer(len(title_bytes))
1252 buf.value = title_bytes
1254 libc.prctl(15, buf, 0, 0, 0)
1255 except AttributeError:
1256 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if the prefix is absent."""
    if s.startswith(start):
        return s[len(start):]
    # Fix: without this fallback the function returned None for strings
    # that do not carry the prefix.
    return s
def url_basename(url):
    """Return the last path component of a URL (query and fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    # Trim surrounding slashes, then keep whatever follows the final '/'.
    trimmed = parsed_path.strip(u'/')
    return trimmed.rpartition(u'/')[2]
1270 class HEADRequest(compat_urllib_request.Request):
1271 def get_method(self):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v (or v.<get_attr>) to int, scaled by invscale // scale.

    Returns default when the (resolved) value is None.
    """
    if get_attr:
        if v is not None:
            # Read the requested attribute off the object instead of
            # converting the object itself.
            v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_to_int(int_str):
    """Parse an int from a string, tolerating ',' and '.' thousands separators.

    Returns None for None input.
    """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.]', u'', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale / scale; default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse '[[HH:]MM:]SS'-style duration strings into seconds.

    Also accepts 'Hh MMm SSs' suffix forms; returns None when the string
    is None or does not look like a duration.
    """
    if s is None:
        return None
    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        # Hours can only be present when minutes are (per the regex nesting).
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    return res
def prepend_extension(filename, ext):
    """Insert ext before the real extension: ('video.mp4', 'f4v') -> 'video.f4v.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (base, ext, real_ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: the default was a shared mutable list ([]); None-sentinel avoids
    # the mutable-default pitfall while staying call-compatible.
    if args is None:
        args = []
    try:
        # Run the binary once and discard its output; success means it exists
        # and is executable from PATH.
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
class PagedList(object):
    """A lazily-evaluated list backed by a page-fetching callback.

    pagefunc(pagenum) must return an iterable with the results of 0-based
    page pagenum; every page except possibly the last holds pagesize items.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the results with indices in [start, end) as a plain list."""
        collected = []
        for page_idx in itertools.count(start // self._pagesize):
            first_id = page_idx * self._pagesize
            next_first_id = first_id + self._pagesize
            if start >= next_first_id:
                continue

            page_results = list(self._pagefunc(page_idx))

            # Offset of `start` within this page (0 for fully-included pages).
            if first_id <= start < next_first_id:
                slice_from = start % self._pagesize
            else:
                slice_from = 0

            # Exclusive cut point within this page when `end` falls inside it.
            if end is not None and first_id <= end <= next_first_id:
                slice_to = ((end - 1) % self._pagesize) + 1
            else:
                slice_to = None

            if slice_from != 0 or slice_to is not None:
                page_results = page_results[slice_from:slice_to]
            collected.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + slice_from < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_first_id:
                break
        return collected
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences in s into the
    corresponding characters; everything else is left untouched."""
    decode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode_escape(m.group(0))[0],
        s)
try:
    # Probe: Python 2.6 (and some 2.7 builds) reject unicode format strings.
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Modern interpreters accept text specs directly; no wrapper needed.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read one URL per line from batch_fd (closing it afterwards), dropping
    a UTF-8 BOM, surrounding whitespace, blank lines and comment lines."""
    def _clean(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with these characters are comments.
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [u for u in (_clean(line) for line in fd) if u]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data as for a POST body, returned as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def parse_xml(s):
    """Parse the XML document in string s, ignoring any DOCTYPE declaration."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Python 2.6's ElementTree.XML does not accept a custom parser.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
if sys.version_info < (3, 0) and sys.platform == 'win32':
    # Python 2 on Windows: getpass.getpass cannot handle unicode prompts,
    # so encode the prompt with the preferred encoding first.
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    # Everywhere else the stock implementation works as-is.
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback identifier, parentheses, trailing
    semicolon) from code, leaving the bare payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids sort lowest.
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
1458 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1461 subprocess_check_output = subprocess.check_output
1462 except AttributeError:
1463 def subprocess_check_output(*args, **kwargs):
1464 assert 'input' not in kwargs
1465 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1466 output, _ = p.communicate()
1469 raise subprocess.CalledProcessError(ret, p.args, output=output)