2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
# Python 2 fallback for urllib.parse.unquote: split on '%', buffer runs of
# percent-escapes in pct_sequence and decode them as one byte string so that
# multi-byte UTF-8 sequences survive.
# NOTE(review): this chunk is a line-sampled extraction; the try/except wrapper
# and several loop lines are missing here -- do not edit logic from this view.
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
# 'hex' codec decode of the two hex digits after each '%' (Python 2 only).
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
# Backport of CPython 3's parse_qs/parse_qsl for Python 2, where the stdlib
# version mishandles encoding. Pairs are split on '&' and ';', '+' becomes a
# space, and names/values go through compat_urllib_parse_unquote.
# NOTE(review): line-sampled extraction; the try:, several branch lines and the
# final returns are missing from this view -- logic must not be edited here.
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
# compat_parse_qs groups the (name, value) pairs into a dict of lists,
# mirroring urllib.parse.parse_qs semantics.
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
# Py2/py3 name aliases (compat_str, compat_chr), a shlex.quote fallback,
# the compiled-regex type constant, and the std_headers dict of default HTTP
# request headers sent with every request.
# NOTE(review): sampled extraction; try:/else: branches, the shlex fallback
# def line, compat_ord's def line and the 'std_headers = {' opener are missing.
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
# POSIX shell single-quoting: close quote, emit escaped quote, reopen.
199 return "'" + s.replace("'", "'\"'\"'") + "'"
203 if type(c) is int: return c
206 # This is not clearly defined otherwise
207 compiled_regex_type = type(re.compile(''))
210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
# preferredencoding(): best-effort system text encoding; the missing lines
# presumably fall back to UTF-8 when locale.getpreferredencoding() is unusable
# -- TODO confirm against the full source.
217 def preferredencoding():
218 """Get preferred encoding.
220 Returns the best encoding scheme for the system, based on
221 locale.getpreferredencoding() and some further tweaks.
224 pref = locale.getpreferredencoding()
# Python-2 print helper: encode to the preferred encoding before printing.
231 if sys.version_info < (3,0):
233 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
236 assert type(s) == type(u'')
# Atomic JSON write: dump to a NamedTemporaryFile in the target directory,
# then os.rename() it over the destination so readers never see a partial file.
# NOTE(review): the args-dict opener, the py2/py3 mode branches and the
# json.dump call are missing from this sampled view.
240 def write_json_file(obj, fn):
241 """ Encode obj as JSON and write it to fn, atomically """
245 'prefix': os.path.basename(fn) + '.',
246 'dir': os.path.dirname(fn),
250 # In Python 2.x, json.dump expects a bytestream.
251 # In Python 3.x, it writes to a character stream
252 if sys.version_info < (3, 0):
260 tf = tempfile.NamedTemporaryFile(**args)
# rename is atomic on POSIX when source and target share a filesystem.
265 os.rename(tf.name, fn)
# Guard: the ElementTree .find() xpath-attribute syntax needs Python >= 2.7.
274 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Locate the first element matching ``xpath[@key='val']`` under node."""
    # Restrict key/val to characters that cannot break out of the
    # single-quoted attribute predicate built below.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    return node.find(u"%s[@%s='%s']" % (xpath, key, val))
# Python <= 2.6 fallback: linear scan over findall() results comparing the
# attribute by hand (the 'xpath[@k=v]' predicate is unsupported there).
# NOTE(review): the 'return f' / fallthrough lines are missing from this view.
282 def find_xpath_attr(node, xpath, key, val):
283 for f in node.findall(xpath):
284 if f.attrib.get(key) == val:
288 # On python2.6 the xml.etree.ElementTree.Element methods don't support
289 # the namespace parameter
# xpath_with_ns expands 'prefix:tag' path components into '{uri}tag' using
# ns_map; components without a prefix pass through unchanged.
290 def xpath_with_ns(path, ns_map):
291 components = [c.split(':') for c in path.split('/')]
295 replaced.append(c[0])
298 replaced.append('{%s}%s' % (ns_map[ns], tag))
299 return '/'.join(replaced)
# re.sub() callback: turn one HTML entity match into its character.
# Named entities come from compat_html_entities.name2codepoint; numeric
# (&#nn; / &#xnn;) entities are parsed as base 10/16; unknown entities are
# returned literally.
# NOTE(review): sampled extraction -- the base-selection branch lines are
# missing from this view.
301 def htmlentity_transform(matchobj):
302 """Transforms an HTML entity to a character.
304 This function receives a match object and is intended to be used with
305 the re.sub() function.
307 entity = matchobj.group(1)
309 # Known non-numeric HTML entity
310 if entity in compat_html_entities.name2codepoint:
311 return compat_chr(compat_html_entities.name2codepoint[entity])
313 mobj = re.match(u'(?u)#(x?\\d+)', entity)
315 numstr = mobj.group(1)
316 if numstr.startswith(u'x'):
# prefix with '0' so int(numstr, 16) accepts the '0x...' form
318 numstr = u'0%s' % numstr
321 return compat_chr(int(numstr, base))
323 # Unknown entity in name, return its literal representation
324 return (u'&%s;' % entity)
# Backported regex fixing HTMLParser's start-tag-end detection (upstream bug).
326 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
# BaseHTMLParser/AttrParser: error-tolerant HTMLParser subclasses used to cut
# the source text of one tag (located by attribute) out of an HTML document.
# NOTE(review): heavily sampled extraction -- many method bodies are missing
# lines (returns, else-branches); treat as read-only reference, not editable.
327 class BaseHTMLParser(compat_html_parser.HTMLParser):
329 compat_html_parser.HTMLParser.__init__(self)
332 def loads(self, html):
337 class AttrParser(BaseHTMLParser):
338 """Modified HTMLParser that isolates a tag with the specified attribute"""
339 def __init__(self, attribute, value):
340 self.attribute = attribute
345 self.watch_startpos = False
347 BaseHTMLParser.__init__(self)
# error(): tolerate up to 10 parse errors by resuming on the next line.
349 def error(self, message):
350 if self.error_count > 10 or self.started:
351 raise compat_html_parser.HTMLParseError(message, self.getpos())
352 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
353 self.error_count += 1
356 def handle_starttag(self, tag, attrs):
359 self.find_startpos(None)
360 if self.attribute in attrs and attrs[self.attribute] == self.value:
363 self.watch_startpos = True
365 if not tag in self.depth: self.depth[tag] = 0
368 def handle_endtag(self, tag):
370 if tag in self.depth: self.depth[tag] -= 1
371 if self.depth[self.result[0]] == 0:
373 self.result.append(self.getpos())
375 def find_startpos(self, x):
376 """Needed to put the start position of the result (self.result[1])
377 after the opening tag with the requested id"""
378 if self.watch_startpos:
379 self.watch_startpos = False
380 self.result.append(self.getpos())
# Any event right after the watched start tag records its position.
381 handle_entityref = handle_charref = handle_data = handle_comment = \
382 handle_decl = handle_pi = unknown_decl = find_startpos
# get_result(): slice self.html between the recorded start/end positions.
384 def get_result(self):
385 if self.result is None:
387 if len(self.result) != 3:
389 lines = self.html.split('\n')
390 lines = lines[self.result[1][0]-1:self.result[2][0]]
391 lines[0] = lines[0][self.result[1][1]:]
393 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
394 lines[-1] = lines[-1][:self.result[2][1]]
395 return '\n'.join(lines).strip()
396 # Hack for https://github.com/rg3/youtube-dl/issues/662
397 if sys.version_info < (2, 7, 3):
398 AttrParser.parse_endtag = (lambda self, i:
399 i + len("</scr'+'ipt>")
400 if self.rawdata[i:].startswith("</scr'+'ipt>")
401 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper: an id lookup is just an attribute lookup.
    return get_element_by_attribute("id", id, html)
# HTML extraction helpers built on the parsers above, plus clean_html().
# NOTE(review): sampled extraction -- the try:/parser.loads(html) lines are
# missing from both extraction helpers in this view.
407 def get_element_by_attribute(attribute, value, html):
408 """Return the content of the tag with the specified attribute in the passed HTML document"""
409 parser = AttrParser(attribute, value)
411 except compat_html_parser.HTMLParseError:
414 return parser.get_result()
416 class MetaParser(BaseHTMLParser):
418 Modified HTMLParser that isolates a meta tag with the specified name
421 def __init__(self, name):
422 BaseHTMLParser.__init__(self)
427 def handle_starttag(self, tag, attrs):
431 if attrs.get('name') == self.name:
432 self.result = attrs.get('content')
434 def get_result(self):
437 def get_meta_content(name, html):
439 Return the content attribute from the meta tag with the given name attribute.
441 parser = MetaParser(name)
444 except compat_html_parser.HTMLParseError:
446 return parser.get_result()
# clean_html: normalize newlines, map <br>/<p> to line breaks, strip the
# remaining tags, then resolve entities via unescapeHTML.
449 def clean_html(html):
450 """Clean an HTML snippet into a readable string"""
452 html = html.replace('\n', ' ')
453 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
454 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
456 html = re.sub('<.*?>', '', html)
457 # Replace html entities
458 html = unescapeHTML(html)
# sanitize_open: open a file, retrying with win32-forbidden characters
# replaced by '#' when the first attempt fails; '-' means stdout.
# NOTE(review): sampled extraction -- the try:, the '-' check and the re-raise
# paths are missing from this view.
462 def sanitize_open(filename, open_mode):
463 """Try to open the given filename, and slightly tweak it if this fails.
465 Attempts to open the given filename. If this fails, it tries to change
466 the filename slightly, step by step, until it's either able to open it
467 or it fails and raises a final exception, like the standard open()
470 It returns the tuple (stream, definitive_file_name).
474 if sys.platform == 'win32':
476 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
477 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
478 stream = open(encodeFilename(filename), open_mode)
479 return (stream, filename)
480 except (IOError, OSError) as err:
# Permission errors will not be fixed by renaming: propagate them.
481 if err.errno in (errno.EACCES,):
484 # In case of error, try to remove win32 forbidden chars
485 alt_filename = os.path.join(
486 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
487 for path_part in os.path.split(filename)
489 if alt_filename == filename:
492 # An exception here should be caught in the caller
493 stream = open(encodeFilename(filename), open_mode)
494 return (stream, alt_filename)
# timeconvert: RFC 2822 date string -> UNIX timestamp (None on parse failure,
# presumably -- the 'timestamp = None' init and return are missing here).
497 def timeconvert(timestr):
498 """Convert RFC 2822 defined time string into system timestamp"""
500 timetuple = email.utils.parsedate_tz(timestr)
501 if timetuple is not None:
502 timestamp = email.utils.mktime_tz(timetuple)
# sanitize_filename: per-character replacement of characters that are unsafe
# in filenames; restricted mode is stricter (ASCII-only, no spaces).
# NOTE(review): sampled extraction -- several replace_insane branches and the
# is_id/empty-result handling are missing from this view.
505 def sanitize_filename(s, restricted=False, is_id=False):
506 """Sanitizes a string so it could be used as part of a filename.
507 If restricted is set, use a stricter subset of allowed characters.
508 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
510 def replace_insane(char):
511 if char == '?' or ord(char) < 32 or ord(char) == 127:
514 return '' if restricted else '\''
516 return '_-' if restricted else ' -'
517 elif char in '\\/|*<>':
519 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
521 if restricted and ord(char) > 127:
525 result = u''.join(map(replace_insane, s))
# collapse runs of '_' introduced by the replacements above
527 while '__' in result:
528 result = result.replace('__', '_')
529 result = result.strip('_')
530 # Common case of "Foreign band name - English song title"
531 if restricted and result.startswith('-_'):
# orderedSet / unescapeHTML fragments (bodies mostly missing in this view).
537 def orderedSet(iterable):
538 """ Remove all duplicates from the input iterable """
549 assert type(s) == compat_str
551 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
# encodeFilename: encode a unicode filename to bytes for OS APIs on py2;
# py3 and modern win32 take unicode directly.
# NOTE(review): sampled extraction -- the py3 'return s' and some branch lines
# are missing from this view.
555 def encodeFilename(s, for_subprocess=False):
557 @param s The name of the file
560 assert type(s) == compat_str
562 # Python 3 has a Unicode API
563 if sys.version_info >= (3, 0):
566 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
567 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
568 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
569 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
570 if not for_subprocess:
573 # For subprocess calls, encode with locale encoding
574 # Refer to http://stackoverflow.com/a/9951851/35070
575 encoding = preferredencoding()
577 encoding = sys.getfilesystemencoding()
580 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode one external-command argument (delegates to encodeFilename)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
# decodeOption / formatSeconds / make_HTTPS_handler.
# NOTE(review): sampled extraction -- if/elif/else lines and returns are
# missing in several places; treat as read-only reference.
592 def decodeOption(optval):
595 if isinstance(optval, bytes):
596 optval = optval.decode(preferredencoding())
598 assert isinstance(optval, compat_str)
601 def formatSeconds(secs):
603 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
605 return '%d:%02d' % (secs // 60, secs % 60)
# NOTE(review): PROTOCOL_SSLv3 is insecure (POODLE) and removed from modern
# ssl modules -- flagging for follow-up; not changed from this partial view.
610 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
611 if sys.version_info < (3, 2):
614 class HTTPSConnectionV3(httplib.HTTPSConnection):
615 def __init__(self, *args, **kwargs):
616 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
619 sock = socket.create_connection((self.host, self.port), self.timeout)
620 if getattr(self, '_tunnel_host', False):
624 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
626 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
628 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
629 def https_open(self, req):
630 return self.do_open(HTTPSConnectionV3, req)
631 return HTTPSHandlerV3(**kwargs)
633 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
634 context.verify_mode = (ssl.CERT_NONE
635 if opts_no_check_certificate
636 else ssl.CERT_REQUIRED)
637 context.set_default_verify_paths()
# load_default_certs() only exists on Python >= 3.4; older versions fall
# through via the AttributeError handler below.
639 context.load_default_certs()
640 except AttributeError:
642 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
# Exception hierarchy used across the downloader and extractors.
# NOTE(review): sampled extraction -- several __init__ bodies and docstring
# closers are missing lines in this view.
644 class ExtractorError(Exception):
645 """Error during info extraction."""
646 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
647 """ tb, if given, is the original traceback (so that it can be printed out).
648 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
# Network-type errors are always treated as 'expected' (not a youtube-dl bug).
651 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
653 if video_id is not None:
654 msg = video_id + ': ' + msg
656 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
657 super(ExtractorError, self).__init__(msg)
660 self.exc_info = sys.exc_info() # preserve original exception
662 self.video_id = video_id
664 def format_traceback(self):
665 if self.traceback is None:
667 return u''.join(traceback.format_tb(self.traceback))
670 class RegexNotFoundError(ExtractorError):
671 """Error when a regex didn't match"""
675 class DownloadError(Exception):
676 """Download Error exception.
678 This exception may be thrown by FileDownloader objects if they are not
679 configured to continue on errors. They will contain the appropriate
682 def __init__(self, msg, exc_info=None):
683 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
684 super(DownloadError, self).__init__(msg)
685 self.exc_info = exc_info
688 class SameFileError(Exception):
689 """Same File exception.
691 This exception will be thrown by FileDownloader objects if they detect
692 multiple files would have to be downloaded to the same file on disk.
697 class PostProcessingError(Exception):
698 """Post Processing exception.
700 This exception may be raised by PostProcessor's .run() method to
701 indicate an error in the postprocessing task.
703 def __init__(self, msg):
706 class MaxDownloadsReached(Exception):
707 """ --max-downloads limit has been reached. """
711 class UnavailableVideoError(Exception):
712 """Unavailable Format exception.
714 This exception will be thrown when a video is requested
715 in a format that is not available for that video.
720 class ContentTooShortError(Exception):
721 """Content Too Short exception.
723 This exception may be raised by FileDownloader objects when a file they
724 download is too small for what the server announced first, indicating
725 the connection was probably interrupted.
# downloaded/expected: byte counts used by callers to report the shortfall.
731 def __init__(self, downloaded, expected):
732 self.downloaded = downloaded
733 self.expected = expected
# YoutubeDLHandler: urllib opener handler adding std_headers and transparently
# decompressing gzip/deflate responses; the internal 'Youtubedl-*' pseudo
# headers are consumed here and removed before the real request goes out.
# NOTE(review): sampled extraction -- try: lines, 'old_resp = resp', break
# statements and several returns are missing from this view.
735 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
736 """Handler for HTTP requests and responses.
738 This class, when installed with an OpenerDirector, automatically adds
739 the standard headers to every HTTP request and handles gzipped and
740 deflated responses from web servers. If compression is to be avoided in
741 a particular request, the original request in the program code only has
742 to include the HTTP header "Youtubedl-No-Compression", which will be
743 removed before making the real request.
745 Part of this code was copied from:
747 http://techknack.net/python-urllib2-handlers/
749 Andrew Rowls, the author of that code, agreed to release it to the
# deflate(): try raw-deflate first, fall back to zlib-wrapped data.
756 return zlib.decompress(data, -zlib.MAX_WBITS)
758 return zlib.decompress(data)
# addinfourl_wrapper: compat shim for addinfourl signatures without 'code'.
761 def addinfourl_wrapper(stream, headers, url, code):
762 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
763 return compat_urllib_request.addinfourl(stream, headers, url, code)
764 ret = compat_urllib_request.addinfourl(stream, headers, url)
768 def http_request(self, req):
769 for h, v in std_headers.items():
770 if h not in req.headers:
772 if 'Youtubedl-no-compression' in req.headers:
773 if 'Accept-encoding' in req.headers:
774 del req.headers['Accept-encoding']
775 del req.headers['Youtubedl-no-compression']
776 if 'Youtubedl-user-agent' in req.headers:
777 if 'User-agent' in req.headers:
778 del req.headers['User-agent']
779 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
780 del req.headers['Youtubedl-user-agent']
783 def http_response(self, req, resp):
786 if resp.headers.get('Content-encoding', '') == 'gzip':
787 content = resp.read()
788 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
790 uncompressed = io.BytesIO(gz.read())
791 except IOError as original_ioerror:
792 # There may be junk add the end of the file
793 # See http://stackoverflow.com/q/4928560/35070 for details
794 for i in range(1, 1024):
796 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
797 uncompressed = io.BytesIO(gz.read())
802 raise original_ioerror
803 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
804 resp.msg = old_resp.msg
806 if resp.headers.get('Content-encoding', '') == 'deflate':
807 gz = io.BytesIO(self.deflate(resp.read()))
808 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
809 resp.msg = old_resp.msg
# HTTPS uses the same request/response processing as HTTP.
812 https_request = http_request
813 https_response = http_response
# Date parsing helpers.
# NOTE(review): sampled extraction -- None-guards, several format strings in
# the list and the return statements are missing from this view.
816 def parse_iso8601(date_str, delimiter='T'):
817 """ Return a UNIX timestamp from the given date """
# Timezone suffix: 'Z' (UTC) or +HH:MM / -HHMM; stripped off before strptime
# and applied as a timedelta afterwards.
823 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
826 timezone = datetime.timedelta()
828 date_str = date_str[:-len(m.group(0))]
829 if not m.group('sign'):
830 timezone = datetime.timedelta()
832 sign = 1 if m.group('sign') == '+' else -1
833 timezone = datetime.timedelta(
834 hours=sign * int(m.group('hours')),
835 minutes=sign * int(m.group('minutes')))
836 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
837 dt = datetime.datetime.strptime(date_str, date_format) - timezone
838 return calendar.timegm(dt.timetuple())
841 def unified_strdate(date_str):
842 """Return a string with the date in the format YYYYMMDD"""
849 date_str = date_str.replace(',', ' ')
850 # %z (UTC offset) is only supported in python>=3.2
851 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
852 format_expressions = [
857 '%b %dst %Y %I:%M%p',
858 '%b %dnd %Y %I:%M%p',
859 '%b %dth %Y %I:%M%p',
869 '%Y-%m-%dT%H:%M:%SZ',
870 '%Y-%m-%dT%H:%M:%S.%fZ',
871 '%Y-%m-%dT%H:%M:%S.%f0Z',
873 '%Y-%m-%dT%H:%M:%S.%f',
# First matching format wins; a final email.utils fallback handles RFC 2822.
876 for expression in format_expressions:
878 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
881 if upload_date is None:
882 timetuple = email.utils.parsedate_tz(date_str)
884 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
# determine_ext: guess a file extension from the URL path (query stripped).
887 def determine_ext(url, default_ext=u'unknown_video'):
890 guess = url.partition(u'?')[0].rpartition(u'.')[2]
891 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name: media extension replaced by '<lang>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
# date_from_str: 'YYYYMMDD' or relative forms like 'now-2weeks'.
# NOTE(review): sampled extraction -- the 'return today' branch, the
# week->days/month/year normalization and hyphenate_date's fallback return
# are missing from this view.
899 def date_from_str(date_str):
901 Return a datetime object from a string in the format YYYYMMDD or
902 (now|today)[+-][0-9](day|week|month|year)(s)?"""
903 today = datetime.date.today()
904 if date_str == 'now'or date_str == 'today':
906 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
907 if match is not None:
908 sign = match.group('sign')
909 time = int(match.group('time'))
912 unit = match.group('unit')
921 delta = datetime.timedelta(**{unit: time})
923 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
925 def hyphenate_date(date_str):
927 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
928 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
929 if match is not None:
930 return '-'.join(match.groups())
# DateRange: inclusive [start, end] interval of dates; open ends default to
# datetime.min/max. Also the platform_name() fragment below.
# NOTE(review): sampled extraction -- else: lines, the day() classmethod def
# and platform_name's def/return lines are missing from this view.
934 class DateRange(object):
935 """Represents a time interval between two dates"""
936 def __init__(self, start=None, end=None):
937 """start and end must be strings in the format accepted by date"""
938 if start is not None:
939 self.start = date_from_str(start)
941 self.start = datetime.datetime.min.date()
943 self.end = date_from_str(end)
945 self.end = datetime.datetime.max.date()
946 if self.start > self.end:
947 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
950 """Returns a range that only contains the given day"""
952 def __contains__(self, date):
953 """Check if the date is in the range"""
954 if not isinstance(date, datetime.date):
955 date = date_from_str(date)
956 return self.start <= date <= self.end
958 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
962 """ Returns the platform name as a compat_str """
963 res = platform.platform()
964 if isinstance(res, bytes):
965 res = res.decode(preferredencoding())
967 assert isinstance(res, compat_str)
# _windows_write_string: write unicode to a real Windows console via
# WriteConsoleW (handles characters the codepage cannot), falling back for
# redirected/virtual streams. write_string() is the portable front-end.
# NOTE(review): sampled extraction -- WIN_OUTPUT_IDS, several returns, the
# write loop header and out.flush() lines are missing from this view.
971 def _windows_write_string(s, out):
972 """ Returns True if the string was written using special methods,
973 False if it has yet to be written out."""
974 # Adapted from http://stackoverflow.com/a/3259271/35070
977 import ctypes.wintypes
985 fileno = out.fileno()
986 except AttributeError:
987 # If the output stream doesn't have a fileno, it's virtual
989 if fileno not in WIN_OUTPUT_IDS:
992 GetStdHandle = ctypes.WINFUNCTYPE(
993 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
994 ("GetStdHandle", ctypes.windll.kernel32))
995 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
997 WriteConsoleW = ctypes.WINFUNCTYPE(
998 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
999 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1000 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1001 written = ctypes.wintypes.DWORD(0)
1003 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1004 FILE_TYPE_CHAR = 0x0002
1005 FILE_TYPE_REMOTE = 0x8000
1006 GetConsoleMode = ctypes.WINFUNCTYPE(
1007 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1008 ctypes.POINTER(ctypes.wintypes.DWORD))(
1009 ("GetConsoleMode", ctypes.windll.kernel32))
1010 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
# not_a_console: handle is not an interactive console (pipe/file/invalid).
1012 def not_a_console(handle):
1013 if handle == INVALID_HANDLE_VALUE or handle is None:
1015 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1016 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1018 if not_a_console(h):
# WriteConsoleW counts UTF-16 code units; non-BMP chars are surrogate pairs,
# so writes are chunked up to the next non-BMP character.
1021 def next_nonbmp_pos(s):
1023 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1024 except StopIteration:
1028 count = min(next_nonbmp_pos(s), 1024)
1030 ret = WriteConsoleW(
1031 h, s, count if count else 2, ctypes.byref(written), None)
1033 raise OSError('Failed to write string')
1034 if not count: # We just wrote a non-BMP character
1035 assert written.value == 2
1038 assert written.value > 0
1039 s = s[written.value:]
1043 def write_string(s, out=None, encoding=None):
1046 assert type(s) == compat_str
1048 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1049 if _windows_write_string(s, out):
1052 if ('b' in getattr(out, 'mode', '') or
1053 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1054 byt = s.encode(encoding or preferredencoding(), 'ignore')
1056 elif hasattr(out, 'buffer'):
1057 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1058 byt = s.encode(enc, 'ignore')
1059 out.buffer.write(byt)
# bytes<->int-list helpers used by the AES code (py2/py3 differences).
1065 def bytes_to_intlist(bs):
1068 if isinstance(bs[0], int): # Python 3
1071 return [ord(c) for c in bs]
1074 def intlist_to_bytes(xs):
1077 if isinstance(chr(0), bytes): # Python 2
1078 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the cache directory for youtube-dl.

    The 'cachedir' entry in params (an options dict) wins; otherwise fall back
    to $XDG_CACHE_HOME/youtube-dl, defaulting XDG_CACHE_HOME to ~/.cache per
    the XDG Base Directory spec.

    Fix: the original used a mutable default argument (params={}); a shared
    dict default is a well-known Python pitfall, replaced by the None sentinel.
    """
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
# Windows branch of cross-platform file locking: LockFileEx/UnlockFileEx via
# ctypes, locking the whole file (whole_low/whole_high byte range).
# NOTE(review): sampled extraction -- the _fields_ opener, list closers and
# blank lines are missing; win32-only, left byte-identical.
1089 # Cross-platform file locking
1090 if sys.platform == 'win32':
1091 import ctypes.wintypes
1094 class OVERLAPPED(ctypes.Structure):
1096 ('Internal', ctypes.wintypes.LPVOID),
1097 ('InternalHigh', ctypes.wintypes.LPVOID),
1098 ('Offset', ctypes.wintypes.DWORD),
1099 ('OffsetHigh', ctypes.wintypes.DWORD),
1100 ('hEvent', ctypes.wintypes.HANDLE),
1103 kernel32 = ctypes.windll.kernel32
1104 LockFileEx = kernel32.LockFileEx
1105 LockFileEx.argtypes = [
1106 ctypes.wintypes.HANDLE, # hFile
1107 ctypes.wintypes.DWORD, # dwFlags
1108 ctypes.wintypes.DWORD, # dwReserved
1109 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1110 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1111 ctypes.POINTER(OVERLAPPED) # Overlapped
1113 LockFileEx.restype = ctypes.wintypes.BOOL
1114 UnlockFileEx = kernel32.UnlockFileEx
1115 UnlockFileEx.argtypes = [
1116 ctypes.wintypes.HANDLE, # hFile
1117 ctypes.wintypes.DWORD, # dwReserved
1118 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1119 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1120 ctypes.POINTER(OVERLAPPED) # Overlapped
1122 UnlockFileEx.restype = ctypes.wintypes.BOOL
1123 whole_low = 0xffffffff
1124 whole_high = 0x7fffffff
# _lock_file: 0x2 = LOCKFILE_EXCLUSIVE_LOCK; the OVERLAPPED pointer is kept
# on the file object so _unlock_file can release the same range.
1126 def _lock_file(f, exclusive):
1127 overlapped = OVERLAPPED()
1128 overlapped.Offset = 0
1129 overlapped.OffsetHigh = 0
1130 overlapped.hEvent = 0
1131 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1132 handle = msvcrt.get_osfhandle(f.fileno())
1133 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1134 whole_low, whole_high, f._lock_file_overlapped_p):
1135 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1137 def _unlock_file(f):
1138 assert f._lock_file_overlapped_p
1139 handle = msvcrt.get_osfhandle(f.fileno())
1140 if not UnlockFileEx(handle, 0,
1141 whole_low, whole_high, f._lock_file_overlapped_p):
1142 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1147 def _lock_file(f, exclusive):
1148 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1150 def _unlock_file(f):
1151 fcntl.lockf(f, fcntl.LOCK_UN)
# locked_file: context manager wrapping io.open + _lock_file/_unlock_file;
# a non-'r' mode takes an exclusive lock. Also shell_quote and
# takewhile_inclusive fragments.
# NOTE(review): sampled extraction -- try/finally lines, __iter__, the
# quoted_args initializer and the loop/return bodies are missing here.
1154 class locked_file(object):
1155 def __init__(self, filename, mode, encoding=None):
1156 assert mode in ['r', 'a', 'w']
1157 self.f = io.open(filename, mode, encoding=encoding)
1160 def __enter__(self):
1161 exclusive = self.mode != 'r'
1163 _lock_file(self.f, exclusive)
1169 def __exit__(self, etype, value, traceback):
1171 _unlock_file(self.f)
1178 def write(self, *args):
1179 return self.f.write(*args)
1181 def read(self, *args):
1182 return self.f.read(*args)
# shell_quote: decode byte arguments with the filesystem encoding, then
# pipes.quote each and join with spaces.
1185 def shell_quote(args):
1187 encoding = sys.getfilesystemencoding()
1188 if encoding is None:
1191 if isinstance(a, bytes):
1192 # We may get a filename encoded with 'encodeFilename'
1193 a = a.decode(encoding)
1194 quoted_args.append(pipes.quote(a))
1195 return u' '.join(quoted_args)
1198 def takewhile_inclusive(pred, seq):
1199 """ Like itertools.takewhile, but include the latest evaluated element
1200 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # JSON-encode the payload, urlencode it under a reserved key, and stash
    # it in the fragment so servers never see it (see unsmuggle_url).
    payload = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'#'.join([url, payload])
# Misc helpers: unsmuggle_url, format_bytes, get_term_width, month_by_name,
# fix_xml_ampersands, setproctitle, remove_start/remove_end.
# NOTE(review): sampled extraction -- final returns, try:/else: lines and the
# Popen argument list are missing in several of these; logic not edited here.
1215 def unsmuggle_url(smug_url, default=None):
1216 if not '#__youtubedl_smuggle' in smug_url:
1217 return smug_url, default
1218 url, _, sdata = smug_url.rpartition(u'#')
1219 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1220 data = json.loads(jsond)
# format_bytes: human-readable size with binary (1024) prefixes.
1224 def format_bytes(bytes):
1227 if type(bytes) is str:
1228 bytes = float(bytes)
1232 exponent = int(math.log(bytes, 1024.0))
1233 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1234 converted = float(bytes) / float(1024 ** exponent)
1235 return u'%.2f%s' % (converted, suffix)
# get_term_width: $COLUMNS, falling back to parsing `stty size` output.
1238 def get_term_width():
1239 columns = os.environ.get('COLUMNS', None)
1244 sp = subprocess.Popen(
1246 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1247 out, err = sp.communicate()
1248 return int(out.split()[1])
1254 def month_by_name(name):
1255 """ Return the number of a month by (locale-independently) English name """
1258 u'January', u'February', u'March', u'April', u'May', u'June',
1259 u'July', u'August', u'September', u'October', u'November', u'December']
1261 return ENGLISH_NAMES.index(name) + 1
1266 def fix_xml_ampersands(xml_str):
1267 """Replace all the '&' by '&amp;' in XML"""
# Negative lookahead skips ampersands that already start a valid entity.
1269 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
# setproctitle: Linux-only PR_SET_NAME (prctl option 15) via libc.
1274 def setproctitle(title):
1275 assert isinstance(title, compat_str)
1277 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1280 title_bytes = title.encode('utf-8')
1281 buf = ctypes.create_string_buffer(len(title_bytes))
1282 buf.value = title_bytes
1284 libc.prctl(15, buf, 0, 0, 0)
1285 except AttributeError:
1286 return # Strange libc, just skip this
1289 def remove_start(s, start):
1290 if s.startswith(start):
1291 return s[len(start):]
1295 def remove_end(s, end):
1297 return s[:-len(end)]
def url_basename(url):
    """Return the last path component of a URL (query/fragment ignored)."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip(u'/').split(u'/')[-1]
# HEADRequest: Request subclass forcing the HEAD verb (return line missing
# from this sampled view). int_or_none: tolerant int conversion, optionally
# reading an attribute first; missing intermediate lines likely guard v=None.
1306 class HEADRequest(compat_urllib_request.Request):
1307 def get_method(self):
1311 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1314 v = getattr(v, get_attr, None)
1317 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return compat_str(v), or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """Parse an int from a string, dropping ',' and '.' thousands separators."""
    if int_str is None:
        return None
    return int(re.sub(r'[,\.]', u'', int_str))
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to float, scaled by invscale/scale; *default* when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration like '1:23:45', '3m10s' or '12.5' into seconds.

    Returns an int (or float when fractional seconds are present), or None
    when *s* is None or unparseable.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?(?P<ms>\.[0-9]+)?$', s)
    if m is None:
        return None

    duration = int(m.group('secs'))
    if m.group('mins'):
        duration += int(m.group('mins')) * 60
        if m.group('hours'):
            duration += int(m.group('hours')) * 60 * 60
    if m.group('ms'):
        duration += float(m.group('ms'))
    return duration
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: 'a.mp4' -> 'a.<ext>.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(base, ext, real_ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # `args` used to default to a shared mutable list ([]); use the None
    # sentinel instead (same observable behavior for all callers).
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
class PagedList(object):
    """A lazily-paged sequence: items are fetched page by page from
    pagefunc(pagenum), which must return the items of that page."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a plain list (end exclusive)."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            if firstid <= start < nextfirstid:
                startv = start % self._pagesize
            else:
                startv = 0

            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A page shorter than pagesize must be the last one, so there is
            # no need to query any further pages.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences (8 hex digits) in *s*."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes format
    # string, so wrap pack/unpack to encode unicode specs first.
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Unicode format strings work natively; use struct directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read one URL per line from *batch_fd*, skipping blanks and comments.

    Closes the file object when done.  Lines beginning with '#', ';' or ']'
    are comments; a leading (mojibake) UTF-8 BOM is dropped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [u for u in (fixup(line) for line in fd) if u]
def urlencode_postdata(*args, **kargs):
    """urlencode the given data and return it as ascii bytes for a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter exists from Python 2.7 on; emulate it with findall otherwise.
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse the XML document in the string *s* into an Element tree.

    Doctype declarations are ignored; on Python 2 text nodes are normalized
    to unicode after parsing.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Python 2.6's XML() does not accept a parser keyword argument.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x: decode byte-string text nodes
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    # Python 2 on Windows: getpass cannot handle unicode prompts, so
    # encode them with the preferred locale encoding first.
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, e.g. 'cb({...});' -> '{...}'."""
    callback_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(callback_re, r'\1', code)
def js_to_json(code):
    """Convert simple JavaScript object literals into valid JSON text.

    Quotes bare and single-quoted keys/values and drops trailing commas
    before ']'.  Only handles the simple cases needed for extraction.
    """
    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            key = '"%s"' % key

        value = m.group(4)
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return m.group(1) + key + m.group(3) + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank lowest.
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    # Python 2.6 has no subprocess.check_output; provide a minimal backport.
    def subprocess_check_output(*args, **kwargs):
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            raise subprocess.CalledProcessError(ret, p.args, output=output)
        return output