2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
# NOTE(review): this listing is a partial, line-numbered dump of the original
# file; many intermediate lines are elided, so only comments are added here.
# subprocess.DEVNULL shim — falls back to opening os.devnull on old Pythons.
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
# urllib.parse.unquote shim: on Python 2 a hand-rolled fallback decodes
# percent-escape byte runs ('hex') before applying the text codec.
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
# parse_qs shim: backport of CPython 3's parse_qs/_parse_qsl for Python 2.
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
# compat_parse_qs groups repeated query keys into lists of values.
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
# Text/char-type aliases so the rest of the code is Python 2/3 agnostic.
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
# XML parse-error alias (Python 2.6 raises ExpatError, not ParseError).
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
# shlex.quote backport for Python < 3.3 (single-quote escaping).
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
199 return "'" + s.replace("'", "'\"'\"'") + "'"
# compat_ord fragment: ints pass through unchanged.
203 if type(c) is int: return c
206 # This is not clearly defined otherwise
207 compiled_regex_type = type(re.compile(''))
# Default HTTP headers attached to every outgoing request (std_headers).
210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
# preferredencoding(): best-effort system encoding via locale.
217 def preferredencoding():
218 """Get preferred encoding.
220 Returns the best encoding scheme for the system, based on
221 locale.getpreferredencoding() and some further tweaks.
224 pref = locale.getpreferredencoding()
# compat_print fragment: Python 2 path encodes before printing.
231 if sys.version_info < (3,0):
233 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
236 assert type(s) == type(u'')
# write_json_file: atomic write via a NamedTemporaryFile in the target dir
# followed by os.rename.
240 def write_json_file(obj, fn):
241 """ Encode obj as JSON and write it to fn, atomically """
245 'prefix': os.path.basename(fn) + '.',
246 'dir': os.path.dirname(fn),
250 # In Python 2.x, json.dump expects a bytestream.
251 # In Python 3.x, it writes to a character stream
252 if sys.version_info < (3, 0):
260 tf = tempfile.NamedTemporaryFile(**args)
265 os.rename(tf.name, fn)
# find_xpath_attr: 2.7+ uses a native xpath predicate; 2.6 falls back to a
# manual scan over findall() because predicates are unsupported there.
274 if sys.version_info >= (2, 7):
275 def find_xpath_attr(node, xpath, key, val):
276 """ Find the xpath xpath[@key=val] """
277 assert re.match(r'^[a-zA-Z-]+$', key)
278 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
279 expr = xpath + u"[@%s='%s']" % (key, val)
280 return node.find(expr)
282 def find_xpath_attr(node, xpath, key, val):
283 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
284 # .//node does not match if a node is a direct child of . !
285 if isinstance(xpath, unicode):
286 xpath = xpath.encode('ascii')
288 for f in node.findall(xpath):
289 if f.attrib.get(key) == val:
# xpath_with_ns: expand 'ns:tag' path components to '{uri}tag' form.
293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
294 # the namespace parameter
295 def xpath_with_ns(path, ns_map):
296 components = [c.split(':') for c in path.split('/')]
300 replaced.append(c[0])
303 replaced.append('{%s}%s' % (ns_map[ns], tag))
304 return '/'.join(replaced)
# HTMLParser backports: patched locatestarttagend regex plus AttrParser,
# which isolates the source span of a tag carrying a given attribute.
307 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
308 class BaseHTMLParser(compat_html_parser.HTMLParser):
310 compat_html_parser.HTMLParser.__init__(self)
313 def loads(self, html):
318 class AttrParser(BaseHTMLParser):
319 """Modified HTMLParser that isolates a tag with the specified attribute"""
320 def __init__(self, attribute, value):
321 self.attribute = attribute
326 self.watch_startpos = False
328 BaseHTMLParser.__init__(self)
# error(): tolerate up to 10 parse errors by skipping a line and retrying.
330 def error(self, message):
331 if self.error_count > 10 or self.started:
332 raise compat_html_parser.HTMLParseError(message, self.getpos())
333 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
334 self.error_count += 1
337 def handle_starttag(self, tag, attrs):
340 self.find_startpos(None)
341 if self.attribute in attrs and attrs[self.attribute] == self.value:
344 self.watch_startpos = True
346 if not tag in self.depth: self.depth[tag] = 0
# handle_endtag(): track nesting depth; record end position once the
# matched tag's depth returns to zero.
349 def handle_endtag(self, tag):
351 if tag in self.depth: self.depth[tag] -= 1
352 if self.depth[self.result[0]] == 0:
354 self.result.append(self.getpos())
356 def find_startpos(self, x):
357 """Needed to put the start position of the result (self.result[1])
358 after the opening tag with the requested id"""
359 if self.watch_startpos:
360 self.watch_startpos = False
361 self.result.append(self.getpos())
362 handle_entityref = handle_charref = handle_data = handle_comment = \
363 handle_decl = handle_pi = unknown_decl = find_startpos
# get_result(): slice the captured (line, col) span back out of the HTML.
365 def get_result(self):
366 if self.result is None:
368 if len(self.result) != 3:
370 lines = self.html.split('\n')
371 lines = lines[self.result[1][0]-1:self.result[2][0]]
372 lines[0] = lines[0][self.result[1][1]:]
374 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
375 lines[-1] = lines[-1][:self.result[2][1]]
376 return '\n'.join(lines).strip()
377 # Hack for https://github.com/rg3/youtube-dl/issues/662
378 if sys.version_info < (2, 7, 3):
379 AttrParser.parse_endtag = (lambda self, i:
380 i + len("</scr'+'ipt>")
381 if self.rawdata[i:].startswith("</scr'+'ipt>")
382 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Delegate to the generic attribute-based lookup, fixing the attribute name.
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
# NOTE(review): partial line-numbered dump; interior lines are elided.
# get_element_by_attribute(): run AttrParser over the HTML, tolerating
# HTMLParseError, and return the isolated tag content.
388 def get_element_by_attribute(attribute, value, html):
389 """Return the content of the tag with the specified attribute in the passed HTML document"""
390 parser = AttrParser(attribute, value)
393 except compat_html_parser.HTMLParseError:
395 return parser.get_result()
# MetaParser: captures the 'content' attribute of a <meta name=...> tag.
397 class MetaParser(BaseHTMLParser):
399 Modified HTMLParser that isolates a meta tag with the specified name
402 def __init__(self, name):
403 BaseHTMLParser.__init__(self)
408 def handle_starttag(self, tag, attrs):
412 if attrs.get('name') == self.name:
413 self.result = attrs.get('content')
415 def get_result(self):
418 def get_meta_content(name, html):
420 Return the content attribute from the meta tag with the given name attribute.
422 parser = MetaParser(name)
425 except compat_html_parser.HTMLParseError:
427 return parser.get_result()
# clean_html(): strip newlines, turn <br> and </p><p> into '\n', drop all
# remaining tags, then unescape HTML entities.
430 def clean_html(html):
431 """Clean an HTML snippet into a readable string"""
433 html = html.replace('\n', ' ')
434 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
435 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
437 html = re.sub('<.*?>', '', html)
438 # Replace html entities
439 html = unescapeHTML(html)
# sanitize_open(): open a file, falling back to a name with win32-forbidden
# characters replaced by '#'. '-' means stdout (binary mode on Windows).
443 def sanitize_open(filename, open_mode):
444 """Try to open the given filename, and slightly tweak it if this fails.
446 Attempts to open the given filename. If this fails, it tries to change
447 the filename slightly, step by step, until it's either able to open it
448 or it fails and raises a final exception, like the standard open()
451 It returns the tuple (stream, definitive_file_name).
455 if sys.platform == 'win32':
457 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
458 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
459 stream = open(encodeFilename(filename), open_mode)
460 return (stream, filename)
461 except (IOError, OSError) as err:
462 if err.errno in (errno.EACCES,):
465 # In case of error, try to remove win32 forbidden chars
466 alt_filename = os.path.join(
467 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
468 for path_part in os.path.split(filename)
470 if alt_filename == filename:
473 # An exception here should be caught in the caller
474 stream = open(encodeFilename(filename), open_mode)
475 return (stream, alt_filename)
# timeconvert(): RFC 2822 date string -> UNIX timestamp via email.utils.
478 def timeconvert(timestr):
479 """Convert RFC 2822 defined time string into system timestamp"""
481 timetuple = email.utils.parsedate_tz(timestr)
482 if timetuple is not None:
483 timestamp = email.utils.mktime_tz(timetuple)
# sanitize_filename(): map characters that are unsafe in filenames; the
# 'restricted' mode is stricter (ASCII-ish, no spaces).
486 def sanitize_filename(s, restricted=False, is_id=False):
487 """Sanitizes a string so it could be used as part of a filename.
488 If restricted is set, use a stricter subset of allowed characters.
489 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
491 def replace_insane(char):
492 if char == '?' or ord(char) < 32 or ord(char) == 127:
495 return '' if restricted else '\''
497 return '_-' if restricted else ' -'
498 elif char in '\\/|*<>':
500 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
502 if restricted and ord(char) > 127:
506 result = u''.join(map(replace_insane, s))
# Collapse runs of '_' introduced by the replacements above.
508 while '__' in result:
509 result = result.replace('__', '_')
510 result = result.strip('_')
511 # Common case of "Foreign band name - English song title"
512 if restricted and result.startswith('-_'):
# orderedSet(): de-duplicate while preserving first-seen order.
518 def orderedSet(iterable):
519 """ Remove all duplicates from the input iterable """
# _htmlentity_transform(): named entities via name2codepoint; numeric
# entities (decimal or '#x..' hex) via int(); unknown entities returned
# as their literal '&name;' text.
527 def _htmlentity_transform(entity):
528 """Transforms an HTML entity to a character."""
529 # Known non-numeric HTML entity
530 if entity in compat_html_entities.name2codepoint:
531 return compat_chr(compat_html_entities.name2codepoint[entity])
533 mobj = re.match(r'#(x?[0-9]+)', entity)
535 numstr = mobj.group(1)
536 if numstr.startswith(u'x'):
538 numstr = u'0%s' % numstr
541 return compat_chr(int(numstr, base))
543 # Unknown entity in name, return its literal representation
544 return (u'&%s;' % entity)
# unescapeHTML fragment (def line elided in this dump): substitutes every
# '&...;' entity using _htmlentity_transform.
550 assert type(s) == compat_str
553 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
# encodeFilename(): pass unicode through on Python 3 / NT Unicode APIs;
# otherwise encode with the locale or filesystem encoding.
556 def encodeFilename(s, for_subprocess=False):
558 @param s The name of the file
561 assert type(s) == compat_str
563 # Python 3 has a Unicode API
564 if sys.version_info >= (3, 0):
567 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
568 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
569 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
570 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
571 if not for_subprocess:
574 # For subprocess calls, encode with locale encoding
575 # Refer to http://stackoverflow.com/a/9951851/35070
576 encoding = preferredencoding()
578 encoding = sys.getfilesystemencoding()
581 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument for the current platform.

    Legacy byte-string arguments are first upgraded to text via ASCII
    decoding, then handed to encodeFilename with for_subprocess=True.
    """
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
# NOTE(review): partial line-numbered dump; interior lines are elided.
# decodeOption(): normalise an option value to text.
593 def decodeOption(optval):
596 if isinstance(optval, bytes):
597 optval = optval.decode(preferredencoding())
599 assert isinstance(optval, compat_str)
# formatSeconds(): seconds -> 'H:MM:SS' or 'M:SS' display string.
602 def formatSeconds(secs):
604 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
606 return '%d:%02d' % (secs // 60, secs % 60)
# make_HTTPS_handler(): build an HTTPS opener handler.
#  - Python < 3.2: custom connection class wrapping the socket manually,
#    trying TLSv1 first and falling back to SSLv23.
#  - ssl.create_default_context available (>= 3.4): default context with
#    SSLv3 re-enabled; CERT_NONE when certificate checks are disabled.
#  - otherwise: explicit SSLContext(PROTOCOL_SSLv23) with default CA paths.
611 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
612 if sys.version_info < (3, 2):
615 class HTTPSConnectionV3(httplib.HTTPSConnection):
616 def __init__(self, *args, **kwargs):
617 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
620 sock = socket.create_connection((self.host, self.port), self.timeout)
621 if getattr(self, '_tunnel_host', False):
625 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
627 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
629 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
630 def https_open(self, req):
631 return self.do_open(HTTPSConnectionV3, req)
632 return HTTPSHandlerV3(**kwargs)
633 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
634 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
635 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
636 if opts_no_check_certificate:
637 context.verify_mode = ssl.CERT_NONE
638 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
640 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
641 context.verify_mode = (ssl.CERT_NONE
642 if opts_no_check_certificate
643 else ssl.CERT_REQUIRED)
644 context.set_default_verify_paths()
646 context.load_default_certs()
647 except AttributeError:
649 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
# ExtractorError: carries the original traceback / cause / video id and
# appends a bug-report hint unless the error is 'expected' or network-ish.
651 class ExtractorError(Exception):
652 """Error during info extraction."""
653 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
654 """ tb, if given, is the original traceback (so that it can be printed out).
655 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
658 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
660 if video_id is not None:
661 msg = video_id + ': ' + msg
663 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
664 super(ExtractorError, self).__init__(msg)
667 self.exc_info = sys.exc_info() # preserve original exception
669 self.video_id = video_id
671 def format_traceback(self):
672 if self.traceback is None:
674 return u''.join(traceback.format_tb(self.traceback))
677 class RegexNotFoundError(ExtractorError):
678 """Error when a regex didn't match"""
682 class DownloadError(Exception):
683 """Download Error exception.
685 This exception may be thrown by FileDownloader objects if they are not
686 configured to continue on errors. They will contain the appropriate
689 def __init__(self, msg, exc_info=None):
690 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
691 super(DownloadError, self).__init__(msg)
692 self.exc_info = exc_info
695 class SameFileError(Exception):
696 """Same File exception.
698 This exception will be thrown by FileDownloader objects if they detect
699 multiple files would have to be downloaded to the same file on disk.
704 class PostProcessingError(Exception):
705 """Post Processing exception.
707 This exception may be raised by PostProcessor's .run() method to
708 indicate an error in the postprocessing task.
710 def __init__(self, msg):
713 class MaxDownloadsReached(Exception):
714 """ --max-downloads limit has been reached. """
718 class UnavailableVideoError(Exception):
719 """Unavailable Format exception.
721 This exception will be thrown when a video is requested
722 in a format that is not available for that video.
727 class ContentTooShortError(Exception):
728 """Content Too Short exception.
730 This exception may be raised by FileDownloader objects when a file they
731 download is too small for what the server announced first, indicating
732 the connection was probably interrupted.
738 def __init__(self, downloaded, expected):
739 self.downloaded = downloaded
740 self.expected = expected
# YoutubeDLHandler: adds std_headers, honours the internal
# 'Youtubedl-No-Compression'/'Youtubedl-user-agent' pseudo-headers, and
# transparently decompresses gzip/deflate responses (with a trailing-junk
# workaround for broken gzip streams).
742 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
743 """Handler for HTTP requests and responses.
745 This class, when installed with an OpenerDirector, automatically adds
746 the standard headers to every HTTP request and handles gzipped and
747 deflated responses from web servers. If compression is to be avoided in
748 a particular request, the original request in the program code only has
749 to include the HTTP header "Youtubedl-No-Compression", which will be
750 removed before making the real request.
752 Part of this code was copied from:
754 http://techknack.net/python-urllib2-handlers/
756 Andrew Rowls, the author of that code, agreed to release it to the
763 return zlib.decompress(data, -zlib.MAX_WBITS)
765 return zlib.decompress(data)
768 def addinfourl_wrapper(stream, headers, url, code):
769 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
770 return compat_urllib_request.addinfourl(stream, headers, url, code)
771 ret = compat_urllib_request.addinfourl(stream, headers, url)
775 def http_request(self, req):
776 for h, v in std_headers.items():
777 if h not in req.headers:
779 if 'Youtubedl-no-compression' in req.headers:
780 if 'Accept-encoding' in req.headers:
781 del req.headers['Accept-encoding']
782 del req.headers['Youtubedl-no-compression']
783 if 'Youtubedl-user-agent' in req.headers:
784 if 'User-agent' in req.headers:
785 del req.headers['User-agent']
786 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
787 del req.headers['Youtubedl-user-agent']
790 def http_response(self, req, resp):
793 if resp.headers.get('Content-encoding', '') == 'gzip':
794 content = resp.read()
795 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
797 uncompressed = io.BytesIO(gz.read())
798 except IOError as original_ioerror:
799 # There may be junk add the end of the file
800 # See http://stackoverflow.com/q/4928560/35070 for details
801 for i in range(1, 1024):
803 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
804 uncompressed = io.BytesIO(gz.read())
809 raise original_ioerror
810 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
811 resp.msg = old_resp.msg
813 if resp.headers.get('Content-encoding', '') == 'deflate':
814 gz = io.BytesIO(self.deflate(resp.read()))
815 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
816 resp.msg = old_resp.msg
819 https_request = http_request
820 https_response = http_response
# parse_iso8601(): ISO 8601 date string -> UNIX timestamp, honouring an
# optional trailing 'Z' or '+HH:MM'/'-HH:MM' timezone suffix.
823 def parse_iso8601(date_str, delimiter='T'):
824 """ Return a UNIX timestamp from the given date """
830 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
833 timezone = datetime.timedelta()
835 date_str = date_str[:-len(m.group(0))]
836 if not m.group('sign'):
837 timezone = datetime.timedelta()
839 sign = 1 if m.group('sign') == '+' else -1
840 timezone = datetime.timedelta(
841 hours=sign * int(m.group('hours')),
842 minutes=sign * int(m.group('minutes')))
843 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
844 dt = datetime.datetime.strptime(date_str, date_format) - timezone
845 return calendar.timegm(dt.timetuple())
# unified_strdate(): try a list of strptime formats, then fall back to
# email.utils RFC 2822 parsing, producing 'YYYYMMDD'.
848 def unified_strdate(date_str):
849 """Return a string with the date in the format YYYYMMDD"""
856 date_str = date_str.replace(',', ' ')
857 # %z (UTC offset) is only supported in python>=3.2
858 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
859 format_expressions = [
864 '%b %dst %Y %I:%M%p',
865 '%b %dnd %Y %I:%M%p',
866 '%b %dth %Y %I:%M%p',
876 '%Y-%m-%dT%H:%M:%SZ',
877 '%Y-%m-%dT%H:%M:%S.%fZ',
878 '%Y-%m-%dT%H:%M:%S.%f0Z',
880 '%Y-%m-%dT%H:%M:%S.%f',
883 for expression in format_expressions:
885 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
888 if upload_date is None:
889 timetuple = email.utils.parsedate_tz(date_str)
891 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
# determine_ext(): guess a file extension from the URL path.
894 def determine_ext(url, default_ext=u'unknown_video'):
897 guess = url.partition(u'?')[0].rpartition(u'.')[2]
898 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name: replace the media extension of
    *filename* with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join([stem, sub_lang, sub_format])
# NOTE(review): partial line-numbered dump; interior lines are elided.
# date_from_str(): parse 'YYYYMMDD' or relative forms like 'now+3days'.
906 def date_from_str(date_str):
908 Return a datetime object from a string in the format YYYYMMDD or
909 (now|today)[+-][0-9](day|week|month|year)(s)?"""
910 today = datetime.date.today()
911 if date_str == 'now'or date_str == 'today':
913 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
914 if match is not None:
915 sign = match.group('sign')
916 time = int(match.group('time'))
919 unit = match.group('unit')
928 delta = datetime.timedelta(**{unit: time})
930 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
# hyphenate_date(): 'YYYYMMDD' -> 'YYYY-MM-DD'.
932 def hyphenate_date(date_str):
934 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
935 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
936 if match is not None:
937 return '-'.join(match.groups())
# DateRange: inclusive interval of dates; open ends default to
# datetime.min/max.
941 class DateRange(object):
942 """Represents a time interval between two dates"""
943 def __init__(self, start=None, end=None):
944 """start and end must be strings in the format accepted by date"""
945 if start is not None:
946 self.start = date_from_str(start)
948 self.start = datetime.datetime.min.date()
950 self.end = date_from_str(end)
952 self.end = datetime.datetime.max.date()
953 if self.start > self.end:
954 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
957 """Returns a range that only contains the given day"""
959 def __contains__(self, date):
960 """Check if the date is in the range"""
961 if not isinstance(date, datetime.date):
962 date = date_from_str(date)
963 return self.start <= date <= self.end
965 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
# platform_name fragment (def line elided): platform.platform() as text.
969 """ Returns the platform name as a compat_str """
970 res = platform.platform()
971 if isinstance(res, bytes):
972 res = res.decode(preferredencoding())
974 assert isinstance(res, compat_str)
# _windows_write_string(): write to a real Windows console via
# WriteConsoleW so non-ANSI characters survive; returns False when the
# stream is not a console and normal writing should be used.
978 def _windows_write_string(s, out):
979 """ Returns True if the string was written using special methods,
980 False if it has yet to be written out."""
981 # Adapted from http://stackoverflow.com/a/3259271/35070
984 import ctypes.wintypes
992 fileno = out.fileno()
993 except AttributeError:
994 # If the output stream doesn't have a fileno, it's virtual
996 if fileno not in WIN_OUTPUT_IDS:
999 GetStdHandle = ctypes.WINFUNCTYPE(
1000 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1001 ("GetStdHandle", ctypes.windll.kernel32))
1002 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1004 WriteConsoleW = ctypes.WINFUNCTYPE(
1005 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1006 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1007 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1008 written = ctypes.wintypes.DWORD(0)
1010 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1011 FILE_TYPE_CHAR = 0x0002
1012 FILE_TYPE_REMOTE = 0x8000
1013 GetConsoleMode = ctypes.WINFUNCTYPE(
1014 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1015 ctypes.POINTER(ctypes.wintypes.DWORD))(
1016 ("GetConsoleMode", ctypes.windll.kernel32))
1017 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1019 def not_a_console(handle):
1020 if handle == INVALID_HANDLE_VALUE or handle is None:
1022 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1023 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1025 if not_a_console(h):
1028 def next_nonbmp_pos(s):
1030 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1031 except StopIteration:
# Write at most 1024 UTF-16 code units per call; a non-BMP character is
# written alone as a surrogate pair (count == 2).
1035 count = min(next_nonbmp_pos(s), 1024)
1037 ret = WriteConsoleW(
1038 h, s, count if count else 2, ctypes.byref(written), None)
1040 raise OSError('Failed to write string')
1041 if not count: # We just wrote a non-BMP character
1042 assert written.value == 2
1045 assert written.value > 0
1046 s = s[written.value:]
# write_string(): encoding-aware write to a (possibly byte-mode) stream,
# delegating to _windows_write_string for Windows consoles.
1050 def write_string(s, out=None, encoding=None):
1053 assert type(s) == compat_str
1055 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1056 if _windows_write_string(s, out):
1059 if ('b' in getattr(out, 'mode', '') or
1060 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1061 byt = s.encode(encoding or preferredencoding(), 'ignore')
1063 elif hasattr(out, 'buffer'):
1064 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1065 byt = s.encode(enc, 'ignore')
1066 out.buffer.write(byt)
# bytes <-> int-list converters, bridging Python 2/3 byte semantics.
1072 def bytes_to_intlist(bs):
1075 if isinstance(bs[0], int): # Python 3
1078 return [ord(c) for c in bs]
1081 def intlist_to_bytes(xs):
1084 if isinstance(chr(0), bytes): # Python 2
1085 return ''.join([chr(x) for x in xs])
# Cross-platform advisory file locking: LockFileEx/UnlockFileEx on
# Windows, fcntl.flock elsewhere.
1090 # Cross-platform file locking
1091 if sys.platform == 'win32':
1092 import ctypes.wintypes
1095 class OVERLAPPED(ctypes.Structure):
1097 ('Internal', ctypes.wintypes.LPVOID),
1098 ('InternalHigh', ctypes.wintypes.LPVOID),
1099 ('Offset', ctypes.wintypes.DWORD),
1100 ('OffsetHigh', ctypes.wintypes.DWORD),
1101 ('hEvent', ctypes.wintypes.HANDLE),
1104 kernel32 = ctypes.windll.kernel32
1105 LockFileEx = kernel32.LockFileEx
1106 LockFileEx.argtypes = [
1107 ctypes.wintypes.HANDLE, # hFile
1108 ctypes.wintypes.DWORD, # dwFlags
1109 ctypes.wintypes.DWORD, # dwReserved
1110 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1111 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1112 ctypes.POINTER(OVERLAPPED) # Overlapped
1114 LockFileEx.restype = ctypes.wintypes.BOOL
1115 UnlockFileEx = kernel32.UnlockFileEx
1116 UnlockFileEx.argtypes = [
1117 ctypes.wintypes.HANDLE, # hFile
1118 ctypes.wintypes.DWORD, # dwReserved
1119 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1120 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1121 ctypes.POINTER(OVERLAPPED) # Overlapped
1123 UnlockFileEx.restype = ctypes.wintypes.BOOL
1124 whole_low = 0xffffffff
1125 whole_high = 0x7fffffff
1127 def _lock_file(f, exclusive):
1128 overlapped = OVERLAPPED()
1129 overlapped.Offset = 0
1130 overlapped.OffsetHigh = 0
1131 overlapped.hEvent = 0
1132 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1133 handle = msvcrt.get_osfhandle(f.fileno())
1134 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1135 whole_low, whole_high, f._lock_file_overlapped_p):
1136 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1138 def _unlock_file(f):
1139 assert f._lock_file_overlapped_p
1140 handle = msvcrt.get_osfhandle(f.fileno())
1141 if not UnlockFileEx(handle, 0,
1142 whole_low, whole_high, f._lock_file_overlapped_p):
1143 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1148 def _lock_file(f, exclusive):
1149 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1151 def _unlock_file(f):
1152 fcntl.flock(f, fcntl.LOCK_UN)
# locked_file: context manager pairing io.open with _lock_file/_unlock_file
# (exclusive lock for any mode other than 'r').
1155 class locked_file(object):
1156 def __init__(self, filename, mode, encoding=None):
1157 assert mode in ['r', 'a', 'w']
1158 self.f = io.open(filename, mode, encoding=encoding)
1161 def __enter__(self):
1162 exclusive = self.mode != 'r'
1164 _lock_file(self.f, exclusive)
1170 def __exit__(self, etype, value, traceback):
1172 _unlock_file(self.f)
1179 def write(self, *args):
1180 return self.f.write(*args)
1182 def read(self, *args):
1183 return self.f.read(*args)
# shell_quote(): quote a list of (possibly byte-string) arguments.
1186 def shell_quote(args):
1188 encoding = sys.getfilesystemencoding()
1189 if encoding is None:
1192 if isinstance(a, bytes):
1193 # We may get a filename encoded with 'encodeFilename'
1194 a = a.decode(encoding)
1195 quoted_args.append(pipes.quote(a))
1196 return u' '.join(quoted_args)
1199 def takewhile_inclusive(pred, seq):
1200 """ Like itertools.takewhile, but include the latest evaluated element
1201 (the first element so that Not pred(e)) """
# smuggle_url / unsmuggle_url: tunnel extra JSON data through a URL
# fragment using the '__youtubedl_smuggle' key.
1208 def smuggle_url(url, data):
1209 """ Pass additional data in a URL for internal use. """
1211 sdata = compat_urllib_parse.urlencode(
1212 {u'__youtubedl_smuggle': json.dumps(data)})
1213 return url + u'#' + sdata
1216 def unsmuggle_url(smug_url, default=None):
1217 if not '#__youtubedl_smuggle' in smug_url:
1218 return smug_url, default
1219 url, _, sdata = smug_url.rpartition(u'#')
1220 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1221 data = json.loads(jsond)
# format_bytes(): human-readable size with binary (1024) suffixes.
1225 def format_bytes(bytes):
1228 if type(bytes) is str:
1229 bytes = float(bytes)
1233 exponent = int(math.log(bytes, 1024.0))
1234 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1235 converted = float(bytes) / float(1024 ** exponent)
1236 return u'%.2f%s' % (converted, suffix)
# get_term_width(): COLUMNS env var, falling back to a subprocess
# (presumably 'stty size' — the command line itself is elided here).
1239 def get_term_width():
1240 columns = os.environ.get('COLUMNS', None)
1245 sp = subprocess.Popen(
1247 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1248 out, err = sp.communicate()
1249 return int(out.split()[1])
# month_by_name(): English month name -> 1-based month number.
1255 def month_by_name(name):
1256 """ Return the number of a month by (locale-independently) English name """
1259 u'January', u'February', u'March', u'April', u'May', u'June',
1260 u'July', u'August', u'September', u'October', u'November', u'December']
1262 return ENGLISH_NAMES.index(name) + 1
# fix_xml_ampersands(): escape bare '&' that is not already an entity.
1267 def fix_xml_ampersands(xml_str):
1268 """Replace all the '&' by '&amp;' in XML"""
1270 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
# setproctitle(): best-effort PR_SET_NAME (prctl option 15) via libc.
1275 def setproctitle(title):
1276 assert isinstance(title, compat_str)
1278 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1281 title_bytes = title.encode('utf-8')
1282 buf = ctypes.create_string_buffer(len(title_bytes))
1283 buf.value = title_bytes
1285 libc.prctl(15, buf, 0, 0, 0)
1286 except AttributeError:
1287 return # Strange libc, just skip this
# remove_start / remove_end: strip a literal prefix / suffix if present.
1290 def remove_start(s, start):
1291 if s.startswith(start):
1292 return s[len(start):]
1296 def remove_end(s, end):
1298 return s[:-len(end)]
def url_basename(url):
    """Return the last path component of *url*'s path (query and fragment
    are not part of the path and are ignored by urlparse)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip(u'/').split(u'/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A Request subclass that issues an HTTP HEAD instead of GET."""
    def get_method(self):
        return "HEAD"
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int (first reading attribute `get_attr` when given),
    scaled by invscale/scale; return `default` when v is None."""
    if get_attr:
        if v is not None:
            # Missing attribute degrades to None -> default, not an error
            v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, passing *default* through when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: tolerates thousands
    separators (',' or '.') and a leading '+'; returns None for None. """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', u'', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale; *default* when None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string like u'9:12:43', u'120s' or u'1.5' into
    seconds (int, or float when a fractional part is present).

    Returns None for None input or an unrecognized format.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        # Hours can only appear when minutes did (see the regex nesting)
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (base, ext, real_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be executed.
    NOTE: args defaults to a shared list but is never mutated here. """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
class PagedList(object):
    """Lazily-fetched paginated sequence.

    pagefunc(pagenum) must return the (possibly empty) list of items for
    that zero-based page; pagesize is the number of items per full page.
    """
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return items [start:end), fetching only the pages that overlap."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
def uppercase_escape(s):
    """Decode uppercase \\UXXXXXXXX escape sequences found in s, leaving
    everything else (including lowercase \\u escapes) untouched."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
try:
    # Probe: in Python 2.6 (and some 2.7 versions), struct requires a bytes
    # argument and rejects unicode format strings with TypeError.
    struct.pack(u'!I', 0)
except TypeError:
    # Wrap struct.pack/unpack to encode unicode format specs to bytes first.
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Native struct accepts text format strings - use it directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file, one per line.

    Strips a UTF-8 BOM and surrounding whitespace, and skips blank lines
    and comment lines starting with '#', ';' or ']'. Closes batch_fd.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data (same arguments as urlencode) as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
try:
    # Element.iter exists on Python 2.7+/3.x and yields the element itself
    # followed by all descendants.
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse the XML document in string *s*, ignoring any DOCTYPE declaration,
    and return the root Element."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Python 2.6's XML() does not accept a parser keyword argument
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x: decode byte-string text nodes to unicode
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
# On Python 2/Windows, getpass.getpass chokes on unicode prompts, so encode
# the prompt with the preferred encoding first; everywhere else the stdlib
# implementation is used as-is.
if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parentheses, trailing ';')
    from *code*, returning the bare payload. Unwrapped input is returned
    unchanged."""
    pattern = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(pattern, r'\1', code)
def js_to_json(code):
    """Convert simple JavaScript object/array literals to valid JSON:
    quote bare and single-quoted keys/values, drop trailing commas in
    arrays. Not a full JS parser - handles the common embedded-page cases."""
    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            # Bare identifier key -> double-quote it
            key = '"%s"' % key

        value = m.group(4)
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return m.group(1) + key + m.group(3) + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    # Remove trailing commas before closing brackets
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.
    Returns a function mapping a quality id to its index in quality_ids,
    or -1 when the id is unknown. """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# Default output filename template (percent-style, filled from the info
# dict): "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1550 subprocess_check_output = subprocess.check_output
1551 except AttributeError:
1552 def subprocess_check_output(*args, **kwargs):
1553 assert 'input' not in kwargs
1554 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1555 output, _ = p.communicate()
1558 raise subprocess.CalledProcessError(ret, p.args, output=output)