2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
199 return "'" + s.replace("'", "'\"'\"'") + "'"
203 if type(c) is int: return c
# Type object for compiled regular expressions; older Pythons do not expose
# this type directly from the re module, so it is derived from an instance.
# Used for isinstance() checks where a pattern may arrive as str or compiled.
compiled_regex_type = type(re.compile(''))
210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
217 def preferredencoding():
218 """Get preferred encoding.
220 Returns the best encoding scheme for the system, based on
221 locale.getpreferredencoding() and some further tweaks.
224 pref = locale.getpreferredencoding()
231 if sys.version_info < (3,0):
233 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
236 assert type(s) == type(u'')
240 def write_json_file(obj, fn):
241 """ Encode obj as JSON and write it to fn, atomically """
245 'prefix': os.path.basename(fn) + '.',
246 'dir': os.path.dirname(fn),
250 # In Python 2.x, json.dump expects a bytestream.
251 # In Python 3.x, it writes to a character stream
252 if sys.version_info < (3, 0):
260 tf = tempfile.NamedTemporaryFile(**args)
265 os.rename(tf.name, fn)
274 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Return the first element under *node* matching ``xpath[@key='val']``.

    Returns None when nothing matches (the behaviour of Element.find).
    """
    # key/val are interpolated straight into the XPath expression, so they
    # are restricted to characters that cannot change its meaning.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    return node.find(xpath + u"[@%s='%s']" % (key, val))
282 def find_xpath_attr(node, xpath, key, val):
283 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
284 # .//node does not match if a node is a direct child of . !
285 if isinstance(xpath, unicode):
286 xpath = xpath.encode('ascii')
288 for f in node.findall(xpath):
289 if f.attrib.get(key) == val:
293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
294 # the namespace parameter
295 def xpath_with_ns(path, ns_map):
296 components = [c.split(':') for c in path.split('/')]
300 replaced.append(c[0])
303 replaced.append('{%s}%s' % (ns_map[ns], tag))
304 return '/'.join(replaced)
307 def xpath_text(node, xpath, name=None, fatal=False):
311 name = xpath if name is None else name
312 raise ExtractorError('Could not find XML element %s' % name)
318 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
319 class BaseHTMLParser(compat_html_parser.HTMLParser):
321 compat_html_parser.HTMLParser.__init__(self)
324 def loads(self, html):
329 class AttrParser(BaseHTMLParser):
330 """Modified HTMLParser that isolates a tag with the specified attribute"""
331 def __init__(self, attribute, value):
332 self.attribute = attribute
337 self.watch_startpos = False
339 BaseHTMLParser.__init__(self)
341 def error(self, message):
342 if self.error_count > 10 or self.started:
343 raise compat_html_parser.HTMLParseError(message, self.getpos())
344 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
345 self.error_count += 1
348 def handle_starttag(self, tag, attrs):
351 self.find_startpos(None)
352 if self.attribute in attrs and attrs[self.attribute] == self.value:
355 self.watch_startpos = True
357 if not tag in self.depth: self.depth[tag] = 0
360 def handle_endtag(self, tag):
362 if tag in self.depth: self.depth[tag] -= 1
363 if self.depth[self.result[0]] == 0:
365 self.result.append(self.getpos())
def find_startpos(self, x):
    """Needed to put the start position of the result (self.result[1])
    after the opening tag with the requested id"""
    # Armed by the start-tag handler; the very next parser event (text,
    # entity, comment, ...) necessarily begins after the opening tag, so its
    # position marks where the element's content starts.
    if self.watch_startpos:
        self.watch_startpos = False
        self.result.append(self.getpos())
# Alias every content-level parser callback to find_startpos so that
# whichever event fires first after the matched start tag records the
# content start position.
handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos
376 def get_result(self):
377 if self.result is None:
379 if len(self.result) != 3:
381 lines = self.html.split('\n')
382 lines = lines[self.result[1][0]-1:self.result[2][0]]
383 lines[0] = lines[0][self.result[1][1]:]
385 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
386 lines[-1] = lines[-1][:self.result[2][1]]
387 return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Some pages embed the literal "</scr'+'ipt>" inside <script> blocks, which
# trips up HTMLParser on Python < 2.7.3; skip over that literal instead of
# delegating to the (failing) stock end-tag parser.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the body of the element whose id attribute equals *id*.

    Thin convenience wrapper over get_element_by_attribute().
    """
    return get_element_by_attribute("id", id, html)
399 def get_element_by_attribute(attribute, value, html):
400 """Return the content of the tag with the specified attribute in the passed HTML document"""
401 parser = AttrParser(attribute, value)
404 except compat_html_parser.HTMLParseError:
406 return parser.get_result()
408 class MetaParser(BaseHTMLParser):
410 Modified HTMLParser that isolates a meta tag with the specified name
413 def __init__(self, name):
414 BaseHTMLParser.__init__(self)
419 def handle_starttag(self, tag, attrs):
423 if attrs.get('name') == self.name:
424 self.result = attrs.get('content')
426 def get_result(self):
429 def get_meta_content(name, html):
431 Return the content attribute from the meta tag with the given name attribute.
433 parser = MetaParser(name)
436 except compat_html_parser.HTMLParseError:
438 return parser.get_result()
441 def clean_html(html):
442 """Clean an HTML snippet into a readable string"""
444 html = html.replace('\n', ' ')
445 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
446 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
448 html = re.sub('<.*?>', '', html)
449 # Replace html entities
450 html = unescapeHTML(html)
454 def sanitize_open(filename, open_mode):
455 """Try to open the given filename, and slightly tweak it if this fails.
457 Attempts to open the given filename. If this fails, it tries to change
458 the filename slightly, step by step, until it's either able to open it
459 or it fails and raises a final exception, like the standard open()
462 It returns the tuple (stream, definitive_file_name).
466 if sys.platform == 'win32':
468 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
469 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
470 stream = open(encodeFilename(filename), open_mode)
471 return (stream, filename)
472 except (IOError, OSError) as err:
473 if err.errno in (errno.EACCES,):
476 # In case of error, try to remove win32 forbidden chars
477 alt_filename = os.path.join(
478 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
479 for path_part in os.path.split(filename)
481 if alt_filename == filename:
484 # An exception here should be caught in the caller
485 stream = open(encodeFilename(filename), open_mode)
486 return (stream, alt_filename)
489 def timeconvert(timestr):
490 """Convert RFC 2822 defined time string into system timestamp"""
492 timetuple = email.utils.parsedate_tz(timestr)
493 if timetuple is not None:
494 timestamp = email.utils.mktime_tz(timetuple)
497 def sanitize_filename(s, restricted=False, is_id=False):
498 """Sanitizes a string so it could be used as part of a filename.
499 If restricted is set, use a stricter subset of allowed characters.
500 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
502 def replace_insane(char):
503 if char == '?' or ord(char) < 32 or ord(char) == 127:
506 return '' if restricted else '\''
508 return '_-' if restricted else ' -'
509 elif char in '\\/|*<>':
511 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
513 if restricted and ord(char) > 127:
517 result = u''.join(map(replace_insane, s))
519 while '__' in result:
520 result = result.replace('__', '_')
521 result = result.strip('_')
522 # Common case of "Foreign band name - English song title"
523 if restricted and result.startswith('-_'):
529 def orderedSet(iterable):
530 """ Remove all duplicates from the input iterable """
538 def _htmlentity_transform(entity):
539 """Transforms an HTML entity to a character."""
540 # Known non-numeric HTML entity
541 if entity in compat_html_entities.name2codepoint:
542 return compat_chr(compat_html_entities.name2codepoint[entity])
544 mobj = re.match(r'#(x?[0-9]+)', entity)
546 numstr = mobj.group(1)
547 if numstr.startswith(u'x'):
549 numstr = u'0%s' % numstr
552 return compat_chr(int(numstr, base))
554 # Unknown entity in name, return its literal representation
555 return (u'&%s;' % entity)
561 assert type(s) == compat_str
564 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
567 def encodeFilename(s, for_subprocess=False):
569 @param s The name of the file
572 assert type(s) == compat_str
574 # Python 3 has a Unicode API
575 if sys.version_info >= (3, 0):
578 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
579 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
580 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
581 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
582 if not for_subprocess:
585 # For subprocess calls, encode with locale encoding
586 # Refer to http://stackoverflow.com/a/9951851/35070
587 encoding = preferredencoding()
589 encoding = sys.getfilesystemencoding()
592 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess.

    Text strings go straight through; legacy byte-string callers are decoded
    as ASCII first (to be made a hard error once all post processors are
    fixed to pass text).
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code path: byte strings from not-yet-converted callers.
    return encodeFilename(s.decode('ascii'), True)
604 def decodeOption(optval):
607 if isinstance(optval, bytes):
608 optval = optval.decode(preferredencoding())
610 assert isinstance(optval, compat_str)
613 def formatSeconds(secs):
615 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
617 return '%d:%02d' % (secs // 60, secs % 60)
622 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
623 if sys.version_info < (3, 2):
626 class HTTPSConnectionV3(httplib.HTTPSConnection):
627 def __init__(self, *args, **kwargs):
628 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
631 sock = socket.create_connection((self.host, self.port), self.timeout)
632 if getattr(self, '_tunnel_host', False):
636 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
638 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
640 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
641 def https_open(self, req):
642 return self.do_open(HTTPSConnectionV3, req)
643 return HTTPSHandlerV3(**kwargs)
644 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
645 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
646 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
647 if opts_no_check_certificate:
648 context.verify_mode = ssl.CERT_NONE
649 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
651 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
652 context.verify_mode = (ssl.CERT_NONE
653 if opts_no_check_certificate
654 else ssl.CERT_REQUIRED)
655 context.set_default_verify_paths()
657 context.load_default_certs()
658 except AttributeError:
660 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
662 class ExtractorError(Exception):
663 """Error during info extraction."""
664 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
665 """ tb, if given, is the original traceback (so that it can be printed out).
666 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
669 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
671 if video_id is not None:
672 msg = video_id + ': ' + msg
674 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
675 super(ExtractorError, self).__init__(msg)
678 self.exc_info = sys.exc_info() # preserve original exception
680 self.video_id = video_id
682 def format_traceback(self):
683 if self.traceback is None:
685 return u''.join(traceback.format_tb(self.traceback))
688 class RegexNotFoundError(ExtractorError):
689 """Error when a regex didn't match"""
693 class DownloadError(Exception):
694 """Download Error exception.
696 This exception may be thrown by FileDownloader objects if they are not
697 configured to continue on errors. They will contain the appropriate
700 def __init__(self, msg, exc_info=None):
701 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
702 super(DownloadError, self).__init__(msg)
703 self.exc_info = exc_info
706 class SameFileError(Exception):
707 """Same File exception.
709 This exception will be thrown by FileDownloader objects if they detect
710 multiple files would have to be downloaded to the same file on disk.
715 class PostProcessingError(Exception):
716 """Post Processing exception.
718 This exception may be raised by PostProcessor's .run() method to
719 indicate an error in the postprocessing task.
721 def __init__(self, msg):
724 class MaxDownloadsReached(Exception):
725 """ --max-downloads limit has been reached. """
729 class UnavailableVideoError(Exception):
730 """Unavailable Format exception.
732 This exception will be thrown when a video is requested
733 in a format that is not available for that video.
738 class ContentTooShortError(Exception):
739 """Content Too Short exception.
741 This exception may be raised by FileDownloader objects when a file they
742 download is too small for what the server announced first, indicating
743 the connection was probably interrupted.
749 def __init__(self, downloaded, expected):
750 self.downloaded = downloaded
751 self.expected = expected
753 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
754 """Handler for HTTP requests and responses.
756 This class, when installed with an OpenerDirector, automatically adds
757 the standard headers to every HTTP request and handles gzipped and
758 deflated responses from web servers. If compression is to be avoided in
759 a particular request, the original request in the program code only has
760 to include the HTTP header "Youtubedl-No-Compression", which will be
761 removed before making the real request.
763 Part of this code was copied from:
765 http://techknack.net/python-urllib2-handlers/
767 Andrew Rowls, the author of that code, agreed to release it to the
774 return zlib.decompress(data, -zlib.MAX_WBITS)
776 return zlib.decompress(data)
779 def addinfourl_wrapper(stream, headers, url, code):
780 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
781 return compat_urllib_request.addinfourl(stream, headers, url, code)
782 ret = compat_urllib_request.addinfourl(stream, headers, url)
786 def http_request(self, req):
787 for h, v in std_headers.items():
788 if h not in req.headers:
790 if 'Youtubedl-no-compression' in req.headers:
791 if 'Accept-encoding' in req.headers:
792 del req.headers['Accept-encoding']
793 del req.headers['Youtubedl-no-compression']
794 if 'Youtubedl-user-agent' in req.headers:
795 if 'User-agent' in req.headers:
796 del req.headers['User-agent']
797 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
798 del req.headers['Youtubedl-user-agent']
801 def http_response(self, req, resp):
804 if resp.headers.get('Content-encoding', '') == 'gzip':
805 content = resp.read()
806 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
808 uncompressed = io.BytesIO(gz.read())
809 except IOError as original_ioerror:
810 # There may be junk add the end of the file
811 # See http://stackoverflow.com/q/4928560/35070 for details
812 for i in range(1, 1024):
814 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
815 uncompressed = io.BytesIO(gz.read())
820 raise original_ioerror
821 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
822 resp.msg = old_resp.msg
824 if resp.headers.get('Content-encoding', '') == 'deflate':
825 gz = io.BytesIO(self.deflate(resp.read()))
826 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
827 resp.msg = old_resp.msg
830 https_request = http_request
831 https_response = http_response
834 def parse_iso8601(date_str, delimiter='T'):
835 """ Return a UNIX timestamp from the given date """
841 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
844 timezone = datetime.timedelta()
846 date_str = date_str[:-len(m.group(0))]
847 if not m.group('sign'):
848 timezone = datetime.timedelta()
850 sign = 1 if m.group('sign') == '+' else -1
851 timezone = datetime.timedelta(
852 hours=sign * int(m.group('hours')),
853 minutes=sign * int(m.group('minutes')))
854 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
855 dt = datetime.datetime.strptime(date_str, date_format) - timezone
856 return calendar.timegm(dt.timetuple())
859 def unified_strdate(date_str):
860 """Return a string with the date in the format YYYYMMDD"""
867 date_str = date_str.replace(',', ' ')
868 # %z (UTC offset) is only supported in python>=3.2
869 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
870 format_expressions = [
875 '%b %dst %Y %I:%M%p',
876 '%b %dnd %Y %I:%M%p',
877 '%b %dth %Y %I:%M%p',
887 '%Y-%m-%dT%H:%M:%SZ',
888 '%Y-%m-%dT%H:%M:%S.%fZ',
889 '%Y-%m-%dT%H:%M:%S.%f0Z',
891 '%Y-%m-%dT%H:%M:%S.%f',
894 for expression in format_expressions:
896 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
899 if upload_date is None:
900 timetuple = email.utils.parsedate_tz(date_str)
902 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
905 def determine_ext(url, default_ext=u'unknown_video'):
908 guess = url.partition(u'?')[0].rpartition(u'.')[2]
909 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: "<base>.<language>.<format>"."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
917 def date_from_str(date_str):
919 Return a datetime object from a string in the format YYYYMMDD or
920 (now|today)[+-][0-9](day|week|month|year)(s)?"""
921 today = datetime.date.today()
922 if date_str == 'now'or date_str == 'today':
924 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
925 if match is not None:
926 sign = match.group('sign')
927 time = int(match.group('time'))
930 unit = match.group('unit')
939 delta = datetime.timedelta(**{unit: time})
941 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
943 def hyphenate_date(date_str):
945 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
946 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
947 if match is not None:
948 return '-'.join(match.groups())
952 class DateRange(object):
953 """Represents a time interval between two dates"""
954 def __init__(self, start=None, end=None):
955 """start and end must be strings in the format accepted by date"""
956 if start is not None:
957 self.start = date_from_str(start)
959 self.start = datetime.datetime.min.date()
961 self.end = date_from_str(end)
963 self.end = datetime.datetime.max.date()
964 if self.start > self.end:
965 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
968 """Returns a range that only contains the given day"""
970 def __contains__(self, date):
971 """Check if the date is in the range"""
972 if not isinstance(date, datetime.date):
973 date = date_from_str(date)
974 return self.start <= date <= self.end
976 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
980 """ Returns the platform name as a compat_str """
981 res = platform.platform()
982 if isinstance(res, bytes):
983 res = res.decode(preferredencoding())
985 assert isinstance(res, compat_str)
989 def _windows_write_string(s, out):
990 """ Returns True if the string was written using special methods,
991 False if it has yet to be written out."""
992 # Adapted from http://stackoverflow.com/a/3259271/35070
995 import ctypes.wintypes
1003 fileno = out.fileno()
1004 except AttributeError:
1005 # If the output stream doesn't have a fileno, it's virtual
1007 if fileno not in WIN_OUTPUT_IDS:
1010 GetStdHandle = ctypes.WINFUNCTYPE(
1011 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1012 ("GetStdHandle", ctypes.windll.kernel32))
1013 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1015 WriteConsoleW = ctypes.WINFUNCTYPE(
1016 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1017 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1018 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1019 written = ctypes.wintypes.DWORD(0)
1021 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1022 FILE_TYPE_CHAR = 0x0002
1023 FILE_TYPE_REMOTE = 0x8000
1024 GetConsoleMode = ctypes.WINFUNCTYPE(
1025 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1026 ctypes.POINTER(ctypes.wintypes.DWORD))(
1027 ("GetConsoleMode", ctypes.windll.kernel32))
1028 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1030 def not_a_console(handle):
1031 if handle == INVALID_HANDLE_VALUE or handle is None:
1033 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1034 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1036 if not_a_console(h):
1039 def next_nonbmp_pos(s):
1041 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1042 except StopIteration:
1046 count = min(next_nonbmp_pos(s), 1024)
1048 ret = WriteConsoleW(
1049 h, s, count if count else 2, ctypes.byref(written), None)
1051 raise OSError('Failed to write string')
1052 if not count: # We just wrote a non-BMP character
1053 assert written.value == 2
1056 assert written.value > 0
1057 s = s[written.value:]
1061 def write_string(s, out=None, encoding=None):
1064 assert type(s) == compat_str
1066 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1067 if _windows_write_string(s, out):
1070 if ('b' in getattr(out, 'mode', '') or
1071 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1072 byt = s.encode(encoding or preferredencoding(), 'ignore')
1074 elif hasattr(out, 'buffer'):
1075 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1076 byt = s.encode(enc, 'ignore')
1077 out.buffer.write(byt)
1083 def bytes_to_intlist(bs):
1086 if isinstance(bs[0], int): # Python 3
1089 return [ord(c) for c in bs]
1092 def intlist_to_bytes(xs):
1095 if isinstance(chr(0), bytes): # Python 2
1096 return ''.join([chr(x) for x in xs])
1101 # Cross-platform file locking
1102 if sys.platform == 'win32':
1103 import ctypes.wintypes
1106 class OVERLAPPED(ctypes.Structure):
1108 ('Internal', ctypes.wintypes.LPVOID),
1109 ('InternalHigh', ctypes.wintypes.LPVOID),
1110 ('Offset', ctypes.wintypes.DWORD),
1111 ('OffsetHigh', ctypes.wintypes.DWORD),
1112 ('hEvent', ctypes.wintypes.HANDLE),
1115 kernel32 = ctypes.windll.kernel32
1116 LockFileEx = kernel32.LockFileEx
1117 LockFileEx.argtypes = [
1118 ctypes.wintypes.HANDLE, # hFile
1119 ctypes.wintypes.DWORD, # dwFlags
1120 ctypes.wintypes.DWORD, # dwReserved
1121 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1122 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1123 ctypes.POINTER(OVERLAPPED) # Overlapped
1125 LockFileEx.restype = ctypes.wintypes.BOOL
1126 UnlockFileEx = kernel32.UnlockFileEx
1127 UnlockFileEx.argtypes = [
1128 ctypes.wintypes.HANDLE, # hFile
1129 ctypes.wintypes.DWORD, # dwReserved
1130 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1131 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1132 ctypes.POINTER(OVERLAPPED) # Overlapped
1134 UnlockFileEx.restype = ctypes.wintypes.BOOL
1135 whole_low = 0xffffffff
1136 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    # LockFileEx needs an OVERLAPPED structure carrying the byte offset of
    # the region to lock; offset 0 with the whole_low/whole_high length
    # locks the entire file.
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Stash the pointer on the file object: it keeps the OVERLAPPED alive
    # and is reused verbatim by the matching _unlock_file() call.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    # dwFlags: 0x2 == LOCKFILE_EXCLUSIVE_LOCK, 0x0 requests a shared lock.
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    # UnlockFileEx must receive the same OVERLAPPED data that the lock call
    # stored on the file object; a missing pointer means we never locked it.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    if not UnlockFileEx(handle, 0,
                        whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1159 def _lock_file(f, exclusive):
1160 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1162 def _unlock_file(f):
1163 fcntl.flock(f, fcntl.LOCK_UN)
1166 class locked_file(object):
1167 def __init__(self, filename, mode, encoding=None):
1168 assert mode in ['r', 'a', 'w']
1169 self.f = io.open(filename, mode, encoding=encoding)
1172 def __enter__(self):
1173 exclusive = self.mode != 'r'
1175 _lock_file(self.f, exclusive)
1181 def __exit__(self, etype, value, traceback):
1183 _unlock_file(self.f)
1190 def write(self, *args):
1191 return self.f.write(*args)
1193 def read(self, *args):
1194 return self.f.read(*args)
1197 def shell_quote(args):
1199 encoding = sys.getfilesystemencoding()
1200 if encoding is None:
1203 if isinstance(a, bytes):
1204 # We may get a filename encoded with 'encodeFilename'
1205 a = a.decode(encoding)
1206 quoted_args.append(pipes.quote(a))
1207 return u' '.join(quoted_args)
1210 def takewhile_inclusive(pred, seq):
1211 """ Like itertools.takewhile, but include the latest evaluated element
1212 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload is JSON-encoded and tucked into the URL fragment under a
    # reserved key; the receiving side strips it back out with
    # unsmuggle_url(), which parses the fragment as a query string.
    sdata = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return url + u'#' + sdata
1227 def unsmuggle_url(smug_url, default=None):
1228 if not '#__youtubedl_smuggle' in smug_url:
1229 return smug_url, default
1230 url, _, sdata = smug_url.rpartition(u'#')
1231 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1232 data = json.loads(jsond)
1236 def format_bytes(bytes):
1239 if type(bytes) is str:
1240 bytes = float(bytes)
1244 exponent = int(math.log(bytes, 1024.0))
1245 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1246 converted = float(bytes) / float(1024 ** exponent)
1247 return u'%.2f%s' % (converted, suffix)
1250 def get_term_width():
1251 columns = os.environ.get('COLUMNS', None)
1256 sp = subprocess.Popen(
1258 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1259 out, err = sp.communicate()
1260 return int(out.split()[1])
1266 def month_by_name(name):
1267 """ Return the number of a month by (locale-independently) English name """
1270 u'January', u'February', u'March', u'April', u'May', u'June',
1271 u'July', u'August', u'September', u'October', u'November', u'December']
1273 return ENGLISH_NAMES.index(name) + 1
1278 def fix_xml_ampersands(xml_str):
1279 """Replace all the '&' by '&' in XML"""
1281 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1286 def setproctitle(title):
1287 assert isinstance(title, compat_str)
1289 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1292 title_bytes = title.encode('utf-8')
1293 buf = ctypes.create_string_buffer(len(title_bytes))
1294 buf.value = title_bytes
1296 libc.prctl(15, buf, 0, 0, 0)
1297 except AttributeError:
1298 return # Strange libc, just skip this
1301 def remove_start(s, start):
1302 if s.startswith(start):
1303 return s[len(start):]
def remove_end(s, end):
    """Return s with the suffix `end` removed (when present), else s unchanged."""
    # Fix: slicing unconditionally corrupted strings that do not end with
    # `end`, and an empty `end` would have emptied the string (s[:-0] == '').
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path component of `url` (no query/fragment, no slashes)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip(u'/').split(u'/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A Request subclass that issues an HTTP HEAD instead of the default GET."""
    def get_method(self):
        # urllib consults get_method() to choose the HTTP verb; as visible,
        # the method had no body and implicitly returned None.
        return "HEAD"
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """None-safe int coercion: optionally resolve attribute `get_attr` on v,
    then return int(v) * invscale // scale, or `default` when v is None."""
    if get_attr:
        if v is not None:
            # Missing attributes resolve to None and fall through to default.
            v = getattr(v, get_attr, None)
    # Fix: as visible, getattr ran unconditionally, raising TypeError when
    # get_attr is None and AttributeError-style failures for v is None.
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert v to a string via compat_str, or return `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips thousands separators
    (',', '.') and '+' signs before converting; returns None for None. """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', u'', int_str)
    # Fix: as visible, the cleaned value was never converted or returned.
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """None-safe float coercion: float(v) * invscale / scale, or `default` when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string like '123', '9:05', '1:23:45', '1.5s' into
    seconds (int, or float when a fractional part is present).
    Returns None for None input or an unparsable string."""
    if s is None:
        return None
    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
    if m is None:
        # Fix: as visible there was no match guard, so m.group() would
        # have raised on unparsable input instead of returning None.
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        # The regex only admits hours when minutes are also present.
        res += int(m.group('hours')) * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
def prepend_extension(filename, ext):
    """Insert `ext` between a filename's base name and its real extension:
    'a.mp4' + 'temp' -> 'a.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (base, ext, real_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: args defaults to a shared list but is never mutated here.
    try:
        # Launch once and discard the output; only launchability matters.
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Fix: as visible there was no handler/return, so a missing binary
        # raised and a present one yielded None instead of its name.
        return False
    return exe
class PagedList(object):
    """Lazily-evaluated list over a paginated source. pagefunc(pagenum)
    returns the entries of one page; pagesize is the entries-per-page count."""
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    # NOTE(review): the `def __len__(self):` header for the two lines below
    # appears to be elided from this view — confirm against the full file.
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        # NOTE(review): the accumulator initialisation (`res = []`) appears
        # to be elided from this view.
        for pagenum in itertools.count(start // self._pagesize):
            # Absolute entry indices covered by this page: [firstid, nextfirstid).
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                # NOTE(review): a `continue` (skip pages before the window)
                # is presumably elided here.
            page_results = list(self._pagefunc(pagenum))
            # NOTE(review): the fragments below look like the bodies of
            # conditional expressions assigning startv/endv (page-local
            # slice bounds); the assignment headers are elided — confirm.
                start % self._pagesize
                if firstid <= start < nextfirstid
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                # Trim this page down to the requested [start, end) window.
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                # NOTE(review): a `break` is presumably elided here.

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                # NOTE(review): a `break` (and a final `return res`) are
                # presumably elided from this view.
def uppercase_escape(s):
    """Expand literal \\UXXXXXXXX escape sequences in s into the characters
    they denote; all other text is passed through unchanged."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    # Fix: as visible, the re.sub call wrapping these arguments (and the
    # return) was missing, so the function returned None.
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
# Probe whether struct accepts a text-format spec.
# NOTE(review): the `try:` / `except TypeError:` / `else:` scaffolding for
# this compat shim appears to be elided from this view — confirm.
struct.pack(u'!I', 0)
# In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
    # Encode a unicode format spec to ASCII bytes before delegating.
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.pack(spec, *args)

def struct_unpack(spec, *args):
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.unpack(spec, *args)

# Modern interpreters: use the stdlib functions directly.
struct_pack = struct.pack
struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file object (one URL per line) and return the list of
    usable URLs, skipping comment lines and empty entries."""
    # NOTE(review): the inner `def fixup(url):` header and its trailing
    # `return False` / `return url` lines appear to be elided from this
    # view — the fragments below are its body. Confirm against the full file.
        # Decode raw bytes leniently so one bad line cannot abort the batch.
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 byte-order mark that survived decoding.
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with '#', ';' or ']' are treated as comments.
        if url.startswith(('#', ';', ']')):

    # Ensure the descriptor is closed even if iteration raises.
    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return ASCII bytes suitable for a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter exists from Python 2.7 on; older versions fall back to
# findall('.//*'), which walks all descendants.
# NOTE(review): the `try:` line matching the except below appears to be
# elided from this view — confirm.
etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
# NOTE(review): the enclosing `def parse_xml(s):` header and the final
# `return tree` appear to be elided from this view — confirm.
class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
    def doctype(self, name, pubid, system):
        pass  # Ignore doctypes

parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# The `parser` keyword for XML() is only accepted on 2.7+ (per the guard).
kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
# Fix up XML parser in Python 2.x
if sys.version_info < (3, 0):
    for n in etree_iter(tree):
        if n.text is not None:
            # Normalise byte-string text nodes to unicode.
            if not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
# On Python 2 under Windows, encode unicode prompts before handing them to
# getpass; everywhere else the stdlib function is used as-is.
if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
    # NOTE(review): the `else:` line for the assignment below appears to be
    # elided from this view — confirm.
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper, i.e. turn 'callback({...});' into '{...}'."""
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_re, r'\1', code)
def js_to_json(code):
    """Best-effort conversion of a JavaScript object literal into JSON text."""
    # NOTE(review): the inner callback header (`def fix_kv(m):`) and the
    # initial key/value group extractions are elided from this view — the
    # fragments below are its body. Confirm against the full file.
        # Re-quote single-quoted keys as double-quoted JSON keys.
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            # NOTE(review): bare (unquoted) keys are presumably wrapped in
            # double quotes here (line elided).

        # Same re-quoting for single-quoted values.
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        # Reassemble: leading punctuation + key + separator + value.
        return m.group(1) + key + m.group(3) + value

    # Rewrite each key/value pair via the callback; parts of the verbose
    # pattern (and the callback argument to re.sub) are elided from this view.
    res = re.sub(r'''(?x)
        ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
        ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
    # Drop trailing commas before a closing bracket.
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    # NOTE(review): a final `return res` is presumably elided.
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the preference list is the quality score; unknown
        # ids rank below everything else (-1) instead of raising.
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    # Return the scoring closure; as visible, the inner function header,
    # error handling and this return were missing (`qid` was undefined).
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
# Use the stdlib check_output when available (missing on Python 2.6).
# NOTE(review): the `try:` line for this compat shim appears to be elided
# from this view — confirm.
subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        # The `input` keyword of newer check_output is not supported here.
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        # NOTE(review): the exit-status retrieval (presumably `ret = p.poll()`
        # guarded by `if ret:`) and the final `return output` are elided.
        raise subprocess.CalledProcessError(ret, p.args, output=output)