2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
199 return "'" + s.replace("'", "'\"'\"'") + "'"
203 if type(c) is int: return c
207 # Environment variables should be decoded with filesystem encoding
208 # otherwise this results in issues like #3854 #2918 #3217
209 if sys.version_info >= (3, 0):
210 compat_getenv = os.getenv
211 compat_expanduser = os.path.expanduser
213 def compat_getenv(key, default=None):
214 env = os.getenv(key, default)
216 env = env.decode(get_filesystem_encoding())
219 def compat_expanduser(path):
220 """Expand ~ and ~user constructs.
222 If user or $HOME is unknown, do nothing."""
226 while i < n and path[i] not in '/\\':
229 if 'HOME' in os.environ:
230 userhome = compat_getenv('HOME')
231 elif 'USERPROFILE' in os.environ:
232 userhome = compat_getenv('USERPROFILE')
233 elif not 'HOMEPATH' in os.environ:
237 drive = compat_getenv('HOMEDRIVE')
240 userhome = os.path.join(drive, compat_getenv('HOMEPATH'))
243 userhome = os.path.join(os.path.dirname(userhome), path[1:i])
245 return userhome + path[i:]
248 # This is not clearly defined otherwise
249 compiled_regex_type = type(re.compile(''))
252 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
253 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
254 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
255 'Accept-Encoding': 'gzip, deflate',
256 'Accept-Language': 'en-us,en;q=0.5',
259 def preferredencoding():
260 """Get preferred encoding.
262 Returns the best encoding scheme for the system, based on
263 locale.getpreferredencoding() and some further tweaks.
266 pref = locale.getpreferredencoding()
273 if sys.version_info < (3,0):
275 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
278 assert type(s) == type(u'')
282 def write_json_file(obj, fn):
283 """ Encode obj as JSON and write it to fn, atomically """
287 'prefix': os.path.basename(fn) + '.',
288 'dir': os.path.dirname(fn),
292 # In Python 2.x, json.dump expects a bytestream.
293 # In Python 3.x, it writes to a character stream
294 if sys.version_info < (3, 0):
302 tf = tempfile.NamedTemporaryFile(**args)
307 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Only plain attribute names and simple values are supported; the
        # asserts keep callers from smuggling XPath syntax into the query.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
324 def find_xpath_attr(node, xpath, key, val):
325 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
326 # .//node does not match if a node is a direct child of . !
327 if isinstance(xpath, unicode):
328 xpath = xpath.encode('ascii')
330 for f in node.findall(xpath):
331 if f.attrib.get(key) == val:
335 # On python2.6 the xml.etree.ElementTree.Element methods don't support
336 # the namespace parameter
337 def xpath_with_ns(path, ns_map):
338 components = [c.split(':') for c in path.split('/')]
342 replaced.append(c[0])
345 replaced.append('{%s}%s' % (ns_map[ns], tag))
346 return '/'.join(replaced)
349 def xpath_text(node, xpath, name=None, fatal=False):
350 if sys.version_info < (2, 7): # Crazy 2.6
351 xpath = xpath.encode('ascii')
356 name = xpath if name is None else name
357 raise ExtractorError('Could not find XML element %s' % name)
363 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
364 class BaseHTMLParser(compat_html_parser.HTMLParser):
366 compat_html_parser.HTMLParser.__init__(self)
369 def loads(self, html):
374 class AttrParser(BaseHTMLParser):
375 """Modified HTMLParser that isolates a tag with the specified attribute"""
376 def __init__(self, attribute, value):
377 self.attribute = attribute
382 self.watch_startpos = False
384 BaseHTMLParser.__init__(self)
386 def error(self, message):
387 if self.error_count > 10 or self.started:
388 raise compat_html_parser.HTMLParseError(message, self.getpos())
389 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
390 self.error_count += 1
393 def handle_starttag(self, tag, attrs):
396 self.find_startpos(None)
397 if self.attribute in attrs and attrs[self.attribute] == self.value:
400 self.watch_startpos = True
402 if not tag in self.depth: self.depth[tag] = 0
405 def handle_endtag(self, tag):
407 if tag in self.depth: self.depth[tag] -= 1
408 if self.depth[self.result[0]] == 0:
410 self.result.append(self.getpos())
    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        # Runs as a generic parser-event hook: the first event fired after the
        # watched start tag marks where that tag's content begins.
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any parser event signals that the opening tag has ended, so every
    # handler is routed to find_startpos (the extra event argument is ignored).
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos
421 def get_result(self):
422 if self.result is None:
424 if len(self.result) != 3:
426 lines = self.html.split('\n')
427 lines = lines[self.result[1][0]-1:self.result[2][0]]
428 lines[0] = lines[0][self.result[1][1]:]
430 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
431 lines[-1] = lines[-1][:self.result[2][1]]
432 return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Older HTMLParser releases fail on the JavaScript-obfuscated "</scr'+'ipt>"
# end tag; detect that exact text and skip over it manually instead of
# delegating to the stock parse_endtag().
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: an id lookup is just an attribute lookup on "id".
    return get_element_by_attribute("id", id, html)
444 def get_element_by_attribute(attribute, value, html):
445 """Return the content of the tag with the specified attribute in the passed HTML document"""
446 parser = AttrParser(attribute, value)
449 except compat_html_parser.HTMLParseError:
451 return parser.get_result()
453 class MetaParser(BaseHTMLParser):
455 Modified HTMLParser that isolates a meta tag with the specified name
458 def __init__(self, name):
459 BaseHTMLParser.__init__(self)
464 def handle_starttag(self, tag, attrs):
468 if attrs.get('name') == self.name:
469 self.result = attrs.get('content')
471 def get_result(self):
474 def get_meta_content(name, html):
476 Return the content attribute from the meta tag with the given name attribute.
478 parser = MetaParser(name)
481 except compat_html_parser.HTMLParseError:
483 return parser.get_result()
486 def clean_html(html):
487 """Clean an HTML snippet into a readable string"""
489 html = html.replace('\n', ' ')
490 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
491 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
493 html = re.sub('<.*?>', '', html)
494 # Replace html entities
495 html = unescapeHTML(html)
499 def sanitize_open(filename, open_mode):
500 """Try to open the given filename, and slightly tweak it if this fails.
502 Attempts to open the given filename. If this fails, it tries to change
503 the filename slightly, step by step, until it's either able to open it
504 or it fails and raises a final exception, like the standard open()
507 It returns the tuple (stream, definitive_file_name).
511 if sys.platform == 'win32':
513 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
514 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
515 stream = open(encodeFilename(filename), open_mode)
516 return (stream, filename)
517 except (IOError, OSError) as err:
518 if err.errno in (errno.EACCES,):
521 # In case of error, try to remove win32 forbidden chars
522 alt_filename = os.path.join(
523 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
524 for path_part in os.path.split(filename)
526 if alt_filename == filename:
529 # An exception here should be caught in the caller
530 stream = open(encodeFilename(filename), open_mode)
531 return (stream, alt_filename)
534 def timeconvert(timestr):
535 """Convert RFC 2822 defined time string into system timestamp"""
537 timetuple = email.utils.parsedate_tz(timestr)
538 if timetuple is not None:
539 timestamp = email.utils.mktime_tz(timetuple)
542 def sanitize_filename(s, restricted=False, is_id=False):
543 """Sanitizes a string so it could be used as part of a filename.
544 If restricted is set, use a stricter subset of allowed characters.
545 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
547 def replace_insane(char):
548 if char == '?' or ord(char) < 32 or ord(char) == 127:
551 return '' if restricted else '\''
553 return '_-' if restricted else ' -'
554 elif char in '\\/|*<>':
556 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
558 if restricted and ord(char) > 127:
562 result = u''.join(map(replace_insane, s))
564 while '__' in result:
565 result = result.replace('__', '_')
566 result = result.strip('_')
567 # Common case of "Foreign band name - English song title"
568 if restricted and result.startswith('-_'):
574 def orderedSet(iterable):
575 """ Remove all duplicates from the input iterable """
583 def _htmlentity_transform(entity):
584 """Transforms an HTML entity to a character."""
585 # Known non-numeric HTML entity
586 if entity in compat_html_entities.name2codepoint:
587 return compat_chr(compat_html_entities.name2codepoint[entity])
589 mobj = re.match(r'#(x?[0-9]+)', entity)
591 numstr = mobj.group(1)
592 if numstr.startswith(u'x'):
594 numstr = u'0%s' % numstr
597 return compat_chr(int(numstr, base))
599 # Unknown entity in name, return its literal representation
600 return (u'&%s;' % entity)
606 assert type(s) == compat_str
609 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
612 def encodeFilename(s, for_subprocess=False):
614 @param s The name of the file
617 assert type(s) == compat_str
619 # Python 3 has a Unicode API
620 if sys.version_info >= (3, 0):
623 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
624 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
625 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
626 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
627 if not for_subprocess:
630 # For subprocess calls, encode with locale encoding
631 # Refer to http://stackoverflow.com/a/9951851/35070
632 encoding = preferredencoding()
634 encoding = sys.getfilesystemencoding()
637 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument like a filename, upgrading legacy
    byte strings to text first."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
649 def decodeOption(optval):
652 if isinstance(optval, bytes):
653 optval = optval.decode(preferredencoding())
655 assert isinstance(optval, compat_str)
658 def formatSeconds(secs):
660 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
662 return '%d:%02d' % (secs // 60, secs % 60)
667 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
668 if sys.version_info < (3, 2):
671 class HTTPSConnectionV3(httplib.HTTPSConnection):
672 def __init__(self, *args, **kwargs):
673 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
676 sock = socket.create_connection((self.host, self.port), self.timeout)
677 if getattr(self, '_tunnel_host', False):
681 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
683 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
685 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
686 def https_open(self, req):
687 return self.do_open(HTTPSConnectionV3, req)
688 return HTTPSHandlerV3(**kwargs)
689 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
690 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
691 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
692 if opts_no_check_certificate:
693 context.verify_mode = ssl.CERT_NONE
694 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
696 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
697 context.verify_mode = (ssl.CERT_NONE
698 if opts_no_check_certificate
699 else ssl.CERT_REQUIRED)
700 context.set_default_verify_paths()
702 context.load_default_certs()
703 except AttributeError:
705 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
707 class ExtractorError(Exception):
708 """Error during info extraction."""
709 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
710 """ tb, if given, is the original traceback (so that it can be printed out).
711 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
714 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
716 if video_id is not None:
717 msg = video_id + ': ' + msg
719 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
720 super(ExtractorError, self).__init__(msg)
723 self.exc_info = sys.exc_info() # preserve original exception
725 self.video_id = video_id
727 def format_traceback(self):
728 if self.traceback is None:
730 return u''.join(traceback.format_tb(self.traceback))
733 class RegexNotFoundError(ExtractorError):
734 """Error when a regex didn't match"""
738 class DownloadError(Exception):
739 """Download Error exception.
741 This exception may be thrown by FileDownloader objects if they are not
742 configured to continue on errors. They will contain the appropriate
745 def __init__(self, msg, exc_info=None):
746 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
747 super(DownloadError, self).__init__(msg)
748 self.exc_info = exc_info
751 class SameFileError(Exception):
752 """Same File exception.
754 This exception will be thrown by FileDownloader objects if they detect
755 multiple files would have to be downloaded to the same file on disk.
760 class PostProcessingError(Exception):
761 """Post Processing exception.
763 This exception may be raised by PostProcessor's .run() method to
764 indicate an error in the postprocessing task.
766 def __init__(self, msg):
769 class MaxDownloadsReached(Exception):
770 """ --max-downloads limit has been reached. """
774 class UnavailableVideoError(Exception):
775 """Unavailable Format exception.
777 This exception will be thrown when a video is requested
778 in a format that is not available for that video.
783 class ContentTooShortError(Exception):
784 """Content Too Short exception.
786 This exception may be raised by FileDownloader objects when a file they
787 download is too small for what the server announced first, indicating
788 the connection was probably interrupted.
794 def __init__(self, downloaded, expected):
795 self.downloaded = downloaded
796 self.expected = expected
798 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
799 """Handler for HTTP requests and responses.
801 This class, when installed with an OpenerDirector, automatically adds
802 the standard headers to every HTTP request and handles gzipped and
803 deflated responses from web servers. If compression is to be avoided in
804 a particular request, the original request in the program code only has
805 to include the HTTP header "Youtubedl-No-Compression", which will be
806 removed before making the real request.
808 Part of this code was copied from:
810 http://techknack.net/python-urllib2-handlers/
812 Andrew Rowls, the author of that code, agreed to release it to the
819 return zlib.decompress(data, -zlib.MAX_WBITS)
821 return zlib.decompress(data)
824 def addinfourl_wrapper(stream, headers, url, code):
825 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
826 return compat_urllib_request.addinfourl(stream, headers, url, code)
827 ret = compat_urllib_request.addinfourl(stream, headers, url)
831 def http_request(self, req):
832 for h, v in std_headers.items():
833 if h not in req.headers:
835 if 'Youtubedl-no-compression' in req.headers:
836 if 'Accept-encoding' in req.headers:
837 del req.headers['Accept-encoding']
838 del req.headers['Youtubedl-no-compression']
839 if 'Youtubedl-user-agent' in req.headers:
840 if 'User-agent' in req.headers:
841 del req.headers['User-agent']
842 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
843 del req.headers['Youtubedl-user-agent']
845 if sys.version_info < (2, 7) and '#' in req.get_full_url():
846 # Python 2.6 is brain-dead when it comes to fragments
847 req._Request__original = req._Request__original.partition('#')[0]
848 req._Request__r_type = req._Request__r_type.partition('#')[0]
852 def http_response(self, req, resp):
855 if resp.headers.get('Content-encoding', '') == 'gzip':
856 content = resp.read()
857 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
859 uncompressed = io.BytesIO(gz.read())
860 except IOError as original_ioerror:
861 # There may be junk add the end of the file
862 # See http://stackoverflow.com/q/4928560/35070 for details
863 for i in range(1, 1024):
865 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
866 uncompressed = io.BytesIO(gz.read())
871 raise original_ioerror
872 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
873 resp.msg = old_resp.msg
875 if resp.headers.get('Content-encoding', '') == 'deflate':
876 gz = io.BytesIO(self.deflate(resp.read()))
877 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
878 resp.msg = old_resp.msg
881 https_request = http_request
882 https_response = http_response
885 def parse_iso8601(date_str, delimiter='T'):
886 """ Return a UNIX timestamp from the given date """
892 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
895 timezone = datetime.timedelta()
897 date_str = date_str[:-len(m.group(0))]
898 if not m.group('sign'):
899 timezone = datetime.timedelta()
901 sign = 1 if m.group('sign') == '+' else -1
902 timezone = datetime.timedelta(
903 hours=sign * int(m.group('hours')),
904 minutes=sign * int(m.group('minutes')))
905 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
906 dt = datetime.datetime.strptime(date_str, date_format) - timezone
907 return calendar.timegm(dt.timetuple())
910 def unified_strdate(date_str):
911 """Return a string with the date in the format YYYYMMDD"""
918 date_str = date_str.replace(',', ' ')
919 # %z (UTC offset) is only supported in python>=3.2
920 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
921 format_expressions = [
926 '%b %dst %Y %I:%M%p',
927 '%b %dnd %Y %I:%M%p',
928 '%b %dth %Y %I:%M%p',
939 '%Y-%m-%dT%H:%M:%SZ',
940 '%Y-%m-%dT%H:%M:%S.%fZ',
941 '%Y-%m-%dT%H:%M:%S.%f0Z',
943 '%Y-%m-%dT%H:%M:%S.%f',
946 for expression in format_expressions:
948 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
951 if upload_date is None:
952 timetuple = email.utils.parsedate_tz(date_str)
954 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
957 def determine_ext(url, default_ext=u'unknown_video'):
960 guess = url.partition(u'?')[0].rpartition(u'.')[2]
961 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name as <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
969 def date_from_str(date_str):
971 Return a datetime object from a string in the format YYYYMMDD or
972 (now|today)[+-][0-9](day|week|month|year)(s)?"""
973 today = datetime.date.today()
974 if date_str == 'now'or date_str == 'today':
976 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
977 if match is not None:
978 sign = match.group('sign')
979 time = int(match.group('time'))
982 unit = match.group('unit')
991 delta = datetime.timedelta(**{unit: time})
993 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
995 def hyphenate_date(date_str):
997 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
998 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
999 if match is not None:
1000 return '-'.join(match.groups())
1004 class DateRange(object):
1005 """Represents a time interval between two dates"""
1006 def __init__(self, start=None, end=None):
1007 """start and end must be strings in the format accepted by date"""
1008 if start is not None:
1009 self.start = date_from_str(start)
1011 self.start = datetime.datetime.min.date()
1013 self.end = date_from_str(end)
1015 self.end = datetime.datetime.max.date()
1016 if self.start > self.end:
1017 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1020 """Returns a range that only contains the given day"""
1022 def __contains__(self, date):
1023 """Check if the date is in the range"""
1024 if not isinstance(date, datetime.date):
1025 date = date_from_str(date)
1026 return self.start <= date <= self.end
1028 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
1031 def platform_name():
1032 """ Returns the platform name as a compat_str """
1033 res = platform.platform()
1034 if isinstance(res, bytes):
1035 res = res.decode(preferredencoding())
1037 assert isinstance(res, compat_str)
1041 def _windows_write_string(s, out):
1042 """ Returns True if the string was written using special methods,
1043 False if it has yet to be written out."""
1044 # Adapted from http://stackoverflow.com/a/3259271/35070
1047 import ctypes.wintypes
1055 fileno = out.fileno()
1056 except AttributeError:
1057 # If the output stream doesn't have a fileno, it's virtual
1059 if fileno not in WIN_OUTPUT_IDS:
1062 GetStdHandle = ctypes.WINFUNCTYPE(
1063 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1064 ("GetStdHandle", ctypes.windll.kernel32))
1065 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1067 WriteConsoleW = ctypes.WINFUNCTYPE(
1068 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1069 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1070 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1071 written = ctypes.wintypes.DWORD(0)
1073 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1074 FILE_TYPE_CHAR = 0x0002
1075 FILE_TYPE_REMOTE = 0x8000
1076 GetConsoleMode = ctypes.WINFUNCTYPE(
1077 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1078 ctypes.POINTER(ctypes.wintypes.DWORD))(
1079 ("GetConsoleMode", ctypes.windll.kernel32))
1080 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1082 def not_a_console(handle):
1083 if handle == INVALID_HANDLE_VALUE or handle is None:
1085 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1086 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1088 if not_a_console(h):
1091 def next_nonbmp_pos(s):
1093 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1094 except StopIteration:
1098 count = min(next_nonbmp_pos(s), 1024)
1100 ret = WriteConsoleW(
1101 h, s, count if count else 2, ctypes.byref(written), None)
1103 raise OSError('Failed to write string')
1104 if not count: # We just wrote a non-BMP character
1105 assert written.value == 2
1108 assert written.value > 0
1109 s = s[written.value:]
1113 def write_string(s, out=None, encoding=None):
1116 assert type(s) == compat_str
1118 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1119 if _windows_write_string(s, out):
1122 if ('b' in getattr(out, 'mode', '') or
1123 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1124 byt = s.encode(encoding or preferredencoding(), 'ignore')
1126 elif hasattr(out, 'buffer'):
1127 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1128 byt = s.encode(enc, 'ignore')
1129 out.buffer.write(byt)
1135 def bytes_to_intlist(bs):
1138 if isinstance(bs[0], int): # Python 3
1141 return [ord(c) for c in bs]
1144 def intlist_to_bytes(xs):
1147 if isinstance(chr(0), bytes): # Python 2
1148 return ''.join([chr(x) for x in xs])
1153 # Cross-platform file locking
1154 if sys.platform == 'win32':
1155 import ctypes.wintypes
1158 class OVERLAPPED(ctypes.Structure):
1160 ('Internal', ctypes.wintypes.LPVOID),
1161 ('InternalHigh', ctypes.wintypes.LPVOID),
1162 ('Offset', ctypes.wintypes.DWORD),
1163 ('OffsetHigh', ctypes.wintypes.DWORD),
1164 ('hEvent', ctypes.wintypes.HANDLE),
1167 kernel32 = ctypes.windll.kernel32
1168 LockFileEx = kernel32.LockFileEx
1169 LockFileEx.argtypes = [
1170 ctypes.wintypes.HANDLE, # hFile
1171 ctypes.wintypes.DWORD, # dwFlags
1172 ctypes.wintypes.DWORD, # dwReserved
1173 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1174 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1175 ctypes.POINTER(OVERLAPPED) # Overlapped
1177 LockFileEx.restype = ctypes.wintypes.BOOL
1178 UnlockFileEx = kernel32.UnlockFileEx
1179 UnlockFileEx.argtypes = [
1180 ctypes.wintypes.HANDLE, # hFile
1181 ctypes.wintypes.DWORD, # dwReserved
1182 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1183 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1184 ctypes.POINTER(OVERLAPPED) # Overlapped
1186 UnlockFileEx.restype = ctypes.wintypes.BOOL
1187 whole_low = 0xffffffff
1188 whole_high = 0x7fffffff
    def _lock_file(f, exclusive):
        # LockFileEx takes an OVERLAPPED structure describing the byte range
        # to lock; lock the whole file starting at offset 0.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stash the pointer on the file object so the structure stays alive
        # for the lock's lifetime and is reachable again in _unlock_file.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK; flags 0 requests a shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())
    def _unlock_file(f):
        # The OVERLAPPED pointer stored by _lock_file identifies the locked
        # byte range; without it the region cannot be released.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
    def _lock_file(f, exclusive):
        # flock() takes an exclusive (write) or shared (read) advisory lock.
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
    def _unlock_file(f):
        # Release the advisory lock held on the file descriptor.
        fcntl.flock(f, fcntl.LOCK_UN)
1218 class locked_file(object):
1219 def __init__(self, filename, mode, encoding=None):
1220 assert mode in ['r', 'a', 'w']
1221 self.f = io.open(filename, mode, encoding=encoding)
1224 def __enter__(self):
1225 exclusive = self.mode != 'r'
1227 _lock_file(self.f, exclusive)
1233 def __exit__(self, etype, value, traceback):
1235 _unlock_file(self.f)
    def write(self, *args):
        # Delegate directly to the wrapped file object.
        return self.f.write(*args)
    def read(self, *args):
        # Delegate directly to the wrapped file object.
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when the
    interpreter reports None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1254 def shell_quote(args):
1256 encoding = get_filesystem_encoding()
1258 if isinstance(a, bytes):
1259 # We may get a filename encoded with 'encodeFilename'
1260 a = a.decode(encoding)
1261 quoted_args.append(pipes.quote(a))
1262 return u' '.join(quoted_args)
1265 def takewhile_inclusive(pred, seq):
1266 """ Like itertools.takewhile, but include the latest evaluated element
1267 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Serialize the payload as JSON and append it as a URL-encoded fragment.
    payload = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'#'.join((url, payload))
1282 def unsmuggle_url(smug_url, default=None):
1283 if not '#__youtubedl_smuggle' in smug_url:
1284 return smug_url, default
1285 url, _, sdata = smug_url.rpartition(u'#')
1286 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1287 data = json.loads(jsond)
def format_bytes(bytes):
    """Format a byte count as a human-readable string (e.g. 1024 -> '1.00KiB').

    Restores the None guard and the zero branch; without the latter
    math.log(0) raises ValueError.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
def get_term_width():
    """Return the terminal width in columns, or None when undeterminable.

    Restores the COLUMNS fast path, the stty command, and the error fallback.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        # `stty size` prints "rows columns"
        return int(out.split()[1])
    except Exception:
        return None
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    try:
        return ENGLISH_NAMES.index(name) + 1
    except ValueError:
        # Unknown name -> None rather than an exception
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities alone."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        u'&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process title via prctl(PR_SET_NAME) on Linux.

    Restores the two try blocks, and sizes the buffer one byte larger than
    the title: create_string_buffer must leave room for the trailing NUL or
    the .value assignment raises ValueError.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return  # Not a glibc system; silently skip
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the prefix `start`; restores the lost fallthrough
    that returns s unchanged when the prefix is absent."""
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s without the suffix `end`; s unchanged when the suffix is absent.

    Guards against an empty `end`: s[:-0] would wrongly return ''.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path component of url, ignoring query and fragment."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip(u'/').split(u'/')
    return components[-1]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues HEAD instead of GET.

    Restores the lost return statement; without it get_method() returned None.
    """
    def get_method(self):
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int (scaled by invscale/scale) or return default when None.

    Restores the get_attr indirection guard and the ''-to-None normalization.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Stringify v, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Restores the None guard and final conversion that had been lost.
    if int_str is None:
        return None
    # Strip thousands separators before converting
    int_str = re.sub(r'[,\.\+]', u'', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale/scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string ('1:30', '9:12:43', '90 s', ...) into seconds.

    Returns None for None input or unrecognized formats. Restores the None
    guard, the re.match call, the group guards and the final return.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
def prepend_extension(filename, ext):
    """Insert ext before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    name, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (name, ext, real_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # args is never mutated, so the mutable default is safe here.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable
        return False
    return exe
class PagedList(object):
    """Abstract base for lazily paged result lists; subclasses implement getslice."""
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1444 class OnDemandPagedList(PagedList):
1445 def __init__(self, pagefunc, pagesize):
1446 self._pagefunc = pagefunc
1447 self._pagesize = pagesize
1449 def getslice(self, start=0, end=None):
1451 for pagenum in itertools.count(start // self._pagesize):
1452 firstid = pagenum * self._pagesize
1453 nextfirstid = pagenum * self._pagesize + self._pagesize
1454 if start >= nextfirstid:
1457 page_results = list(self._pagefunc(pagenum))
1460 start % self._pagesize
1461 if firstid <= start < nextfirstid
1465 ((end - 1) % self._pagesize) + 1
1466 if (end is not None and firstid <= end <= nextfirstid)
1469 if startv != 0 or endv is not None:
1470 page_results = page_results[startv:endv]
1471 res.extend(page_results)
1473 # A little optimization - if current page is not "full", ie. does
1474 # not contain page_size videos then we can assume that this page
1475 # is the last one - there are no more ids on further pages -
1476 # i.e. no need to query again.
1477 if len(page_results) + startv < self._pagesize:
1480 # If we got the whole page, but the next page is not interesting,
1481 # break out early as well
1482 if end == nextfirstid:
1487 class InAdvancePagedList(PagedList):
1488 def __init__(self, pagefunc, pagecount, pagesize):
1489 self._pagefunc = pagefunc
1490 self._pagecount = pagecount
1491 self._pagesize = pagesize
1493 def getslice(self, start=0, end=None):
1495 start_page = start // self._pagesize
1497 self._pagecount if end is None else (end // self._pagesize + 1))
1498 skip_elems = start - start_page * self._pagesize
1499 only_more = None if end is None else end - start
1500 for pagenum in range(start_page, end_page):
1501 page = list(self._pagefunc(pagenum))
1503 page = page[skip_elems:]
1505 if only_more is not None:
1506 if len(page) < only_more:
1507 only_more -= len(page)
1509 page = page[:only_more]
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences in s; restores the lost re.sub call."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    reserved = "%/;:@&=+$,!~*'()?#[]"
    # Python 2's quote() needs UTF-8 bytes, not a unicode object; the
    # short-circuit keeps `unicode` unevaluated on Python 3.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, reserved)
def escape_url(url):
    """Escape URL as suggested by RFC 3986.

    Restores the lost .geturl() call that rebuilds and returns the URL.
    """
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Restore the try/except/else scaffolding that selects the struct shims.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file and return its URLs, skipping blanks and comments.

    Restores the lost fixup helper scaffolding; also strips a decoded BOM
    (u'\\ufeff') in addition to the raw UTF-8 BOM bytes the original checked.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Either the raw UTF-8 BOM bytes (mis-decoded) or the real BOM char
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes for a request body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter exists from Python 2.7 on; restore the lost try: header.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse an XML document from a text string, ignoring any doctype.

    Restores the enclosing function header and the final return that had
    been lost around the TreeBuilder helper.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
# On Python 2 + Windows, getpass chokes on unicode prompts; elsewhere use the
# stock implementation. The lost else: branch meant compat_getpass was never
# bound on Python 3.
if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the bare JSON payload."""
    callback_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(callback_re, r'\1', code)
def js_to_json(code):
    """Convert a simple JavaScript object literal into valid JSON.

    Quotes bare and single-quoted keys/values and removes trailing commas.
    Restores the lost fix_kv helper scaffolding; the trailing-comma cleanup
    now also covers '}' (the visible version only handled ']').
    """
    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            key = '"%s"' % key

        value = m.group(4)
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return m.group(1) + key + m.group(3) + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # Restores the inner closure and error fallback that had been lost.
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown qualities sort below all known ones
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
# Restore the try:/returncode-check scaffolding around the Python 2.6 shim.
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    # Python 2.6 has no subprocess.check_output; emulate it
    def subprocess_check_output(*args, **kwargs):
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.returncode
        if ret:
            raise subprocess.CalledProcessError(ret, p.args, output=output)
        return output
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    # Restores the None guard, the ELLIPSES constant and the fallthrough
    # return for strings that already fit.
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s