#!/usr/bin/env python
# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
import calendar
import codecs
import contextlib
import datetime
import email.utils
import errno
-import getpass
import gzip
import itertools
import io
compat_chr,
compat_getenv,
compat_html_entities,
- compat_html_parser,
compat_parse_qs,
compat_str,
compat_urllib_error,
"""
try:
pref = locale.getpreferredencoding()
- u'TEST'.encode(pref)
+ 'TEST'.encode(pref)
except:
pref = 'UTF-8'
def write_json_file(obj, fn):
- """ Encode obj as JSON and write it to fn, atomically """
+ """ Encode obj as JSON and write it to fn, atomically if possible """
+
+ fn = encodeFilename(fn)
+ if sys.version_info < (3, 0) and sys.platform != 'win32':
+ encoding = get_filesystem_encoding()
+ # os.path.basename returns a bytes object, but NamedTemporaryFile
+ # will fail if the filename contains non ascii characters unless we
+ # use a unicode object
+ path_basename = lambda f: os.path.basename(fn).decode(encoding)
+ # the same for os.path.dirname
+ path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
+ else:
+ path_basename = os.path.basename
+ path_dirname = os.path.dirname
args = {
'suffix': '.tmp',
- 'prefix': os.path.basename(fn) + '.',
- 'dir': os.path.dirname(fn),
+ 'prefix': path_basename(fn) + '.',
+ 'dir': path_dirname(fn),
'delete': False,
}
try:
with tf:
json.dump(obj, tf)
+ if sys.platform == 'win32':
+ # Need to remove existing file on Windows, else os.rename raises
+ # WindowsError or FileExistsError.
+ try:
+ os.unlink(fn)
+ except OSError:
+ pass
os.rename(tf.name, fn)
except:
try:
return n.text
-compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class BaseHTMLParser(compat_html_parser.HTMLParser):
- def __init(self):
- compat_html_parser.HTMLParser.__init__(self)
- self.html = None
-
- def loads(self, html):
- self.html = html
- self.feed(html)
- self.close()
-
-class AttrParser(BaseHTMLParser):
- """Modified HTMLParser that isolates a tag with the specified attribute"""
- def __init__(self, attribute, value):
- self.attribute = attribute
- self.value = value
- self.result = None
- self.started = False
- self.depth = {}
- self.watch_startpos = False
- self.error_count = 0
- BaseHTMLParser.__init__(self)
-
- def error(self, message):
- if self.error_count > 10 or self.started:
- raise compat_html_parser.HTMLParseError(message, self.getpos())
- self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
- self.error_count += 1
- self.goahead(1)
-
- def handle_starttag(self, tag, attrs):
- attrs = dict(attrs)
- if self.started:
- self.find_startpos(None)
- if self.attribute in attrs and attrs[self.attribute] == self.value:
- self.result = [tag]
- self.started = True
- self.watch_startpos = True
- if self.started:
- if not tag in self.depth: self.depth[tag] = 0
- self.depth[tag] += 1
-
- def handle_endtag(self, tag):
- if self.started:
- if tag in self.depth: self.depth[tag] -= 1
- if self.depth[self.result[0]] == 0:
- self.started = False
- self.result.append(self.getpos())
-
- def find_startpos(self, x):
- """Needed to put the start position of the result (self.result[1])
- after the opening tag with the requested id"""
- if self.watch_startpos:
- self.watch_startpos = False
- self.result.append(self.getpos())
- handle_entityref = handle_charref = handle_data = handle_comment = \
- handle_decl = handle_pi = unknown_decl = find_startpos
-
- def get_result(self):
- if self.result is None:
- return None
- if len(self.result) != 3:
- return None
- lines = self.html.split('\n')
- lines = lines[self.result[1][0]-1:self.result[2][0]]
- lines[0] = lines[0][self.result[1][1]:]
- if len(lines) == 1:
- lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
- lines[-1] = lines[-1][:self.result[2][1]]
- return '\n'.join(lines).strip()
-# Hack for https://github.com/rg3/youtube-dl/issues/662
-if sys.version_info < (2, 7, 3):
- AttrParser.parse_endtag = (lambda self, i:
- i + len("</scr'+'ipt>")
- if self.rawdata[i:].startswith("</scr'+'ipt>")
- else compat_html_parser.HTMLParser.parse_endtag(self, i))
-
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is simply an attribute lookup on the "id" attribute.
    return get_element_by_attribute("id", id, html)
+
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Searches html for the first tag carrying attribute=value (quoted or
    unquoted) and returns the text between its opening tag and the next
    closing tag of the same name, passed through unescapeHTML.
    Returns None if no such tag is found.
    """
    # Other attributes on the tag may have an unquoted, quoted, empty or
    # entirely missing value (e.g. alt="" or a bare "disabled"); all of these
    # must be skipped over, otherwise the tag would not match at all.
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    # Content that starts with a quote character has its first and last
    # characters stripped before unescaping.
    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)
def clean_html(html):
It returns the tuple (stream, definitive_file_name).
"""
try:
- if filename == u'-':
+ if filename == '-':
if sys.platform == 'win32':
import msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
# In case of error, try to remove win32 forbidden chars
alt_filename = os.path.join(
- re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
+ re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
for path_part in os.path.split(filename)
)
if alt_filename == filename:
return '_'
return char
- result = u''.join(map(replace_insane, s))
+ result = ''.join(map(replace_insane, s))
if not is_id:
while '__' in result:
result = result.replace('__', '_')
mobj = re.match(r'#(x?[0-9]+)', entity)
if mobj is not None:
numstr = mobj.group(1)
- if numstr.startswith(u'x'):
+ if numstr.startswith('x'):
base = 16
- numstr = u'0%s' % numstr
+ numstr = '0%s' % numstr
else:
base = 10
return compat_chr(int(numstr, base))
# Unknown entity in name, return its literal representation
- return (u'&%s;' % entity)
+ return ('&%s;' % entity)
def unescapeHTML(s):
return s
if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
- # Pass u'' directly to use Unicode APIs on Windows 2000 and up
+ # Pass the unicode string directly to use Unicode APIs on Windows 2000 and up
# (Detecting Windows NT 4 is tricky because 'major >= 4' would
# match Windows 9x series as well. Besides, NT 4 is obsolete.)
if not for_subprocess:
pass # Python < 3.4
return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
+
class ExtractorError(Exception):
"""Error during info extraction."""
def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
if video_id is not None:
msg = video_id + ': ' + msg
if cause:
- msg += u' (caused by %r)' % cause
+ msg += ' (caused by %r)' % cause
if not expected:
- msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
+ if ytdl_is_updateable():
+ update_cmd = 'type youtube-dl -U to update'
+ else:
+ update_cmd = 'see https://yt-dl.org/update on how to update'
+ msg += '; please report this issue on https://yt-dl.org/bug .'
+ msg += ' Make sure you are using the latest version; %s.' % update_cmd
+ msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
super(ExtractorError, self).__init__(msg)
self.traceback = tb
def format_traceback(self):
if self.traceback is None:
return None
- return u''.join(traceback.format_tb(self.traceback))
+ return ''.join(traceback.format_tb(self.traceback))
class RegexNotFoundError(ExtractorError):
upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
return upload_date
def determine_ext(url, default_ext='unknown_video'):
    """ Guess the file extension from a URL.

    Returns the text after the last '.' of the URL (ignoring any query
    string and fragment) if it is purely alphanumeric, default_ext otherwise
    or when url is None.
    """
    if url is None:
        return default_ext
    # Neither the fragment nor the query string belongs to the file name;
    # strip both so that e.g. 'video.webm#t=10' still yields 'webm'.
    path = url.partition('#')[0].partition('?')[0]
    guess = path.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """ Build the subtitle file name for a media file.

    The extension of filename is replaced by '.<sub_lang>.<sub_format>'.
    """
    # os.path.splitext only considers a dot in the last path component, so a
    # dotted directory name (e.g. 'my.dir/video') is left intact.
    base = os.path.splitext(filename)[0]
    return base + '.' + sub_lang + '.' + sub_format
def date_from_str(date_str):
"""
def intlist_to_bytes(xs):
    """ Convert a sequence of integers into a byte string.

    Each item is packed with the struct format code 'B' (unsigned byte).
    An empty or None input yields b''.
    """
    if not xs:
        return b''
    # struct_pack is this module's wrapper around struct.pack; going through
    # it avoids a separate Python 2 / Python 3 code path here.
    return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
# We may get a filename encoded with 'encodeFilename'
a = a.decode(encoding)
quoted_args.append(pipes.quote(a))
- return u' '.join(quoted_args)
+ return ' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
""" Pass additional data in a URL for internal use. """
sdata = compat_urllib_parse.urlencode(
- {u'__youtubedl_smuggle': json.dumps(data)})
- return url + u'#' + sdata
+ {'__youtubedl_smuggle': json.dumps(data)})
+ return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """ Extract data smuggled into the fragment of a URL.

    Returns a (url, data) tuple. If smug_url carries no
    '#__youtubedl_smuggle' marker, it is returned unchanged together with
    default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # Everything after the last '#' is the urlencoded payload.
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """ Format a byte count as a human-readable string with a binary suffix.

    bytes may be a number, a numeric string or None. Returns 'N/A' for None,
    otherwise the value scaled by a power of 1024 with two decimals and the
    matching suffix, e.g. '1.50KiB'.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix table: huge values reuse 'YiB'
        # instead of raising IndexError, and tiny fractions stay in 'B'
        # instead of wrapping around to the end of the list.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
def get_term_width():
""" Return the number of a month by (locale-independently) English name """
ENGLISH_NAMES = [
- u'January', u'February', u'March', u'April', u'May', u'June',
- u'July', u'August', u'September', u'October', u'November', u'December']
+ 'January', 'February', 'March', 'April', 'May', 'June',
+ 'July', 'August', 'September', 'October', 'November', 'December']
try:
return ENGLISH_NAMES.index(name) + 1
except ValueError:
"""Replace all the '&' by '&' in XML"""
return re.sub(
r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
- u'&',
+ '&',
xml_str)
def url_basename(url):
    """ Return the last '/'-separated component of the path of url """
    path = compat_urlparse.urlparse(url).path
    # Strip surrounding slashes first so a trailing '/' does not produce an
    # empty last component.
    return path.strip('/').split('/')[-1]
class HEADRequest(compat_urllib_request.Request):
""" A more relaxed version of int_or_none """
if int_str is None:
return None
- int_str = re.sub(r'[,\.\+]', u'', int_str)
+ int_str = re.sub(r'[,\.\+]', '', int_str)
return int(int_str)
s = s.strip()
m = re.match(
- r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
+ r'''(?ix)T?
+ (?:
+ (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
+ (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
+ )?
+ (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''', s)
if not m:
return None
res = int(m.group('secs'))
def prepend_extension(filename, ext):
    """ Insert ext in front of the real extension of filename.

    For example ('video.mp4', 'temp') gives 'video.temp.mp4'; a filename
    without an extension simply gets '.<ext>' appended.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
def check_executable(exe, args=[]):
def get_exe_version(exe, args=['--version'],
version_re=r'version\s+([0-9._-a-zA-Z]+)',
- unrecognized=u'present'):
+ unrecognized='present'):
""" Returns the version of the specified executable,
or False if the executable is not present """
try:
"""Escape non-ASCII characters as suggested by RFC 3986"""
if sys.version_info < (3, 0) and isinstance(s, unicode):
s = s.encode('utf-8')
- return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
+ return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
).geturl()
try:
- struct.pack(u'!I', 0)
+ struct.pack('!I', 0)
except TypeError:
# In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
def fixup(url):
if not isinstance(url, compat_str):
url = url.decode('utf-8', 'replace')
- BOM_UTF8 = u'\xef\xbb\xbf'
+ BOM_UTF8 = '\xef\xbb\xbf'
if url.startswith(BOM_UTF8):
url = url[len(BOM_UTF8):]
url = url.strip()
def strip_jsonp(code):
    """ Strip a JSONP wrapper and return the wrapped payload.

    Removes a leading 'callback(' and the trailing ')', an optional ';' and
    any trailing '//' comments. Input that does not look like JSONP is
    returned unchanged.
    """
    # Callback names may be dotted or contain '$' (e.g. 'ps.embedHandler',
    # '$.callback'), so both characters are accepted in the name.
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
return version_tuple(version) < version_tuple(limit)
except ValueError:
return not assume_new
+
+
def ytdl_is_updateable():
    """ Return whether youtube-dl can be updated with -U """
    # Imported locally: only this check needs it.
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter, or when the
    # interpreter carries a 'frozen' attribute on sys.
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')