from __future__ import unicode_literals
+import base64
import calendar
import codecs
import contextlib
from .compat import (
compat_basestring,
compat_chr,
+ compat_etree_fromstring,
compat_html_entities,
compat_http_client,
compat_kwargs,
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
- if sys.version_info < (2, 7): # Crazy 2.6
- xpath = xpath.encode('ascii')
+ def _find_xpath(xpath):
+ if sys.version_info < (2, 7): # Crazy 2.6
+ xpath = xpath.encode('ascii')
+ return node.find(xpath)
+
+ if isinstance(xpath, (str, compat_str)):
+ n = _find_xpath(xpath)
+ else:
+ for xp in xpath:
+ n = _find_xpath(xp)
+ if n is not None:
+ break
- n = node.find(xpath)
if n is None:
if default is not NO_DEFAULT:
return default
return unescapeHTML(res)
-def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'):
- attributes = re.findall(attributes_regex, attributes_str)
- attributes_dict = {}
- if attributes:
- for (attribute_name, attribute_value) in attributes:
- attributes_dict[attribute_name] = attribute_value
- return attributes_dict
-
-
def clean_html(html):
"""Clean an HTML snippet into a readable string"""
if drive_or_unc:
norm_path.pop(0)
sanitized_path = [
- path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
+ path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
for path_part in norm_path]
if drive_or_unc:
sanitized_path.insert(0, drive_or_unc + os.path.sep)
numstr = '0%s' % numstr
else:
base = 10
- return compat_chr(int(numstr, base))
+ # See https://github.com/rg3/youtube-dl/issues/7518
+ try:
+ return compat_chr(int(numstr, base))
+ except ValueError:
+ pass
# Unknown entity in name, return its literal representation
return ('&%s;' % entity)
# expected HTTP responses to meet HTTP/1.0 or later (see also
# https://github.com/rg3/youtube-dl/issues/6727)
if sys.version_info < (3, 0):
- kwargs['strict'] = True
+ kwargs[b'strict'] = True
hc = http_class(*args, **kwargs)
source_address = ydl_handler._params.get('source_address')
if source_address is not None:
if date_str is None:
return None
+ date_str = re.sub(r'\.[0-9]+', '', date_str)
+
if timezone is None:
m = re.search(
- r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
+ r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
date_str)
if not m:
timezone = datetime.timedelta()
timezone = datetime.timedelta(
hours=sign * int(m.group('hours')),
minutes=sign * int(m.group('minutes')))
- date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
- dt = datetime.datetime.strptime(date_str, date_format) - timezone
- return calendar.timegm(dt.timetuple())
+ try:
+ date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
+ dt = datetime.datetime.strptime(date_str, date_format) - timezone
+ return calendar.timegm(dt.timetuple())
+ except ValueError:
+ pass
def unified_strdate(date_str, day_first=True):
timetuple = email.utils.parsedate_tz(date_str)
if timetuple:
upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
- return upload_date
+ if upload_date is not None:
+ return compat_str(upload_date)
def determine_ext(url, default_ext='unknown_video'):
v = getattr(v, get_attr, None)
if v == '':
v = None
- return default if v is None else (int(v) * invscale // scale)
+ if v is None:
+ return default
+ try:
+ return int(v) * invscale // scale
+ except ValueError:
+ return default
def str_or_none(v, default=None):
def float_or_none(v, scale=1, invscale=1, default=None):
- return default if v is None else (float(v) * invscale / scale)
+ if v is None:
+ return default
+ try:
+ return float(v) * invscale / scale
+ except ValueError:
+ return default
def parse_duration(s):
return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
-try:
- etree_iter = xml.etree.ElementTree.Element.iter
-except AttributeError: # Python <=2.6
- etree_iter = lambda n: n.findall('.//*')
-
-
-def parse_xml(s):
- class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
- def doctype(self, name, pubid, system):
- pass # Ignore doctypes
-
- parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
- kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
- tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
- # Fix up XML parser in Python 2.x
- if sys.version_info < (3, 0):
- for n in etree_iter(tree):
- if n.text is not None:
- if not isinstance(n.text, compat_str):
- n.text = n.text.decode('utf-8')
- return tree
-
-
US_RATINGS = {
'G': 0,
'PG': 10,
if v in ('true', 'false', 'null'):
return v
if v.startswith('"'):
- return v
- if v.startswith("'"):
+ v = re.sub(r"\\'", "'", v[1:-1])
+ elif v.startswith("'"):
v = v[1:-1]
v = re.sub(r"\\\\|\\'|\"", lambda m: {
'\\\\': '\\\\',
return mimetype2ext(getheader('Content-Type'))
+def encode_data_uri(data, mime_type):
+ return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
+
+
def age_restricted(content_limit, age_limit):
""" Returns True iff the content should be blocked """
return out
- dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
+ dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')