return node.find(expr)
else:
def find_xpath_attr(node, xpath, key, val):
+ # Here comes the crazy part: In 2.6, if the xpath is a unicode,
+ # .//node does not match if a node is a direct child of . !
+ if isinstance(xpath, unicode):
+ xpath = xpath.encode('ascii')
+
for f in node.findall(xpath):
if f.attrib.get(key) == val:
return f
return '/'.join(replaced)
+def xpath_text(node, xpath, name=None, fatal=False):
+ if sys.version_info < (2, 7): # Crazy 2.6
+ xpath = xpath.encode('ascii')
+
+ n = node.find(xpath)
+ if n is None:
+ if fatal:
+ name = xpath if name is None else name
+ raise ExtractorError('Could not find XML element %s' % name)
+ else:
+ return None
+ return n.text
+
+
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
def __init(self):
expected = True
if video_id is not None:
msg = video_id + ': ' + msg
+ if cause:
+ msg += u' (caused by %r)' % cause
if not expected:
msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
super(ExtractorError, self).__init__(msg)
del req.headers['User-agent']
req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
del req.headers['Youtubedl-user-agent']
+
+ if sys.version_info < (2, 7) and '#' in req.get_full_url():
+ # Python 2.6 is brain-dead when it comes to fragments
+ req._Request__original = req._Request__original.partition('#')[0]
+ req._Request__r_type = req._Request__r_type.partition('#')[0]
+
return req
def http_response(self, req, resp):
'%d/%m/%Y',
'%d/%m/%y',
'%Y/%m/%d %H:%M:%S',
+ '%d/%m/%Y %H:%M:%S',
'%Y-%m-%d %H:%M:%S',
+ '%Y-%m-%d %H:%M:%S.%f',
'%d.%m.%Y %H:%M',
'%d.%m.%Y %H.%M',
'%Y-%m-%dT%H:%M:%SZ',
class PagedList(object):
- def __init__(self, pagefunc, pagesize):
- self._pagefunc = pagefunc
- self._pagesize = pagesize
-
def __len__(self):
# This is only useful for tests
return len(self.getslice())
+
+class OnDemandPagedList(PagedList):
+ def __init__(self, pagefunc, pagesize):
+ self._pagefunc = pagefunc
+ self._pagesize = pagesize
+
def getslice(self, start=0, end=None):
res = []
for pagenum in itertools.count(start // self._pagesize):
return res
+class InAdvancePagedList(PagedList):
+ def __init__(self, pagefunc, pagecount, pagesize):
+ self._pagefunc = pagefunc
+ self._pagecount = pagecount
+ self._pagesize = pagesize
+
+ def getslice(self, start=0, end=None):
+ res = []
+ start_page = start // self._pagesize
+ end_page = (
+ self._pagecount if end is None else (end // self._pagesize + 1))
+ skip_elems = start - start_page * self._pagesize
+ only_more = None if end is None else end - start
+ for pagenum in range(start_page, end_page):
+ page = list(self._pagefunc(pagenum))
+ if skip_elems:
+ page = page[skip_elems:]
+ skip_elems = None
+ if only_more is not None:
+ if len(page) < only_more:
+ only_more -= len(page)
+ else:
+ page = page[:only_more]
+ res.extend(page)
+ break
+ res.extend(page)
+ return res
+
+
def uppercase_escape(s):
unicode_escape = codecs.getdecoder('unicode_escape')
return re.sub(
"""Escape non-ASCII characters as suggested by RFC 3986"""
if sys.version_info < (3, 0) and isinstance(s, unicode):
s = s.encode('utf-8')
- return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]") #"%/;:@&=+$,!~*'()?#[]+" #?#[]+
+ return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
}
+def parse_age_limit(s):
+ if s is None:
+ return None
+ m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
+ return int(m.group('age')) if m else US_RATINGS.get(s, None)
+
+
def strip_jsonp(code):
return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)
def js_to_json(code):
def fix_kv(m):
- key = m.group(2)
- if key.startswith("'"):
- assert key.endswith("'")
- assert '"' not in key
- key = '"%s"' % key[1:-1]
- elif not key.startswith('"'):
- key = '"%s"' % key
-
- value = m.group(4)
- if value.startswith("'"):
- assert value.endswith("'")
- assert '"' not in value
- value = '"%s"' % value[1:-1]
-
- return m.group(1) + key + m.group(3) + value
+ v = m.group(0)
+ if v in ('true', 'false', 'null'):
+ return v
+ if v.startswith('"'):
+ return v
+ if v.startswith("'"):
+ v = v[1:-1]
+ v = re.sub(r"\\\\|\\'|\"", lambda m: {
+ '\\\\': '\\\\',
+ "\\'": "'",
+ '"': '\\"',
+ }[m.group(0)], v)
+ return '"%s"' % v
res = re.sub(r'''(?x)
- ([{,]\s*)
- ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
- (:\s*)
- ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
+ "(?:[^"\\]*(?:\\\\|\\")?)*"|
+ '(?:[^'\\]*(?:\\\\|\\')?)*'|
+ [a-zA-Z_][a-zA-Z_0-9]*
''', fix_kv, code)
res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
return res
if ret:
raise subprocess.CalledProcessError(ret, p.args, output=output)
return output
+
+
+def limit_length(s, length):
+ """ Add ellipses to overly long strings """
+ if s is None:
+ return None
+ ELLIPSES = '...'
+ if len(s) > length:
+ return s[:length - len(ELLIPSES)] + ELLIPSES
+ return s