compiled_regex_type = type(re.compile(''))
std_headers = {
- 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
+ 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December']
+KNOWN_EXTENSIONS = (
+ 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
+ 'flv', 'f4v', 'f4a', 'f4b',
+ 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
+ 'mkv', 'mka', 'mk3d',
+ 'avi', 'divx',
+ 'mov',
+ 'asf', 'wmv', 'wma',
+ '3gp', '3g2',
+ 'mp3',
+ 'flac',
+ 'ape',
+ 'wav',
+ 'f4f', 'f4m', 'm3u8', 'smil')
+
def preferredencoding():
"""Get preferred encoding.
raise original_ioerror
resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
+ del resp.headers['Content-encoding']
# deflate
if resp.headers.get('Content-encoding', '') == 'deflate':
gz = io.BytesIO(self.deflate(resp.read()))
resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
+ del resp.headers['Content-encoding']
# Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
# https://github.com/rg3/youtube-dl/issues/6457).
if 300 <= resp.code < 400:
guess = url.partition('?')[0].rpartition('.')[2]
if re.match(r'^[A-Za-z0-9]+$', guess):
return guess
- elif guess.rstrip('/') in (
- 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
- 'flv', 'f4v', 'f4a', 'f4b',
- 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
- 'mkv', 'mka', 'mk3d',
- 'avi', 'divx',
- 'mov',
- 'asf', 'wmv', 'wma',
- '3gp', '3g2',
- 'mp3',
- 'flac',
- 'ape',
- 'wav',
- 'f4f', 'f4m', 'm3u8', 'smil'):
+ # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
+ elif guess.rstrip('/') in KNOWN_EXTENSIONS:
return guess.rstrip('/')
else:
return default_ext
if sign == '-':
time = -time
unit = match.group('unit')
- # A bad aproximation?
+ # A bad approximation?
if unit == 'month':
unit = 'day'
time *= 30
if s is None:
return None
- # The lower-case forms are of course incorrect and inofficial,
+ # The lower-case forms are of course incorrect and unofficial,
# but we support those too
_UNIT_TABLE = {
'B': 1,
return s
+def remove_quotes(s):
+ if s is None or len(s) < 2:
+ return s
+ for quote in ('"', "'", ):
+ if s[0] == quote and s[-1] == quote:
+ return s[1:-1]
+ return s
+
+
def url_basename(url):
path = compat_urlparse.urlparse(url).path
return path.strip('/').split('/')[-1]
return dict((encode(k), encode(v)) for k, v in d.items())
+def dict_get(d, key_or_keys, default=None, skip_false_values=True):
+ if isinstance(key_or_keys, (list, tuple)):
+ for key in key_or_keys:
+ if key not in d or d[key] is None or skip_false_values and not d[key]:
+ continue
+ return d[key]
+ return default
+ return d.get(key_or_keys, default)
+
+
+def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
+ return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
+
+
US_RATINGS = {
'G': 0,
'PG': 10,
def strip_jsonp(code):
return re.sub(
- r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
+ r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
return ' '.join(shlex_quote(a) for a in args)
+def error_to_compat_str(err):
+ err_str = str(err)
+ # On python 2 error byte string must be decoded with proper
+ # encoding rather than ascii
+ if sys.version_info[0] < 3:
+ err_str = err_str.decode(preferredencoding())
+ return err_str
+
+
def mimetype2ext(mt):
_, _, res = mt.rpartition('/')
return {
- 'x-ms-wmv': 'wmv',
- 'x-mp4-fragmented': 'mp4',
+ '3gpp': '3gp',
'ttml+xml': 'ttml',
+ 'x-flv': 'flv',
+ 'x-mp4-fragmented': 'mp4',
+ 'x-ms-wmv': 'wmv',
}.get(res, res)
def parse_dfxp_time_expr(time_expr):
if not time_expr:
- return 0.0
+ return
mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
if mobj:
return float(mobj.group('time_offset'))
- mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
+ mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
if mobj:
- return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
+ return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
})
- def parse_node(node):
- str_or_empty = functools.partial(str_or_none, default='')
+ class TTMLPElementParser(object):
+ out = ''
- out = str_or_empty(node.text)
+ def start(self, tag, attrib):
+ if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
+ self.out += '\n'
- for child in node:
- if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
- out += '\n' + str_or_empty(child.tail)
- elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
- out += str_or_empty(parse_node(child))
- else:
- out += str_or_empty(xml.etree.ElementTree.tostring(child))
+ def end(self, tag):
+ pass
+
+ def data(self, data):
+ self.out += data
- return out
+ def close(self):
+ return self.out.strip()
+
+ def parse_node(node):
+ target = TTMLPElementParser()
+ parser = xml.etree.ElementTree.XMLParser(target=target)
+ parser.feed(xml.etree.ElementTree.tostring(node))
+ return parser.close()
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []
raise ValueError('Invalid dfxp/TTML subtitle')
for para, index in zip(paras, itertools.count(1)):
- begin_time = parse_dfxp_time_expr(para.attrib['begin'])
+ begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
end_time = parse_dfxp_time_expr(para.attrib.get('end'))
+ dur = parse_dfxp_time_expr(para.attrib.get('dur'))
+ if begin_time is None:
+ continue
if not end_time:
- end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
+ if not dur:
+ continue
+ end_time = begin_time + dur
out.append('%d\n%s --> %s\n%s\n\n' % (
index,
srt_subtitles_timecode(begin_time),