X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=6fe05723484303839947357124fc9d9ebfadcf4e;hb=4c59dc4c34cfd1f3f1d325e7739d146471bab3c3;hp=d11e46c80d76c2405cdb05d26f056cd45ae27bab;hpb=181c8655c798562c85ae2af06f1ece7b01632ea9;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d11e46c80..6fe057234 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -192,6 +192,13 @@ try: except ImportError: # Python 2.6 from xml.parsers.expat import ExpatError as compat_xml_parse_error +try: + from shlex import quote as shlex_quote +except ImportError: # Python < 3.3 + def shlex_quote(s): + return "'" + s.replace("'", "'\"'\"'") + "'" + + def compat_ord(c): if type(c) is int: return c else: return ord(c) @@ -233,18 +240,24 @@ else: def write_json_file(obj, fn): """ Encode obj as JSON and write it to fn, atomically """ + args = { + 'suffix': '.tmp', + 'prefix': os.path.basename(fn) + '.', + 'dir': os.path.dirname(fn), + 'delete': False, + } + # In Python 2.x, json.dump expects a bytestream. # In Python 3.x, it writes to a character stream if sys.version_info < (3, 0): - mode = 'wb' - encoding = None + args['mode'] = 'wb' else: - mode = 'w' - encoding = 'utf-8' - tf = tempfile.NamedTemporaryFile( - suffix='.tmp', prefix=os.path.basename(fn) + '.', - dir=os.path.dirname(fn), - delete=False) + args.update({ + 'mode': 'w', + 'encoding': 'utf-8', + }) + + tf = tempfile.NamedTemporaryFile(**args) try: with tf: @@ -285,30 +298,6 @@ def xpath_with_ns(path, ns_map): replaced.append('{%s}%s' % (ns_map[ns], tag)) return '/'.join(replaced) -def htmlentity_transform(matchobj): - """Transforms an HTML entity to a character. - - This function receives a match object and is intended to be used with - the re.sub() function. 
- """ - entity = matchobj.group(1) - - # Known non-numeric HTML entity - if entity in compat_html_entities.name2codepoint: - return compat_chr(compat_html_entities.name2codepoint[entity]) - - mobj = re.match(u'(?u)#(x?\\d+)', entity) - if mobj is not None: - numstr = mobj.group(1) - if numstr.startswith(u'x'): - base = 16 - numstr = u'0%s' % numstr - else: - base = 10 - return compat_chr(int(numstr, base)) - - # Unknown entity in name, return its literal representation - return (u'&%s;' % entity) compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix class BaseHTMLParser(compat_html_parser.HTMLParser): @@ -530,13 +519,33 @@ def orderedSet(iterable): return res +def _htmlentity_transform(entity): + """Transforms an HTML entity to a character.""" + # Known non-numeric HTML entity + if entity in compat_html_entities.name2codepoint: + return compat_chr(compat_html_entities.name2codepoint[entity]) + + mobj = re.match(r'#(x?[0-9]+)', entity) + if mobj is not None: + numstr = mobj.group(1) + if numstr.startswith(u'x'): + base = 16 + numstr = u'0%s' % numstr + else: + base = 10 + return compat_chr(int(numstr, base)) + + # Unknown entity in name, return its literal representation + return (u'&%s;' % entity) + + def unescapeHTML(s): if s is None: return None assert type(s) == compat_str - result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s) - return result + return re.sub( + r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s) def encodeFilename(s, for_subprocess=False): @@ -753,10 +762,9 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): return ret def http_request(self, req): - for h,v in std_headers.items(): - if h in req.headers: - del req.headers[h] - req.add_header(h, v) + for h, v in std_headers.items(): + if h not in req.headers: + req.add_header(h, v) if 'Youtubedl-no-compression' in 
req.headers: if 'Accept-encoding' in req.headers: del req.headers['Accept-encoding'] @@ -849,6 +857,7 @@ def unified_strdate(date_str): '%Y/%m/%d', '%d.%m.%Y', '%d/%m/%Y', + '%d/%m/%y', '%Y/%m/%d %H:%M:%S', '%Y-%m-%d %H:%M:%S', '%d.%m.%Y %H:%M', @@ -1132,10 +1141,10 @@ else: import fcntl def _lock_file(f, exclusive): - fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) + fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) def _unlock_file(f): - fcntl.lockf(f, fcntl.LOCK_UN) + fcntl.flock(f, fcntl.LOCK_UN) class locked_file(object): @@ -1279,6 +1288,12 @@ def remove_start(s, start): return s +def remove_end(s, end): + if s.endswith(end): + return s[:-len(end)] + return s + + def url_basename(url): path = compat_urlparse.urlparse(url).path return path.strip(u'/').split(u'/')[-1] @@ -1303,6 +1318,7 @@ def str_or_none(v, default=None): def str_to_int(int_str): + """ A more relaxed version of int_or_none """ if int_str is None: return None int_str = re.sub(r'[,\.]', u'', int_str) @@ -1317,8 +1333,10 @@ def parse_duration(s): if s is None: return None + s = s.strip() + m = re.match( - r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s) + r'(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s) if not m: return None res = int(m.group('secs')) @@ -1326,6 +1344,8 @@ def parse_duration(s): res += int(m.group('mins')) * 60 if m.group('hours'): res += int(m.group('hours')) * 60 * 60 + if m.group('ms'): + res += float(m.group('ms')) return res @@ -1436,6 +1456,12 @@ def urlencode_postdata(*args, **kargs): return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') +try: + etree_iter = xml.etree.ElementTree.Element.iter +except AttributeError: # Python <=2.6 + etree_iter = lambda n: n.findall('.//*') + + def parse_xml(s): class TreeBuilder(xml.etree.ElementTree.TreeBuilder): def doctype(self, name, pubid, system): @@ -1443,7 +1469,14 @@ def 
parse_xml(s): parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} - return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) + tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) + # Fix up XML parser in Python 2.x + if sys.version_info < (3, 0): + for n in etree_iter(tree): + if n.text is not None: + if not isinstance(n.text, compat_str): + n.text = n.text.decode('utf-8') + return tree if sys.version_info < (3, 0) and sys.platform == 'win32': @@ -1468,6 +1501,34 @@ def strip_jsonp(code): return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code) +def js_to_json(code): + def fix_kv(m): + key = m.group(2) + if key.startswith("'"): + assert key.endswith("'") + assert '"' not in key + key = '"%s"' % key[1:-1] + elif not key.startswith('"'): + key = '"%s"' % key + + value = m.group(4) + if value.startswith("'"): + assert value.endswith("'") + assert '"' not in value + value = '"%s"' % value[1:-1] + + return m.group(1) + key + m.group(3) + value + + res = re.sub(r'''(?x) + ([{,]\s*) + ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+) + (:\s*) + ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{) + ''', fix_kv, code) + res = re.sub(r',(\s*\])', lambda m: m.group(1), res) + return res + + def qualities(quality_ids): """ Get a numeric quality value out of a list of possible values """ def q(qid):