projects
/
youtube-dl
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
[utils] Decode HTML5 entities
[youtube-dl]
/
youtube_dl
/
utils.py
diff --git
a/youtube_dl/utils.py
b/youtube_dl/utils.py
index 229de4b39a0ee6408277fcf1f999b53c1d78ee78..f77ab865093604f217ab6b2da824f340327df652 100644
(file)
--- a/
youtube_dl/utils.py
+++ b/
youtube_dl/utils.py
@@
-39,6
+39,7
@@
from .compat import (
compat_chr,
compat_etree_fromstring,
compat_html_entities,
compat_chr,
compat_etree_fromstring,
compat_html_entities,
+ compat_html_entities_html5,
compat_http_client,
compat_kwargs,
compat_parse_qs,
compat_http_client,
compat_kwargs,
compat_parse_qs,
@@
-456,12
+457,19
@@
def orderedSet(iterable):
return res
return res
-def _htmlentity_transform(entity):
+def _htmlentity_transform(entity
_with_semicolon
):
"""Transforms an HTML entity to a character."""
"""Transforms an HTML entity to a character."""
+ entity = entity_with_semicolon[:-1]
+
# Known non-numeric HTML entity
if entity in compat_html_entities.name2codepoint:
return compat_chr(compat_html_entities.name2codepoint[entity])
# Known non-numeric HTML entity
if entity in compat_html_entities.name2codepoint:
return compat_chr(compat_html_entities.name2codepoint[entity])
+ # TODO: HTML5 allows entities without a semicolon. For example,
+ # '&Eacuteric' should be decoded as 'Éric'.
+ if entity_with_semicolon in compat_html_entities_html5:
+ return compat_html_entities_html5[entity_with_semicolon]
+
mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
if mobj is not None:
numstr = mobj.group(1)
mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
if mobj is not None:
numstr = mobj.group(1)
@@
-486,7
+494,7
@@
def unescapeHTML(s):
assert type(s) == compat_str
return re.sub(
assert type(s) == compat_str
return re.sub(
- r'&([^;]+
);
', lambda m: _htmlentity_transform(m.group(1)), s)
+ r'&([^;]+
;)
', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
def get_subprocess_encoding():