From: fnord Date: Wed, 15 Jul 2015 20:30:47 +0000 (-0500) Subject: Generic: use compat_urllib_parse_unquote to prevent utf8 mangling X-Git-Url: http://git.bitcoin.ninja/?a=commitdiff_plain;h=45eedbe58c8ab6344f11f1e1376d01648c1967ee;p=youtube-dl Generic: use compat_urllib_parse_unquote to prevent utf8 mangling of the entire page in Python 2. Requires the fixed compat_urllib_parse_unquote. Example: the following will save with a mangled playlist title, instead of the kanji for 'tsunami'. This affects all UTF-8-encoded URLs as well. youtube-dl -f18 -o '%(playlist_title)s-%(title)s.%(ext)s' \ https://gist.githubusercontent.com/atomicdryad/fcb97465e6060fc519e1/raw/61c14c1e3a4985471dcf56c281d24d7e781a4e0e/tsunami.html --- diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 392ad3648..fc1bf2b6e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1115,7 +1115,7 @@ class GenericIE(InfoExtractor): # Sometimes embedded video player is hidden behind percent encoding # (e.g. https://github.com/rg3/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way - webpage = compat_urllib_parse.unquote(webpage) + webpage = compat_urllib_parse_unquote(webpage) # it's tempting to parse this further, but you would # have to take into account all the variations like