Deal with implicitly UTF-16 decoded webpages
author Philipp Hagemeister <phihag@phihag.de>
Tue, 21 Jan 2014 00:39:39 +0000 (01:39 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Tue, 21 Jan 2014 00:39:40 +0000 (01:39 +0100)
These webpages don't specify an encoding and rely on the BOM (byte order mark) to signal it.

youtube_dl/extractor/common.py

index 692d828da9ef9739e1b05908e6a9f39259b0940b..6c5d77e583586b266d480fa4c01acf9949cfe3a7 100644 (file)
@@ -220,6 +220,8 @@ class InfoExtractor(object):
                           webpage_bytes[:1024])
             if m:
                 encoding = m.group(1).decode('ascii')
+            elif webpage_bytes.startswith(b'\xff\xfe'):
+                encoding = 'utf-16'
             else:
                 encoding = 'utf-8'
         if self._downloader.params.get('dump_intermediate_pages', False):