[generic] Add support for BOMs (Fixes #4753)
authorPhilipp Hagemeister <phihag@phihag.de>
Fri, 23 Jan 2015 00:21:30 +0000 (01:21 +0100)
committerPhilipp Hagemeister <phihag@phihag.de>
Fri, 23 Jan 2015 00:21:30 +0000 (01:21 +0100)
test/test_utils.py
youtube_dl/extractor/generic.py
youtube_dl/utils.py

index 206760d995c98299c60458181f88b5017e65d964..bdd7f268af46d5eaaef35507f839989a8343316b 100644 (file)
@@ -28,6 +28,7 @@ from youtube_dl.utils import (
     fix_xml_ampersands,
     InAdvancePagedList,
     intlist_to_bytes,
+    is_html,
     js_to_json,
     limit_length,
     OnDemandPagedList,
@@ -417,5 +418,21 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
         self.assertTrue(age_restricted(18, 14))
         self.assertFalse(age_restricted(18, 18))
 
+    def test_is_html(self):
+        self.assertFalse(is_html(b'\x49\x44\x43<html'))
+        self.assertTrue(is_html(b'<!DOCTYPE foo>\xaaa'))
+        self.assertTrue(is_html(  # UTF-8 with BOM
+            b'\xef\xbb\xbf<!DOCTYPE foo>\xaaa'))
+        self.assertTrue(is_html(  # UTF-16-LE
+            b'\xff\xfe<\x00h\x00t\x00m\x00l\x00>\x00\xe4\x00'
+        ))
+        self.assertTrue(is_html(  # UTF-16-BE
+            b'\xfe\xff\x00<\x00h\x00t\x00m\x00l\x00>\x00\xe4'
+        ))
+        self.assertTrue(is_html(  # UTF-32-BE
+            b'\x00\x00\xFE\xFF\x00\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4'))
+        self.assertTrue(is_html(  # UTF-32-LE
+            b'\xFF\xFE\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4\x00\x00\x00'))
+
 if __name__ == '__main__':
     unittest.main()
index 7a5bf939237ff45731fd3befca5ad0b7dfc0df1f..b893d8149f2c63e081341d2dec2d71ba05f3e64c 100644 (file)
@@ -17,6 +17,7 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     HEADRequest,
+    is_html,
     orderedSet,
     parse_xml,
     smuggle_url,
@@ -647,7 +648,7 @@ class GenericIE(InfoExtractor):
         # Maybe it's a direct link to a video?
         # Be careful not to download the whole thing!
         first_bytes = full_response.read(512)
-        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')):
+        if not is_html(first_bytes):
             self._downloader.report_warning(
                 'URL could be a direct video link, returning it as such.')
             upload_date = unified_strdate(
index 764474c3301695f77b6421b67b745d3c157d0937..b433b591bdfc690064523e226227fdbe34373c37 100644 (file)
@@ -1631,3 +1631,23 @@ def age_restricted(content_limit, age_limit):
     if content_limit is None:
         return False  # Content available for everyone
     return age_limit < content_limit
+
+
+def is_html(first_bytes):
+    """ Detect whether a file contains HTML by examining its first bytes. """
+
+    BOMS = [
+        (b'\xef\xbb\xbf', 'utf-8'),
+        (b'\x00\x00\xfe\xff', 'utf-32-be'),
+        (b'\xff\xfe\x00\x00', 'utf-32-le'),
+        (b'\xff\xfe', 'utf-16-le'),
+        (b'\xfe\xff', 'utf-16-be'),
+    ]
+    for bom, enc in BOMS:
+        if first_bytes.startswith(bom):
+            s = first_bytes[len(bom):].decode(enc, 'replace')
+            break
+    else:
+        s = first_bytes.decode('utf-8', 'replace')
+
+    return re.match(r'^\s*<', s)