[commonmistakes] Detect BOMs at the beginning of URLs
author: Philipp Hagemeister <phihag@phihag.de>
Tue, 10 Feb 2015 00:39:43 +0000 (01:39 +0100)
committer: Philipp Hagemeister <phihag@phihag.de>
Tue, 10 Feb 2015 00:40:55 +0000 (01:40 +0100)
Reported at https://bugzilla.redhat.com/show_bug.cgi?id=1093517 .

youtube_dl/extractor/__init__.py
youtube_dl/extractor/commonmistakes.py

index 55ca0d6e4667928ad062ffbc3b26a19a7f819ccd..fb1e7f325c5c9e781bae5b3136fa71e4db50b92e 100644 (file)
@@ -74,7 +74,7 @@ from .collegehumor import CollegeHumorIE
 from .collegerama import CollegeRamaIE
 from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
 from .comcarcoff import ComCarCoffIE
 from .collegerama import CollegeRamaIE
 from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
 from .comcarcoff import ComCarCoffIE
-from .commonmistakes import CommonMistakesIE
+from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
 from .condenast import CondeNastIE
 from .cracked import CrackedIE
 from .criterion import CriterionIE
 from .condenast import CondeNastIE
 from .cracked import CrackedIE
 from .criterion import CriterionIE
index dbbf27a7486ab78a239df5a21a761b911a1e4a6e..2f86e2381f447faa7b42d6056e685bc04f101f9c 100644 (file)
@@ -27,3 +27,20 @@ class CommonMistakesIE(InfoExtractor):
         if not self._downloader.params.get('verbose'):
             msg += ' Add -v to the command line to see what arguments and configuration youtube-dl got.'
         raise ExtractorError(msg, expected=True)
         if not self._downloader.params.get('verbose'):
             msg += ' Add -v to the command line to see what arguments and configuration youtube-dl got.'
         raise ExtractorError(msg, expected=True)
+
+
+class UnicodeBOMIE(InfoExtractor):
+        IE_DESC = False
+        _VALID_URL = r'(?P<bom>\ufeff)(?P<id>.*)$'
+
+        _TESTS = [{
+            'url': '\ufeffhttp://www.youtube.com/watch?v=BaW_jenozKc',
+            'only_matching': True,
+        }]
+
+        def _real_extract(self, url):
+            real_url = self._match_id(url)
+            self.report_warning(
+                'Your URL starts with a Byte Order Mark (BOM). '
+                'Removing the BOM and looking for "%s" ...' % real_url)
+            return self.url_result(real_url)