Instead of replacing accented characters with an underscore when sanitizing file...
authorAdam Thalhammer <s3544305@student.rmit.edu.au>
Mon, 2 May 2016 03:21:39 +0000 (13:21 +1000)
committerAdam Thalhammer <s3544305@student.rmit.edu.au>
Mon, 2 May 2016 03:21:39 +0000 (13:21 +1000)
test/test_utils.py
youtube_dl/utils.py

index e16a6761b7e9a70589c6da7b48c9f54e2c03e734..0072ba241978b11738051dae4ae64a025d0de34b 100644 (file)
@@ -139,8 +139,8 @@ class TestUtil(unittest.TestCase):
         self.assertEqual('yes_no', sanitize_filename('yes? no', restricted=True))
         self.assertEqual('this_-_that', sanitize_filename('this: that', restricted=True))
 
-        tests = 'a\xe4b\u4e2d\u56fd\u7684c'
-        self.assertEqual(sanitize_filename(tests, restricted=True), 'a_b_c')
+        tests = 'aäb\u4e2d\u56fd\u7684c'
+        self.assertEqual(sanitize_filename(tests, restricted=True), 'aab_c')
         self.assertTrue(sanitize_filename('\xf6', restricted=True) != '')  # No empty filename
 
         forbidden = '"\0\\/&!: \'\t\n()[]{}$;`^,#'
@@ -155,6 +155,11 @@ class TestUtil(unittest.TestCase):
         self.assertTrue(sanitize_filename('-', restricted=True) != '')
         self.assertTrue(sanitize_filename(':', restricted=True) != '')
 
+        self.assertEqual(sanitize_filename(
+            'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', restricted=True),
+            'AAAAAAAECEEEEIIIIDNOOOOOOUUUUYPssaaaaaaaeceeeeiiiionoooooouuuuypy')
+        pass
+
     def test_sanitize_ids(self):
         self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw')
         self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw')
index 7bcc85e2b530cb2eadb714e100b56d8f4637b87d..f74f6226809fcb538ea243530097c397387c0e00 100644 (file)
@@ -14,8 +14,8 @@ import email.utils
 import errno
 import functools
 import gzip
-import itertools
 import io
+import itertools
 import json
 import locale
 import math
@@ -24,8 +24,8 @@ import os
 import pipes
 import platform
 import re
-import ssl
 import socket
+import ssl
 import struct
 import subprocess
 import sys
@@ -365,6 +365,11 @@ def sanitize_filename(s, restricted=False, is_id=False):
     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
     """
     def replace_insane(char):
+        accents = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
+                           itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
+                                           'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
+        if restricted and char in accents:
+            return accents[char]
         if char == '?' or ord(char) < 32 or ord(char) == 127:
             return ''
         elif char == '"':