about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Yen Chi Hsuan <yan12125@gmail.com> 2016-05-06 01:44:03 +0800
committer Yen Chi Hsuan <yan12125@gmail.com> 2016-05-06 01:44:03 +0800
commit 109db8ea64f9b6001d4854c1689f41ed9eb8136b (patch)
tree 943156e202be242aa31c85a21a4e2d9ff196ab8b
parent 915620fd6894d92f89f9e5c9362d20f94e787e57 (diff)
parent c587cbb793a6eda4fc7b1c7b4163e236abec1a00 (diff)
download youtube-dl-109db8ea64f9b6001d4854c1689f41ed9eb8136b.tar.xz
Merge pull request #9367 from codesparkle/master
Feature: --restrict-filenames: replace accented characters by their unaccented counterpart instead of "_"
-rw-r--r-- test/test_utils.py 8
-rw-r--r-- youtube_dl/utils.py 11
2 files changed, 15 insertions, 4 deletions
diff --git a/test/test_utils.py b/test/test_utils.py
index e16a6761b..00ada95ec 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -139,8 +139,8 @@ class TestUtil(unittest.TestCase):
self.assertEqual('yes_no', sanitize_filename('yes? no', restricted=True))
self.assertEqual('this_-_that', sanitize_filename('this: that', restricted=True))
- tests = 'a\xe4b\u4e2d\u56fd\u7684c'
- self.assertEqual(sanitize_filename(tests, restricted=True), 'a_b_c')
+ tests = 'aäb\u4e2d\u56fd\u7684c'
+ self.assertEqual(sanitize_filename(tests, restricted=True), 'aab_c')
self.assertTrue(sanitize_filename('\xf6', restricted=True) != '') # No empty filename
forbidden = '"\0\\/&!: \'\t\n()[]{}$;`^,#'
@@ -155,6 +155,10 @@ class TestUtil(unittest.TestCase):
self.assertTrue(sanitize_filename('-', restricted=True) != '')
self.assertTrue(sanitize_filename(':', restricted=True) != '')
+ self.assertEqual(sanitize_filename(
+ 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', restricted=True),
+ 'AAAAAAAECEEEEIIIIDNOOOOOOUUUUYPssaaaaaaaeceeeeiiiionoooooouuuuypy')
+
def test_sanitize_ids(self):
self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw')
self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw')
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 7bcc85e2b..a5922b2b5 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -14,8 +14,8 @@ import email.utils
import errno
import functools
import gzip
-import itertools
import io
+import itertools
import json
import locale
import math
@@ -24,8 +24,8 @@ import os
import pipes
import platform
import re
-import ssl
import socket
+import ssl
import struct
import subprocess
import sys
@@ -89,6 +89,11 @@ KNOWN_EXTENSIONS = (
'wav',
'f4f', 'f4m', 'm3u8', 'smil')
+# needed for sanitizing filenames in restricted mode
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
+ itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
+ 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
+
def preferredencoding():
"""Get preferred encoding.
@@ -365,6 +370,8 @@ def sanitize_filename(s, restricted=False, is_id=False):
Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
"""
def replace_insane(char):
+ if restricted and char in ACCENT_CHARS:
+ return ACCENT_CHARS[char]
if char == '?' or ord(char) < 32 or ord(char) == 127:
return ''
elif char == '"':