diff options
Diffstat (limited to 'youtube_dl/casefold.py')
| -rw-r--r-- | youtube_dl/casefold.py | 24 | 
1 files changed, 20 insertions, 4 deletions
| diff --git a/youtube_dl/casefold.py b/youtube_dl/casefold.py index 546269a3c..7e91c3811 100644 --- a/youtube_dl/casefold.py +++ b/youtube_dl/casefold.py @@ -1,8 +1,20 @@  # coding: utf-8  from __future__ import unicode_literals -from .compat import compat_str +from .compat import ( +    compat_str, +    compat_chr, +) +# Below is included the text of icu/CaseFolding.txt retrieved from +# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/unidata/CaseFolding.txt +# In case newly foldable Unicode characters are defined, paste the new version +# of the text inside the ''' marks. +# The text is expected to have only blank lines and lines with 1st character #, +# all ignored, and fold definitions like this: +# `from_hex_code; space_separated_to_hex_code_list; comment` + +_map_str = '''  # CaseFolding-15.0.0.txt  # Date: 2022-02-02, 23:35:35 GMT  # © 2022 Unicode®, Inc. @@ -65,7 +77,6 @@ from .compat import compat_str  #  have the value C for the status field, and the code point itself for the mapping field.  # ================================================================= -_map_str = '''  0041; C; 0061; # LATIN CAPITAL LETTER A  0042; C; 0062; # LATIN CAPITAL LETTER B  0043; C; 0063; # LATIN CAPITAL LETTER C @@ -1627,17 +1638,22 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z  1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO  1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA  ''' + +_parse_unichr = lambda s: compat_chr(int(s, 16)) +  _map = dict( -    (unichr(int(from_, 16)), ''.join((unichr(int(v, 16)) for v in to_.split(' ')))) +    (_parse_unichr(from_), ''.join(map(_parse_unichr, to_.split(' '))))      for from_, type_, to_, _ in ( -        l.split('; ', 3) for l in _map_str.splitlines() if l) +        l.split('; ', 3) for l in _map_str.splitlines() if l and not l[0] == '#')      if type_ in ('C', 'F'))  del _map_str +  def casefold(s):      assert isinstance(s, compat_str)      return ''.join((_map.get(c, c) for c in s)) +  __all__ = [      casefold  ] | 
