Improve gen-unicode.py to support unicode ranges (#2944)
This patch fixes #2936. Co-authored-by: Gabor Loki loki@inf.u-szeged.hu JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik frobert@inf.u-szeged.hu
This commit is contained in:
committed by
Dániel Bátyai
parent
e902b870aa
commit
2b8c428694
@@ -47,14 +47,29 @@ static const uint16_t lit_unicode_letter_interval_sps[] JERRY_ATTR_CONST_DATA =
|
||||
0x2145, 0x2160, 0x2c00, 0x2c30, 0x2c60, 0x2ceb, 0x2cf2, 0x2d00, 0x2d30, 0x2d80,
|
||||
0x2da0, 0x2da8, 0x2db0, 0x2db8, 0x2dc0, 0x2dc8, 0x2dd0, 0x2dd8, 0x3005, 0x3021,
|
||||
0x3031, 0x3038, 0x3041, 0x309d, 0x30a1, 0x30fc, 0x3105, 0x3131, 0x31a0, 0x31f0,
|
||||
0xa000, 0xa100, 0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a,
|
||||
0xa640, 0xa67f, 0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807,
|
||||
0xa80c, 0xa840, 0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6,
|
||||
0xa9fa, 0xaa00, 0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0,
|
||||
0xaaf2, 0xab01, 0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xd7b0,
|
||||
0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a, 0xfb38, 0xfb40,
|
||||
0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70, 0xfe76, 0xff21,
|
||||
0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
|
||||
0x3400, 0x3500, 0x3600, 0x3700, 0x3800, 0x3900, 0x3a00, 0x3b00, 0x3c00, 0x3d00,
|
||||
0x3e00, 0x3f00, 0x4000, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700,
|
||||
0x4800, 0x4900, 0x4a00, 0x4b00, 0x4c00, 0x4d00, 0x4e00, 0x4f00, 0x5000, 0x5100,
|
||||
0x5200, 0x5300, 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5a00, 0x5b00,
|
||||
0x5c00, 0x5d00, 0x5e00, 0x5f00, 0x6000, 0x6100, 0x6200, 0x6300, 0x6400, 0x6500,
|
||||
0x6600, 0x6700, 0x6800, 0x6900, 0x6a00, 0x6b00, 0x6c00, 0x6d00, 0x6e00, 0x6f00,
|
||||
0x7000, 0x7100, 0x7200, 0x7300, 0x7400, 0x7500, 0x7600, 0x7700, 0x7800, 0x7900,
|
||||
0x7a00, 0x7b00, 0x7c00, 0x7d00, 0x7e00, 0x7f00, 0x8000, 0x8100, 0x8200, 0x8300,
|
||||
0x8400, 0x8500, 0x8600, 0x8700, 0x8800, 0x8900, 0x8a00, 0x8b00, 0x8c00, 0x8d00,
|
||||
0x8e00, 0x8f00, 0x9000, 0x9100, 0x9200, 0x9300, 0x9400, 0x9500, 0x9600, 0x9700,
|
||||
0x9800, 0x9900, 0x9a00, 0x9b00, 0x9c00, 0x9d00, 0x9e00, 0x9f00, 0xa000, 0xa100,
|
||||
0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a, 0xa640, 0xa67f,
|
||||
0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807, 0xa80c, 0xa840,
|
||||
0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6, 0xa9fa, 0xaa00,
|
||||
0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0, 0xaaf2, 0xab01,
|
||||
0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xac00, 0xad00, 0xae00,
|
||||
0xaf00, 0xb000, 0xb100, 0xb200, 0xb300, 0xb400, 0xb500, 0xb600, 0xb700, 0xb800,
|
||||
0xb900, 0xba00, 0xbb00, 0xbc00, 0xbd00, 0xbe00, 0xbf00, 0xc000, 0xc100, 0xc200,
|
||||
0xc300, 0xc400, 0xc500, 0xc600, 0xc700, 0xc800, 0xc900, 0xca00, 0xcb00, 0xcc00,
|
||||
0xcd00, 0xce00, 0xcf00, 0xd000, 0xd100, 0xd200, 0xd300, 0xd400, 0xd500, 0xd600,
|
||||
0xd700, 0xd7b0, 0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a,
|
||||
0xfb38, 0xfb40, 0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70,
|
||||
0xfe76, 0xff21, 0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -88,14 +103,29 @@ static const uint8_t lit_unicode_letter_interval_lengths[] JERRY_ATTR_CONST_DATA
|
||||
0x0004, 0x0028, 0x002e, 0x002e, 0x0084, 0x0003, 0x0001, 0x0025, 0x0037, 0x0016,
|
||||
0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0002, 0x0008,
|
||||
0x0004, 0x0004, 0x0055, 0x0002, 0x0059, 0x0003, 0x0028, 0x005d, 0x001a, 0x000f,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001,
|
||||
0x002e, 0x001e, 0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003,
|
||||
0x0016, 0x0033, 0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009,
|
||||
0x0004, 0x0028, 0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a,
|
||||
0x0002, 0x0005, 0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x0016,
|
||||
0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c, 0x0004, 0x0001,
|
||||
0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004, 0x0086, 0x0019,
|
||||
0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00b5, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00d5, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001, 0x002e, 0x001e,
|
||||
0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003, 0x0016, 0x0033,
|
||||
0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009, 0x0004, 0x0028,
|
||||
0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a, 0x0002, 0x0005,
|
||||
0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
|
||||
0x00a3, 0x0016, 0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c,
|
||||
0x0004, 0x0001, 0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004,
|
||||
0x0086, 0x0019, 0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -114,8 +144,8 @@ static const uint16_t lit_unicode_letter_chars[] JERRY_ATTR_CONST_DATA =
|
||||
0x0e8d, 0x0ea5, 0x0ea7, 0x0ebd, 0x0ec6, 0x0f00, 0x103f, 0x1061, 0x108e, 0x10c7,
|
||||
0x10cd, 0x1258, 0x12c0, 0x17d7, 0x17dc, 0x18aa, 0x1aa7, 0x1f59, 0x1f5b, 0x1f5d,
|
||||
0x1fbe, 0x2071, 0x207f, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x214e,
|
||||
0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4db5, 0x4e00, 0x9fd5, 0xa8fb, 0xa8fd,
|
||||
0xa9cf, 0xaa7a, 0xaab1, 0xaac0, 0xaac2, 0xac00, 0xd7a3, 0xfb1d, 0xfb3e
|
||||
0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4e00, 0xa8fb, 0xa8fd, 0xa9cf, 0xaa7a,
|
||||
0xaab1, 0xaac0, 0xaac2, 0xac00, 0xfb1d, 0xfb3e
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
// Copyright JS Foundation and other contributors, http://js.foundation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
var 测试 = "您好";
|
||||
assert(测试 === "您好");
|
||||
+69
-48
@@ -57,67 +57,87 @@ class UniCodeSource(object):
|
||||
generated_source.write("\n".join(self.__header))
|
||||
generated_source.write("\n".join(self.__data))
|
||||
|
||||
class UnicodeCategorizer(object):
|
||||
def __init__(self):
|
||||
# unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs
|
||||
# Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
|
||||
# letter: Lu Ll Lt Lm Lo Nl
|
||||
# non-letter-indent-part:
|
||||
# digit: Nd
|
||||
# punctuation mark: Mn Mc
|
||||
# connector punctuation: Pc
|
||||
# separators: Zs
|
||||
self._unicode_categories = {
|
||||
'letters_category' : ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"],
|
||||
'non_letters_category' : ["Nd", "Mn", "Mc", "Pc"],
|
||||
'separators_category' : ["Zs"]
|
||||
}
|
||||
|
||||
# functions for unicode ranges
|
||||
self._categories = {
|
||||
'letters' : [],
|
||||
'non_letters' : [],
|
||||
'separators' : []
|
||||
}
|
||||
|
||||
def _store_by_category(self, unicode_id, category):
|
||||
"""
|
||||
Store the given unicode_id by its category
|
||||
"""
|
||||
for target_category in self._categories:
|
||||
if category in self._unicode_categories[target_category + '_category']:
|
||||
self._categories[target_category].append(unicode_id)
|
||||
|
||||
def read_categories(unicode_data_file):
|
||||
"""
|
||||
Read the corresponding unicode values and store them in category lists.
|
||||
def read_categories(self, unicode_data_file):
|
||||
"""
|
||||
Read the corresponding unicode values and store them in category lists.
|
||||
|
||||
:return: List of letters, non_letter and separators.
|
||||
"""
|
||||
:return: List of letters, non_letter and separators.
|
||||
"""
|
||||
|
||||
# unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
|
||||
# letter: Lu Ll Lt Lm Lo Nl
|
||||
# non-letter-indent-part:
|
||||
# digit: Nd
|
||||
# punctuation mark: Mn Mc
|
||||
# connector punctuation: Pc
|
||||
# separators: Zs
|
||||
letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
|
||||
non_letter_category = ["Nd", "Mn", "Mc", "Pc"]
|
||||
separator_category = ["Zs"]
|
||||
range_start_id = 0
|
||||
|
||||
letters = []
|
||||
non_letters = []
|
||||
separators = []
|
||||
with open(unicode_data_file) as unicode_data:
|
||||
for line in csv.reader(unicode_data, delimiter=';'):
|
||||
unicode_id = int(line[0], 16)
|
||||
|
||||
with open(unicode_data_file) as unicode_data:
|
||||
for line in csv.reader(unicode_data, delimiter=';'):
|
||||
unicode_id = int(line[0], 16)
|
||||
# Skip supplementary planes and ascii chars
|
||||
if unicode_id >= 0x10000 or unicode_id < 128:
|
||||
continue
|
||||
|
||||
# Skip supplementary planes and ascii chars
|
||||
if unicode_id >= 0x10000 or unicode_id < 128:
|
||||
continue
|
||||
category = line[2]
|
||||
|
||||
category = line[2]
|
||||
if range_start_id != 0:
|
||||
while range_start_id <= unicode_id:
|
||||
self._store_by_category(range_start_id, category)
|
||||
range_start_id += 1
|
||||
range_start_id = 0
|
||||
continue
|
||||
|
||||
if category in letter_category:
|
||||
letters.append(unicode_id)
|
||||
elif category in non_letter_category:
|
||||
non_letters.append(unicode_id)
|
||||
elif category in separator_category:
|
||||
separators.append(unicode_id)
|
||||
if line[1].startswith('<'):
|
||||
# Save the start position of the range
|
||||
range_start_id = unicode_id
|
||||
|
||||
# This separator char is handled separately
|
||||
non_breaking_space = 0x00A0
|
||||
if non_breaking_space in separators:
|
||||
separators.remove(int(non_breaking_space))
|
||||
self._store_by_category(unicode_id, category)
|
||||
|
||||
# These separator chars are not in the unicode data file or not in Zs category
|
||||
mongolian_vowel_separator = 0x180E
|
||||
medium_mathematical_space = 0x205F
|
||||
zero_width_space = 0x200B
|
||||
# This separator char is handled separately
|
||||
separators = self._categories['separators']
|
||||
non_breaking_space = 0x00A0
|
||||
if non_breaking_space in separators:
|
||||
separators.remove(int(non_breaking_space))
|
||||
|
||||
if mongolian_vowel_separator not in separators:
|
||||
bisect.insort(separators, int(mongolian_vowel_separator))
|
||||
if medium_mathematical_space not in separators:
|
||||
bisect.insort(separators, int(medium_mathematical_space))
|
||||
if zero_width_space not in separators:
|
||||
bisect.insort(separators, int(zero_width_space))
|
||||
# These separator chars are not in the unicode data file or not in Zs category
|
||||
mongolian_vowel_separator = 0x180E
|
||||
medium_mathematical_space = 0x205F
|
||||
zero_width_space = 0x200B
|
||||
|
||||
return letters, non_letters, separators
|
||||
if mongolian_vowel_separator not in separators:
|
||||
bisect.insort(separators, int(mongolian_vowel_separator))
|
||||
if medium_mathematical_space not in separators:
|
||||
bisect.insort(separators, int(medium_mathematical_space))
|
||||
if zero_width_space not in separators:
|
||||
bisect.insort(separators, int(zero_width_space))
|
||||
|
||||
return self._categories['letters'], self._categories['non_letters'], self._categories['separators']
|
||||
|
||||
|
||||
def group_ranges(i):
|
||||
@@ -159,7 +179,8 @@ def split_list(category_list):
|
||||
|
||||
|
||||
def generate_ranges(script_args):
|
||||
letters, non_letters, separators = read_categories(script_args.unicode_data)
|
||||
categorizer = UnicodeCategorizer()
|
||||
letters, non_letters, separators = categorizer.read_categories(script_args.unicode_data)
|
||||
|
||||
letter_tables = split_list(list(group_ranges(letters)))
|
||||
non_letter_tables = split_list(list(group_ranges(non_letters)))
|
||||
|
||||
Reference in New Issue
Block a user