Update RegExp unicode mode case folding to conform to the standard (#4004)

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai daniel.batyai@h-lab.eu
This commit is contained in:
Dániel Bátyai
2020-07-20 15:51:43 +02:00
committed by GitHub
parent 33359ac506
commit 321215fdbb
9 changed files with 284 additions and 84 deletions
+101 -32
View File
@@ -27,10 +27,18 @@ from gen_c_source import LICENSE, format_code
from settings import PROJECT_DIR
UNICODE_DATA_FILE = 'UnicodeData.txt'
SPECIAL_CASING_FILE = 'SpecialCasing.txt'
DERIVED_PROPS_FILE = 'DerivedCoreProperties.txt'
PROP_LIST_FILE = 'PropList.txt'
CASE_FOLDING_FILE = 'CaseFolding.txt'
RANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h')
RANGES_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges-sup.inc.h')
CONVERSIONS_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions.inc.h')
CONVERSIONS_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions-sup.inc.h')
FOLDING_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-folding.inc.h')
FOLDING_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-folding-sup.inc.h')
UNICODE_PLANE_TYPE_BASIC = 0
UNICODE_PLANE_TYPE_SUPPLEMENTARY = 1
@@ -266,11 +274,14 @@ class UnicodeBasicCategorizer(object):
if not self.in_range(letter_id) or condition_list:
continue
original_letter = parse_unicode_sequence(line[0])
small_letter = parse_unicode_sequence(line[1])
capital_letter = parse_unicode_sequence(line[3])
lower_case_mapping[letter_id] = small_letter
upper_case_mapping[letter_id] = capital_letter
if small_letter != original_letter:
lower_case_mapping[letter_id] = small_letter
if capital_letter != original_letter:
upper_case_mapping[letter_id] = capital_letter
return lower_case_mapping, upper_case_mapping
@@ -292,12 +303,13 @@ def generate_ranges(script_args, plane_type):
categorizer = UnicodeBasicCategorizer()
header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
" * from %s. Do not edit! */" % os.path.basename(script_args.derived_core_properties),
" * from %s. Do not edit! */" % (DERIVED_PROPS_FILE),
""]
c_source.complete_header("\n".join(header_completion))
units = categorizer.read_units(script_args.derived_core_properties, ["ID_Start", "ID_Continue"])
derived_props_path = os.path.join(script_args.unicode_dir, DERIVED_PROPS_FILE)
units = categorizer.read_units(derived_props_path, ["ID_Start", "ID_Continue"])
units["ID_Continue"] = sorted(set(units["ID_Continue"]).union(categorizer.extra_id_continue_units)
- set(units["ID_Start"]))
@@ -305,7 +317,9 @@ def generate_ranges(script_args, plane_type):
for category, unit in units.items():
c_source.add_range(category, categorizer.create_tables(unit))
white_space_units = categorizer.read_units(script_args.prop_list, ["White_Space"], ["Zs"])["White_Space"]
prop_list_path = os.path.join(script_args.unicode_dir, PROP_LIST_FILE)
white_space_units = categorizer.read_units(prop_list_path, ["White_Space"], ["Zs"])["White_Space"]
c_source.add_whitepace_range("White_Space", categorizer, white_space_units)
@@ -314,6 +328,19 @@ def generate_ranges(script_args, plane_type):
# functions for unicode conversions
def make_char(hex_val):
    """
    Convert a numeric code point into the matching one-character unicode string.

    :param hex_val: Code point of the character, given as an integer.
    :return: Unicode string holding the character for that code point.
    """
    # Python 2 provides unichr(); on Python 3 that name is undefined,
    # and the built-in chr() already produces unicode text.
    try:
        converter = unichr
    except NameError:
        converter = chr
    return converter(hex_val)
def parse_unicode_sequence(raw_data):
"""
@@ -331,10 +358,7 @@ def parse_unicode_sequence(raw_data):
# Convert it to unicode code point (from hex value without 0x prefix)
hex_val = int(unicode_char, 16)
try:
result += unichr(hex_val)
except NameError:
result += chr(hex_val)
result += make_char(hex_val)
return result
@@ -637,17 +661,17 @@ def generate_conversions(script_args, plane_type):
c_source = UnicodeBasicSource(CONVERSIONS_C_SOURCE)
categorizer = UnicodeBasicCategorizer()
unicode_file = os.path.basename(script_args.unicode_data)
spec_casing_file = os.path.basename(script_args.special_casing)
header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
" * from %s and %s files. Do not edit! */" % (unicode_file, spec_casing_file),
" * from %s and %s files. Do not edit! */" % (UNICODE_DATA_FILE, SPECIAL_CASING_FILE),
""]
c_source.complete_header("\n".join(header_completion))
unicode_data_path = os.path.join(script_args.unicode_dir, UNICODE_DATA_FILE)
special_casing_path = os.path.join(script_args.unicode_dir, SPECIAL_CASING_FILE)
# Read the corresponding unicode values of lower and upper case letters and store these in tables
lower_case, upper_case = categorizer.read_case_mappings(script_args.unicode_data, script_args.special_casing)
lower_case, upper_case = categorizer.read_case_mappings(unicode_data_path, special_casing_path)
c_source.add_conversion_range("character_case",
extract_ranges(lower_case, upper_case),
@@ -702,34 +726,76 @@ def generate_conversions(script_args, plane_type):
c_source.generate()
def generate_folding(script_args, plane_type):
"""
Generate the case folding helper tables for one unicode plane type.

Reads the case mappings and the CaseFolding.txt entries found in the unicode
data directory, derives two code point lists from them and writes these as
range tables into the generated C include file.

:param script_args: Parsed command line arguments; only `unicode_dir` is read here.
:param plane_type: UNICODE_PLANE_TYPE_BASIC or UNICODE_PLANE_TYPE_SUPPLEMENTARY;
                   any value other than the supplementary one selects the basic plane.
"""
# Select the output file and the matching categorizer for the requested plane type.
if plane_type == UNICODE_PLANE_TYPE_SUPPLEMENTARY:
c_source = UnicodeSupplementarySource(FOLDING_SUP_C_SOURCE)
categorizer = UnicodeSupplementaryCategorizer()
else:
c_source = UnicodeBasicSource(FOLDING_C_SOURCE)
categorizer = UnicodeBasicCategorizer()
# Header comment of the generated file: names this script and the CaseFolding.txt input.
header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
" * from the %s file. Do not edit! */" % (CASE_FOLDING_FILE),
""]
c_source.complete_header("\n".join(header_completion))
# All input files are looked up by their fixed names inside the given unicode directory.
unicode_data_path = os.path.join(script_args.unicode_dir, UNICODE_DATA_FILE)
special_casing_path = os.path.join(script_args.unicode_dir, SPECIAL_CASING_FILE)
case_folding_path = os.path.join(script_args.unicode_dir, CASE_FOLDING_FILE)
# Read the corresponding unicode values of lower and upper case letters and store these in tables
lower_case, upper_case = categorizer.read_case_mappings(unicode_data_path, special_casing_path)
# folding: code point (int) -> folded character sequence, as returned by parse_unicode_sequence.
folding = {}
with open(case_folding_path, 'r') as case_folding:
# Each data line has the form "<code point>; <type>; <folding>;". Lines that do not match
# (presumably comments and blank lines - verify against the data file) are ignored.
case_folding_re = re.compile(r'(?P<code_point>[^;]*);\s*(?P<type>[^;]*);\s*(?P<folding>[^;]*);')
for line in case_folding:
match = case_folding_re.match(line)
# Only entries of type 'S' and 'C' are used; every other folding type is skipped.
if match and match.group('type') in ('S', 'C'):
code_point = int(match.group('code_point'), 16)
# Keep only the code points that the selected categorizer reports as in range.
if categorizer.in_range(code_point):
folding[code_point] = parse_unicode_sequence(match.group('folding'))
should_to_upper = []
should_skip_to_lower = []
# Code points that have a lower case mapping but no folding entry.
for code_point in lower_case:
if code_point not in folding:
should_skip_to_lower.append(code_point)
# Code points whose folding differs from their lower case mapping; when no lower case
# mapping exists, the folding is compared against the character itself.
for code_point, folded in folding.items():
if lower_case.get(code_point, make_char(code_point)) != folded:
should_to_upper.append(code_point)
# Code points whose folding equals their upper case mapping are added to the skip list.
# NOTE(review): indentation is lost in this view, so it cannot be told whether this check
# is nested inside the condition above or runs for every folding entry - confirm.
if upper_case.get(code_point, '') == folded:
should_skip_to_lower.append(code_point)
# Emit both lists as range tables under their table names, then write the file.
c_source.add_range('folding_skip_to_lower', categorizer.create_tables(should_skip_to_lower))
c_source.add_range('folding_to_upper', categorizer.create_tables(should_to_upper))
c_source.generate()
# entry point
def main():
parser = argparse.ArgumentParser(description='lit-unicode-{conversions,ranges}-{sup}.inc.h generator',
epilog='''
The input files:
- UnicodeData.txt
- SpecialCasing.txt
- DerivedCoreProperties.txt
- PropList.txt
must be retrieved from
http://www.unicode.org/Public/<VERSION>/ucd/.
The input data must be retrieved from
http://www.unicode.org/Public/<VERSION>/ucd/UCD.zip.
The last known good version is 13.0.0.
''')
def check_file(path):
if not os.path.isfile(path) or not os.access(path, os.R_OK):
raise argparse.ArgumentTypeError('The %s file is missing or not readable!' % path)
def check_dir(path):
if not os.path.isdir(path) or not os.access(path, os.R_OK):
raise argparse.ArgumentTypeError('The %s directory does not exist or is not readable!' % path)
return path
parser.add_argument('--unicode-data', metavar='FILE', action='store', required=True,
type=check_file, help='specify the unicode data file')
parser.add_argument('--special-casing', metavar='FILE', action='store', required=True,
type=check_file, help='specify the special casing file')
parser.add_argument('--prop-list', metavar='FILE', action='store', required=True,
type=check_file, help='specify the prop list file')
parser.add_argument('--derived-core-properties', metavar='FILE', action='store', required=True,
type=check_file, help='specify the DerivedCodeProperties file')
parser.add_argument('--unicode-dir', metavar='DIR', action='store', required=True,
type=check_dir, help='specify the unicode data directory')
script_args = parser.parse_args()
@@ -737,6 +803,9 @@ def main():
generate_ranges(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY)
generate_conversions(script_args, UNICODE_PLANE_TYPE_BASIC)
generate_conversions(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY)
generate_folding(script_args, UNICODE_PLANE_TYPE_BASIC)
# There are currently no code points in the supplementary planes that require special folding
# generate_folding(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY)
if __name__ == "__main__":
+1 -1
View File
@@ -310,7 +310,7 @@ max-args=6
ignored-argument-names=_.*
# Maximum number of locals for function / method body
max-locals=15
max-locals=20
# Maximum number of return / yield for function / method body
max-returns=6