Refactor the generator scripts for unicode tables (#1623)

Extract the source code generator methods into a separate `unicode_c_source.py` script.
Fix the generator scripts to make them compatible with both Python2 and Python3.
Remove pylint warnings.

JerryScript-DCO-1.0-Signed-off-by: Robert Sipka rsipka.uszeged@partner.samsung.com
This commit is contained in:
Robert Sipka
2017-03-16 09:32:18 +01:00
committed by László Langó
parent 818c9cd0b0
commit d77d4ae1c6
5 changed files with 654 additions and 746 deletions
+4 -4
View File
@@ -11,10 +11,11 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*
* This file is automatically generated by the unicode_case_conversion.py script. Do not edit!
*/ */
/* This file is automatically generated by the unicode_case_conversion.py script
* from UnicodeData-9.0.0.txt and SpecialCasing-9.0.0.txt files. Do not edit! */
/* Contains start points of character case ranges (these are bidirectional conversions). */ /* Contains start points of character case ranges (these are bidirectional conversions). */
static const uint16_t jerry_character_case_ranges[] JERRY_CONST_DATA = static const uint16_t jerry_character_case_ranges[] JERRY_CONST_DATA =
{ {
@@ -154,9 +155,8 @@ static const uint16_t jerry_upper_case_conversions[] JERRY_CONST_DATA =
0x0046, 0x004c 0x0046, 0x004c
}; };
/* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */ /* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. */
static const uint8_t jerry_upper_case_conversion_counters[] JERRY_CONST_DATA = static const uint8_t jerry_upper_case_conversion_counters[] JERRY_CONST_DATA =
{ {
0x001c, 0x002c, 0x0010 0x001c, 0x002c, 0x0010
}; };
+3 -4
View File
@@ -11,11 +11,11 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*
* This file is automatically generated by the unicode_ranges.py script
* from UnicodeData-3.0.0.txt. Do not edit!
*/ */
/* This file is automatically generated by the unicode_ranges.py script
* from UnicodeData-3.0.0.txt. Do not edit! */
/** /**
* Character interval starting points for the unicode letters. * Character interval starting points for the unicode letters.
* *
@@ -180,4 +180,3 @@ static const uint16_t jerry_unicode_separator_chars[] JERRY_CONST_DATA =
{ {
0x1680, 0x180e, 0x202f, 0x205f, 0x3000 0x1680, 0x180e, 0x202f, 0x205f, 0x3000
}; };
+75
View File
@@ -0,0 +1,75 @@
#!/usr/bin/env python
# Copyright JS Foundation and other contributors, http://js.foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
LICENSE = """/* Copyright JS Foundation and other contributors, http://js.foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/"""
class Source(object):
    """
    Collects the parts of a generated C source file and writes them out.

    The file is made of a header (the license text plus optional completions)
    followed by the table definitions, in the order they were added.
    """

    def __init__(self, filepath):
        """
        :param filepath: Path of the file written by `generate`.
        """
        self.__filepath = filepath
        self.__header = [LICENSE, ""]
        self.__data = []

    def complete_header(self, completion):
        """
        Append additional text to the header.

        :param completion: Text placed after what the header already contains.
        """
        # the trailing empty string produces an extra empty line after the completion
        self.__header.extend([completion, ""])

    def add_table(self, table, table_name, table_type, table_descr):
        """
        Append a `static const` array definition to the body of the file.

        :param table: Values of the array, rendered by `format_code`.
        :param table_name: Name of the array (emitted with a `jerry_` prefix).
        :param table_type: C element type of the array.
        :param table_descr: Text emitted on the line(s) before the definition.
        """
        declaration = "static const %s jerry_%s[] JERRY_CONST_DATA =" % (table_type, table_name)
        self.__data.extend([
            table_descr,
            declaration,
            "{",
            format_code(table, 1),
            "};",
            "",  # for an extra empty line
        ])

    def generate(self):
        """
        Write the header and then the collected tables to the output file.

        The file is opened in 'w' mode, so any previous content is replaced.
        """
        with open(self.__filepath, 'w') as output:
            for section in (self.__header, self.__data):
                output.write("\n".join(section))
def regroup(list_to_group, num):
    """
    Cut a sequence into consecutive slices of at most `num` elements.

    :param list_to_group: A sliceable sequence that supports len() (e.g. a list or a string).
    :param num: Maximum number of elements in one slice.
    :return: A list with the slices in their original order; only the last one can be shorter.
    """
    chunks = []
    for start in range(0, len(list_to_group), num):
        chunks.append(list_to_group[start:start + num])
    return chunks
def hex_format(char):
    """
    Format a code point as a "0x"-prefixed, zero-padded, lowercase hex literal.

    :param char: An integer code point, or a one-character string (text or
                 byte string) that is converted to its code point with ord().
    :return: The literal as a string, at least four hex digits wide (e.g. "0x0061").
    :raises TypeError: If a string of any other length than one is given (raised by ord()).
    """
    # Checking against `str` only would miss `unicode` on Python 2 (the type
    # returned by unichr()) and `bytes` on Python 3; such values would reach the
    # 'x' format code unconverted and fail. `type(u"")` is the text type and
    # `bytes` is the byte string type on both interpreters.
    if isinstance(char, (bytes, type(u""))):
        char = ord(char)

    return "0x{:04x}".format(char)
def format_code(code, indent):
    """
    Render a table of values as indented lines of comma separated hex literals.

    :param code: Iterable of values accepted by `hex_format`.
    :param indent: How many times the indentation string is repeated before each line.
    :return: The lines joined with newlines (an empty string for an empty table).
    """
    # every value is converted to hex format and joined into a single string
    joined = ", ".join([hex_format(char) for char in code])

    # the string is cut into slices of 10 * 8 characters, which is 10 hex numbers
    # per line as long as a number with its ", " separator is 8 characters long
    prefix = ' ' * indent
    return "\n".join([prefix + chunk.strip() for chunk in regroup(joined, 10 * 8)])
+281 -377
View File
@@ -14,6 +14,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
from settings import PROJECT_DIR
from unicode_c_source import Source
import argparse import argparse
import csv import csv
import itertools import itertools
@@ -21,14 +25,181 @@ import os
import sys import sys
import warnings import warnings
try: CONVERSIONS_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions.inc.h')
unichr
except NameError:
unichr = chr
TOOLS_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_DIR = os.path.normpath(os.path.join(TOOLS_DIR, '..')) def main():
C_SOURCE_FILE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions.inc.h') parser = argparse.ArgumentParser()
parser.add_argument('--unicode-data',
metavar='FILE',
action='store',
required=True,
help='specify the unicode data file')
parser.add_argument('--special-casing',
metavar='FILE',
action='store',
required=True,
help='specify the special casing file')
parser.add_argument('--c-source',
metavar='FILE',
action='store',
default=CONVERSIONS_C_SOURCE,
help='specify the output c source for the conversion tables (default: %(default)s)')
script_args = parser.parse_args()
if not os.path.isfile(script_args.unicode_data) or not os.access(script_args.unicode_data, os.R_OK):
print('The %s file is missing or not readable!' % script_args.unicode_data)
sys.exit(1)
if not os.path.isfile(script_args.special_casing) or not os.access(script_args.special_casing, os.R_OK):
print('The %s file is missing or not readable!' % script_args.special_casing)
sys.exit(1)
conv_tables = ConversionTables(script_args.unicode_data, script_args.special_casing)
character_case_ranges = conv_tables.get_character_case_ranges()
character_pair_ranges = conv_tables.get_character_pair_ranges()
character_pairs = conv_tables.get_character_pairs()
upper_case_special_ranges = conv_tables.get_upper_case_special_ranges()
lower_case_ranges = conv_tables.get_lower_case_ranges()
lower_case_conversions = conv_tables.get_lower_case_conversions()
upper_case_conversions = conv_tables.get_upper_case_conversions()
c_source = Source(script_args.c_source)
unicode_file = os.path.basename(script_args.unicode_data)
spec_casing_file = os.path.basename(script_args.special_casing)
header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
" * from %s and %s files. Do not edit! */" % (unicode_file, spec_casing_file),
""]
c_source.complete_header("\n".join(header_completion))
c_source.add_table(character_case_ranges[0],
"character_case_ranges",
"uint16_t",
("/* Contains start points of character case ranges "
"(these are bidirectional conversions). */"))
c_source.add_table(character_case_ranges[1],
"character_case_range_lengths",
"uint8_t",
"/* Interval lengths of start points in `character_case_ranges` table. */")
c_source.add_table(character_pair_ranges[0],
"character_pair_ranges",
"uint16_t",
"/* Contains the start points of bidirectional conversion ranges. */")
c_source.add_table(character_pair_ranges[1],
"character_pair_range_lengths",
"uint8_t",
"/* Interval lengths of start points in `character_pair_ranges` table. */")
c_source.add_table(character_pairs,
"character_pairs",
"uint16_t",
"/* Contains lower/upper case bidirectional conversion pairs. */")
c_source.add_table(upper_case_special_ranges[0],
"upper_case_special_ranges",
"uint16_t",
("/* Contains start points of one-to-two uppercase ranges where the second character\n"
" * is always the same.\n"
" */"))
c_source.add_table(upper_case_special_ranges[1],
"upper_case_special_range_lengths",
"uint8_t",
"/* Interval lengths for start points in `upper_case_special_ranges` table. */")
c_source.add_table(lower_case_ranges[0],
"lower_case_ranges",
"uint16_t",
"/* Contains start points of lowercase ranges. */")
c_source.add_table(lower_case_ranges[1],
"lower_case_range_lengths",
"uint8_t",
"/* Interval lengths for start points in `lower_case_ranges` table. */")
c_source.add_table(lower_case_conversions[0],
"lower_case_conversions",
"uint16_t",
("/* The remaining lowercase conversions. The lowercase variant can "
"be one-to-three character long. */"))
c_source.add_table(lower_case_conversions[1],
"lower_case_conversion_counters",
"uint8_t",
"/* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */")
c_source.add_table(upper_case_conversions[0],
"upper_case_conversions",
"uint16_t",
("/* The remaining uppercase conversions. The uppercase variant can "
"be one-to-three character long. */"))
c_source.add_table(upper_case_conversions[1],
"upper_case_conversion_counters",
"uint8_t",
"/* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. */")
c_source.generate()
class ConversionTables(object):
    """Holds the case conversion tables extracted from the unicode case mapping files."""

    def __init__(self, unicode_data_file, special_casing_file):
        """
        Read the corresponding unicode values of lower and upper case letters and store these in tables

        :param unicode_data_file: Contains the default case mappings (one-to-one mappings).
        :param special_casing_file: Contains additional informative case mappings that are either not one-to-one
                                    or which are context-sensitive.
        """

        lower_case, upper_case = read_case_mappings(unicode_data_file, special_casing_file)

        # The order of the calls below must be kept: the extractors delete the
        # entries they have processed, so each of them works on what is left.
        self.__character_case_ranges = extract_ranges(lower_case, upper_case)
        self.__character_pair_ranges = extract_character_pair_ranges(lower_case, upper_case)
        self.__character_pairs = extract_character_pairs(lower_case, upper_case)
        self.__upper_case_special_ranges = extract_special_ranges(upper_case)
        self.__lower_case_ranges = extract_ranges(lower_case)
        self.__lower_case_conversions = extract_conversions(lower_case)
        self.__upper_case_conversions = extract_conversions(upper_case)

        # Anything still present in the mappings was not covered by any of the tables.
        if lower_case:
            warnings.warn('Not all elements extracted from the lowercase table!')
        if upper_case:
            warnings.warn('Not all elements extracted from the uppercase table!')

    def get_character_case_ranges(self):
        """:return: Result of `extract_ranges` called with both the lowercase and the uppercase mappings."""
        return self.__character_case_ranges

    def get_character_pair_ranges(self):
        """:return: Result of `extract_character_pair_ranges`."""
        return self.__character_pair_ranges

    def get_character_pairs(self):
        """:return: Result of `extract_character_pairs`."""
        return self.__character_pairs

    def get_upper_case_special_ranges(self):
        """:return: Result of `extract_special_ranges` called with the uppercase mappings."""
        return self.__upper_case_special_ranges

    def get_lower_case_ranges(self):
        """:return: Result of `extract_ranges` called with the lowercase mappings only."""
        return self.__lower_case_ranges

    def get_lower_case_conversions(self):
        """:return: Result of `extract_conversions` called with the lowercase mappings."""
        return self.__lower_case_conversions

    def get_upper_case_conversions(self):
        """:return: Result of `extract_conversions` called with the uppercase mappings."""
        return self.__upper_case_conversions
def parse_unicode_sequence(raw_data): def parse_unicode_sequence(raw_data):
@@ -46,7 +217,12 @@ def parse_unicode_sequence(raw_data):
continue continue
# Convert it to unicode code point (from hex value without 0x prefix) # Convert it to unicode code point (from hex value without 0x prefix)
result += unichr(int(unicode_char, 16)) hex_val = int(unicode_char, 16)
try:
result += unichr(hex_val)
except NameError:
result += chr(hex_val)
return result return result
@@ -60,8 +236,8 @@ def read_case_mappings(unicode_data_file, special_casing_file):
:return: Upper and lower case mappings. :return: Upper and lower case mappings.
""" """
lower_case_mapping = CaseMapping() lower_case_mapping = {}
upper_case_mapping = CaseMapping() upper_case_mapping = {}
# Add one-to-one mappings # Add one-to-one mappings
with open(unicode_data_file) as unicode_data: with open(unicode_data_file) as unicode_data:
@@ -78,10 +254,10 @@ def read_case_mappings(unicode_data_file, special_casing_file):
small_letter = line[13] small_letter = line[13]
if capital_letter: if capital_letter:
upper_case_mapping.add(letter_id, parse_unicode_sequence(capital_letter)) upper_case_mapping[letter_id] = parse_unicode_sequence(capital_letter)
if small_letter: if small_letter:
lower_case_mapping.add(letter_id, parse_unicode_sequence(small_letter)) lower_case_mapping[letter_id] = parse_unicode_sequence(small_letter)
# Update the conversion tables with the special cases # Update the conversion tables with the special cases
with open(special_casing_file) as special_casing: with open(special_casing_file) as special_casing:
@@ -107,112 +283,20 @@ def read_case_mappings(unicode_data_file, special_casing_file):
small_letter = parse_unicode_sequence(line[1]) small_letter = parse_unicode_sequence(line[1])
capital_letter = parse_unicode_sequence(line[3]) capital_letter = parse_unicode_sequence(line[3])
lower_case_mapping.add(letter_id, small_letter) lower_case_mapping[letter_id] = small_letter
upper_case_mapping.add(letter_id, capital_letter) upper_case_mapping[letter_id] = capital_letter
return lower_case_mapping, upper_case_mapping return lower_case_mapping, upper_case_mapping
class CaseMapping(dict): def extract_ranges(letter_case, reverse_letter_case=None):
"""Class defines an informative, default mapping."""
def __init__(self):
"""Initialize the case mapping table."""
self._conversion_table = {}
def add(self, letter_id, mapped_value):
"""
Add mapped value of the unicode letter.
:param letter_id: An integer, representing the unicode code point of the character.
:param mapped_value: Corresponding character of the case type.
"""
self._conversion_table[letter_id] = mapped_value
def remove(self, letter_id):
"""
Remove mapping from the conversion table.
:param letter_id: An integer, representing the unicode code point of the character.
"""
del self._conversion_table[letter_id]
def get_value(self, letter_id):
"""
Get the mapped value of the given unicode character.
:param letter_id: An integer, representing the unicode code point of the character.
:return: The mapped value of the character.
"""
if self.contains(letter_id):
return self._conversion_table[letter_id]
return None
def get_conversion_distance(self, letter_id):
"""
Calculate the distance between the unicode character and its mapped value
(only needs and works with one-to-one mappings).
:param letter_id: An integer, representing the unicode code point of the character.
:return: The conversion distance.
"""
mapped_value = self.get_value(letter_id)
if mapped_value and len(mapped_value) == 1:
return ord(mapped_value) - letter_id
return None
def is_bidirectional_conversion(self, letter_id, other_case_mapping):
"""
Check that two unicode value are also a mapping value of each other.
:param letter_id: An integer, representing the unicode code point of the character.
:param other_case_mapping: Comparable case mapping table which possible contains
the return direction of the conversion.
:return: True, if it's a reverible conversion, false otherwise.
"""
if not self.contains(letter_id):
return False
# Check one-to-one mapping
mapped_value = self.get_value(letter_id)
if len(mapped_value) > 1:
return False
# Check two way conversions
mapped_value_id = ord(mapped_value)
if other_case_mapping.get_value(mapped_value_id) != unichr(letter_id):
return False
return True
def contains(self, letter_id):
"""
Check that a unicode character is in the conversion table.
:param letter_id: An integer, representing the unicode code point of the character.
:return: True, if it contains the character, false otherwise.
"""
if letter_id in self._conversion_table:
return True
return False
def get_table(self):
return self._conversion_table
def extract_ranges(self, other_case_mapping=None):
""" """
Extract ranges from case mappings Extract ranges from case mappings
(the second param is optional, if it's not empty, a range will contains bidirectional conversions only). (the second param is optional, if it's not empty, a range will contains bidirectional conversions only).
:param letter_id: An integer, representing the unicode code point of the character. :param letter_id: An integer, representing the unicode code point of the character.
:param other_case_mapping: Comparable case mapping table which contains the return direction of the conversion. :param letter_case: case mappings dictionary which contains the conversions.
:param reverse_letter_case: Comparable case mapping table which contains the return direction of the conversion.
:return: A table with the start points and their mapped value, and another table with the lengths of the ranges. :return: A table with the start points and their mapped value, and another table with the lengths of the ranges.
""" """
@@ -221,33 +305,33 @@ class CaseMapping(dict):
ranges = [] ranges = []
range_lengths = [] range_lengths = []
for letter_id in sorted(self._conversion_table.keys()): for letter_id in sorted(letter_case.keys()):
prev_letter_id = letter_id - 1 prev_letter_id = letter_id - 1
# One-way conversions # One-way conversions
if other_case_mapping is None: if reverse_letter_case is None:
if len(self.get_value(letter_id)) > 1: if len(letter_case[letter_id]) > 1:
in_range = False in_range = False
continue continue
if not self.contains(prev_letter_id) or len(self.get_value(prev_letter_id)) > 1: if prev_letter_id not in letter_case or len(letter_case[prev_letter_id]) > 1:
in_range = False in_range = False
continue continue
# Two way conversions # Two way conversions
else: else:
if not self.is_bidirectional_conversion(letter_id, other_case_mapping): if not is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case):
in_range = False in_range = False
continue continue
if not self.is_bidirectional_conversion(prev_letter_id, other_case_mapping): if not is_bidirectional_conversion(prev_letter_id, letter_case, reverse_letter_case):
in_range = False in_range = False
continue continue
conv_distance = self.get_conversion_distance(letter_id) conv_distance = calculate_conversion_distance(letter_case, letter_id)
prev_conv_distance = self.get_conversion_distance(prev_letter_id) prev_conv_distance = calculate_conversion_distance(letter_case, prev_letter_id)
if (conv_distance != prev_conv_distance): if conv_distance != prev_conv_distance:
in_range = False in_range = False
continue continue
@@ -258,28 +342,27 @@ class CaseMapping(dict):
range_position += 1 range_position += 1
# Add the start point of the range and its mapped value # Add the start point of the range and its mapped value
ranges.extend([prev_letter_id, ord(self.get_value(prev_letter_id))]) ranges.extend([prev_letter_id, ord(letter_case[prev_letter_id])])
range_lengths.append(2) range_lengths.append(2)
# Remove all ranges from the case mapping table. # Remove all ranges from the case mapping table.
index = 0 for idx in range(0, len(ranges), 2):
while index != len(ranges): range_length = range_lengths[idx // 2]
range_length = range_lengths[index // 2]
for incr in range(range_length): for incr in range(range_length):
self.remove(ranges[index] + incr) del letter_case[ranges[idx] + incr]
if other_case_mapping is not None: if reverse_letter_case is not None:
other_case_mapping.remove(ranges[index + 1] + incr) del reverse_letter_case[ranges[idx + 1] + incr]
index += 2
return ranges, range_lengths return ranges, range_lengths
def extract_character_pair_ranges(self, other_case_mapping):
def extract_character_pair_ranges(letter_case, reverse_letter_case):
""" """
Extract two or more character pairs from the case mapping tables. Extract two or more character pairs from the case mapping tables.
:param other_case_mapping: Comparable case mapping table which contains the return direction of the conversion. :param letter_case: case mappings dictionary which contains the conversions.
:param reverse_letter_case: Comparable case mapping table which contains the return direction of the conversion.
:return: A table with the start points, and another table with the lengths of the ranges. :return: A table with the start points, and another table with the lengths of the ranges.
""" """
@@ -288,16 +371,16 @@ class CaseMapping(dict):
in_range = False in_range = False
element_counter = -1 element_counter = -1
for letter_id in sorted(self._conversion_table.keys()): for letter_id in sorted(letter_case.keys()):
# Only extract character pairs # Only extract character pairs
if not self.is_bidirectional_conversion(letter_id, other_case_mapping): if not is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case):
in_range = False in_range = False
continue continue
if self.get_value(letter_id) == unichr(letter_id + 1): if ord(letter_case[letter_id]) == letter_id + 1:
prev_letter_id = letter_id - 2 prev_letter_id = letter_id - 2
if not self.is_bidirectional_conversion(prev_letter_id, other_case_mapping): if not is_bidirectional_conversion(prev_letter_id, letter_case, reverse_letter_case):
in_range = False in_range = False
if in_range: if in_range:
@@ -312,45 +395,46 @@ class CaseMapping(dict):
in_range = False in_range = False
# Remove all founded case mapping from the conversion tables after the scanning method # Remove all founded case mapping from the conversion tables after the scanning method
idx = 0 for idx in range(len(start_points)):
while idx != len(start_points):
letter_id = start_points[idx] letter_id = start_points[idx]
conv_length = lengths[idx] conv_length = lengths[idx]
for incr in range(0, conv_length, 2): for incr in range(0, conv_length, 2):
self.remove(letter_id + incr) del letter_case[letter_id + incr]
other_case_mapping.remove(letter_id + 1 + incr) del reverse_letter_case[letter_id + 1 + incr]
idx += 1
return start_points, lengths return start_points, lengths
def extract_character_pairs(self, other_case_mapping):
def extract_character_pairs(letter_case, reverse_letter_case):
""" """
Extract character pairs. Check that two unicode value are also a mapping value of each other. Extract character pairs. Check that two unicode value are also a mapping value of each other.
:param other_case_mapping: Comparable case mapping table which contains the return direction of the conversion. :param letter_case: case mappings dictionary which contains the conversions.
:param reverse_letter_case: Comparable case mapping table which contains the return direction of the conversion.
:return: A table with character pairs. :return: A table with character pairs.
""" """
character_pairs = [] character_pairs = []
for letter_id in sorted(self._conversion_table.keys()): for letter_id in sorted(letter_case.keys()):
if self.is_bidirectional_conversion(letter_id, other_case_mapping): if is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case):
mapped_value = self.get_value(letter_id) mapped_value = letter_case[letter_id]
character_pairs.extend([letter_id, ord(mapped_value)]) character_pairs.extend([letter_id, ord(mapped_value)])
# Remove character pairs from case mapping tables # Remove character pairs from case mapping tables
self.remove(letter_id) del letter_case[letter_id]
other_case_mapping.remove(ord(mapped_value)) del reverse_letter_case[ord(mapped_value)]
return character_pairs return character_pairs
def extract_special_ranges(self):
def extract_special_ranges(letter_case):
""" """
Extract special ranges. It contains that ranges of one-to-two mappings where the second character Extract special ranges. It contains start points of one-to-two letter case ranges
of the mapped values are equals and the other characters are following each other. where the second character is always the same.
eg.: \u1f80 and \u1f81 will be in one range becase their upper-case values are \u1f08\u0399 and \u1f09\u0399
:param letter_case: case mappings dictionary which contains the conversions.
:return: A table with the start points and their mapped values, and a table with the lengths of the ranges. :return: A table with the start points and their mapped values, and a table with the lengths of the ranges.
""" """
@@ -360,19 +444,19 @@ class CaseMapping(dict):
range_position = -1 range_position = -1
for letter_id in sorted(self._conversion_table.keys()): for letter_id in sorted(letter_case.keys()):
mapped_value = self.get_value(letter_id) mapped_value = letter_case[letter_id]
if len(mapped_value) != 2: if len(mapped_value) != 2:
continue continue
prev_letter_id = letter_id - 1 prev_letter_id = letter_id - 1
if not self.contains(prev_letter_id): if prev_letter_id not in letter_case:
in_range = False in_range = False
continue continue
prev_mapped_value = self.get_value(prev_letter_id) prev_mapped_value = letter_case[prev_letter_id]
if len(prev_mapped_value) != 2: if len(prev_mapped_value) != 2:
continue continue
@@ -394,19 +478,17 @@ class CaseMapping(dict):
special_range_lengths.append(1) special_range_lengths.append(1)
# Remove special ranges from the conversion table # Remove special ranges from the conversion table
idx = 0 for idx in range(0, len(special_ranges), 3):
while idx != len(special_ranges):
range_length = special_range_lengths[idx // 3] range_length = special_range_lengths[idx // 3]
letter_id = special_ranges[idx] letter_id = special_ranges[idx]
for incr in range(range_length): for incr in range(range_length):
self.remove(special_ranges[idx] + incr) del letter_case[special_ranges[idx] + incr]
idx += 3
return special_ranges, special_range_lengths return special_ranges, special_range_lengths
def extract_conversions(self):
def extract_conversions(letter_case):
""" """
Extract conversions. It provide the full (or remained) case mappings from the table. Extract conversions. It provide the full (or remained) case mappings from the table.
The counter table contains the information of how much one-to-one, one-to-two or one-to-three mappings The counter table contains the information of how much one-to-one, one-to-two or one-to-three mappings
@@ -419,263 +501,85 @@ class CaseMapping(dict):
unicode_lengths = [0, 0, 0] unicode_lengths = [0, 0, 0]
# 1 to 1 byte # 1 to 1 byte
for letter_id in sorted(self._conversion_table.keys()): for letter_id in sorted(letter_case.keys()):
mapped_value = self.get_value(letter_id) mapped_value = letter_case[letter_id]
if len(mapped_value) != 1: if len(mapped_value) != 1:
continue continue
unicodes[0].extend([letter_id, ord(mapped_value)]) unicodes[0].extend([letter_id, ord(mapped_value)])
self.remove(letter_id) del letter_case[letter_id]
# 1 to 2 bytes # 1 to 2 bytes
for letter_id in sorted(self._conversion_table.keys()): for letter_id in sorted(letter_case.keys()):
mapped_value = self.get_value(letter_id) mapped_value = letter_case[letter_id]
if len(mapped_value) != 2: if len(mapped_value) != 2:
continue continue
unicodes[1].extend([letter_id, ord(mapped_value[0]), ord(mapped_value[1])]) unicodes[1].extend([letter_id, ord(mapped_value[0]), ord(mapped_value[1])])
self.remove(letter_id) del letter_case[letter_id]
# 1 to 3 bytes # 1 to 3 bytes
for letter_id in sorted(self._conversion_table.keys()): for letter_id in sorted(letter_case.keys()):
mapped_value = self.get_value(letter_id) mapped_value = letter_case[letter_id]
if len(mapped_value) != 3: if len(mapped_value) != 3:
continue continue
unicodes[2].extend([letter_id, ord(mapped_value[0]), ord(mapped_value[1]), ord(mapped_value[2])]) unicodes[2].extend([letter_id, ord(mapped_value[0]), ord(mapped_value[1]), ord(mapped_value[2])])
self.remove(letter_id) del letter_case[letter_id]
unicode_lengths = [int(len(unicodes[0]) / 2), int(len(unicodes[1]) / 3), int(len(unicodes[2]) / 4)] unicode_lengths = [int(len(unicodes[0]) / 2), int(len(unicodes[1]) / 3), int(len(unicodes[2]) / 4)]
return list(itertools.chain.from_iterable(unicodes)), unicode_lengths return list(itertools.chain.from_iterable(unicodes)), unicode_lengths
def regroup(l, n): def is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case):
return [l[i:i+n] for i in range(0, len(l), n)]
def hex_format(ch):
if isinstance(ch, str):
ch = ord(ch)
return "0x{:04x}".format(ch)
def format_code(code, indent):
lines = []
# convert all characters to hex format
converted_code = map(hex_format, code)
# 10 hex number per line
for line in regroup(", ".join(converted_code), 10 * 8):
lines.append((' ' * indent) + line.strip())
return "\n".join(lines)
def create_c_format_table(type_name, array_name, table, description=""):
return """{DESC}
static const {TYPE} jerry_{NAME}[] JERRY_CONST_DATA =
{{
{TABLE}
}};
""".format(DESC=description, TYPE=type_name, NAME=array_name, TABLE=format_code(table, 1))
def copy_tables_to_c_source(gen_tables, c_source):
data = []
header = """/* Copyright JS Foundation and other contributors, http://js.foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This file is automatically generated by the {SCRIPT} script. Do not edit!
*/
""".format(SCRIPT=os.path.basename(__file__))
data.append(header)
character_case_ranges = gen_tables.get_character_case_ranges()
character_pair_ranges = gen_tables.get_character_pair_ranges()
character_pairs = gen_tables.get_character_pairs()
upper_case_special_ranges = gen_tables.get_upper_case_special_ranges()
lower_case_ranges = gen_tables.get_lower_case_ranges()
lower_case_conversions = gen_tables.get_lower_case_conversions()
upper_case_conversions = gen_tables.get_upper_case_conversions()
description = "/* Contains start points of character case ranges (these are bidirectional conversions). */"
data.append(create_c_format_table('uint16_t', 'character_case_ranges',
character_case_ranges[0],
description))
description = "/* Interval lengths of start points in `character_case_ranges` table. */"
data.append(create_c_format_table('uint8_t',
'character_case_range_lengths',
character_case_ranges[1],
description))
description = "/* Contains the start points of bidirectional conversion ranges. */"
data.append(create_c_format_table('uint16_t',
'character_pair_ranges',
character_pair_ranges[0],
description))
description = "/* Interval lengths of start points in `character_pair_ranges` table. */"
data.append(create_c_format_table('uint8_t',
'character_pair_range_lengths',
character_pair_ranges[1],
description))
description = "/* Contains lower/upper case bidirectional conversion pairs. */"
data.append(create_c_format_table('uint16_t',
'character_pairs',
character_pairs,
description))
description = """/* Contains start points of one-to-two uppercase ranges where the second character
* is always the same.
*/"""
data.append(create_c_format_table('uint16_t',
'upper_case_special_ranges',
upper_case_special_ranges[0],
description))
description = "/* Interval lengths for start points in `upper_case_special_ranges` table. */"
data.append(create_c_format_table('uint8_t',
'upper_case_special_range_lengths',
upper_case_special_ranges[1],
description))
description = "/* Contains start points of lowercase ranges. */"
data.append(create_c_format_table('uint16_t', 'lower_case_ranges', lower_case_ranges[0], description))
description = "/* Interval lengths for start points in `lower_case_ranges` table. */"
data.append(create_c_format_table('uint8_t', 'lower_case_range_lengths', lower_case_ranges[1], description))
description = "/* The remaining lowercase conversions. The lowercase variant can be one-to-three character long. */"
data.append(create_c_format_table('uint16_t',
'lower_case_conversions',
lower_case_conversions[0],
description))
description = "/* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */"
data.append(create_c_format_table('uint8_t',
'lower_case_conversion_counters',
lower_case_conversions[1],
description))
description = "/* The remaining uppercase conversions. The uppercase variant can be one-to-three character long. */"
data.append(create_c_format_table('uint16_t',
'upper_case_conversions',
upper_case_conversions[0],
description))
description = "/* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */"
data.append(create_c_format_table('uint8_t',
'upper_case_conversion_counters',
upper_case_conversions[1],
description))
with open(c_source, 'w') as genereted_source:
genereted_source.write(''.join(data))
class GenTables(object):
    """Container for the tables extracted from the lower- and upper-case mapping tables."""

    def __init__(self, lower_case_table, upper_case_table):
        """
        Generate the extracted tables from the given case mapping tables.

        A warning is issued (via `warnings.warn`) for each table whose `get_table()`
        is still non-empty after all the extraction calls have run.

        :param lower_case_table: Lower-case mappings.
        :param upper_case_table: Upper-case mappings.
        """
        # NOTE(review): the call order is kept exactly as in the original. The leftover
        # checks below suggest the extract_* calls consume entries from the tables, in
        # which case the order is significant - confirm before reordering.
        self._character_case_ranges = lower_case_table.extract_ranges(upper_case_table)
        self._character_pair_ranges = lower_case_table.extract_character_pair_ranges(upper_case_table)
        self._character_pairs = lower_case_table.extract_character_pairs(upper_case_table)
        self._upper_case_special_ranges = upper_case_table.extract_special_ranges()
        self._lower_case_ranges = lower_case_table.extract_ranges()
        self._lower_case_conversions = lower_case_table.extract_conversions()
        self._upper_case_conversions = upper_case_table.extract_conversions()

        if lower_case_table.get_table():
            warnings.warn('Not all elements extracted from the lowercase conversion table!')

        if upper_case_table.get_table():
            warnings.warn('Not all elements extracted from the uppercase conversion table!')

    def get_character_case_ranges(self):
        """Return the result of `lower_case_table.extract_ranges(upper_case_table)`."""
        return self._character_case_ranges

    def get_character_pair_ranges(self):
        """Return the result of `lower_case_table.extract_character_pair_ranges(upper_case_table)`."""
        return self._character_pair_ranges

    def get_character_pairs(self):
        """Return the result of `lower_case_table.extract_character_pairs(upper_case_table)`."""
        return self._character_pairs

    def get_upper_case_special_ranges(self):
        """Return the result of `upper_case_table.extract_special_ranges()`."""
        return self._upper_case_special_ranges

    def get_lower_case_ranges(self):
        """Return the result of `lower_case_table.extract_ranges()` (called without a second table)."""
        return self._lower_case_ranges

    def get_lower_case_conversions(self):
        """Return the result of `lower_case_table.extract_conversions()`."""
        return self._lower_case_conversions

    def get_upper_case_conversions(self):
        """Return the result of `upper_case_table.extract_conversions()`."""
        return self._upper_case_conversions
def calculate_conversion_distance(letter_case, letter_id):
    """
    Calculate the distance between the unicode character and its mapped value
    (only needs and works with one-to-one mappings).

    :param letter_case: case mappings dictionary which contains the conversions.
    :param letter_id: An integer, representing the unicode code point of the character.
    :return: The conversion distance, or None if `letter_id` has no mapping or its
             mapped value is longer than one character.
    """
    if letter_id not in letter_case or len(letter_case[letter_id]) > 1:
        return None

    return ord(letter_case[letter_id]) - letter_id


def _exit_unless_readable(file_path):
    """Print an error message and exit with status 1 unless `file_path` is a readable regular file."""
    if not os.path.isfile(file_path) or not os.access(file_path, os.R_OK):
        print('The %s file is missing or not readable!' % file_path)
        sys.exit(1)


def main():
    """
    Parse the command line, read the case mappings and write the generated C source.

    Exits with status 1 if the unicode data file or the special casing file is
    missing or not readable.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--unicode-data',
                        metavar='FILE',
                        action='store',
                        required=True,
                        help='specify the unicode data file')

    parser.add_argument('--special-casing',
                        metavar='FILE',
                        action='store',
                        required=True,
                        help='specify the special casing file')

    parser.add_argument('--c-source',
                        metavar='FILE',
                        action='store',
                        default=C_SOURCE_FILE,
                        help='specify the output c source (default: %(default)s)')

    script_args = parser.parse_args()

    # Same order as the original checks: unicode data first, then special casing.
    _exit_unless_readable(script_args.unicode_data)
    _exit_unless_readable(script_args.special_casing)

    lower_case_table, upper_case_table = read_case_mappings(script_args.unicode_data, script_args.special_casing)

    gen_tables = GenTables(lower_case_table, upper_case_table)
    copy_tables_to_c_source(gen_tables, script_args.c_source)


if __name__ == "__main__":
    main()
Regular → Executable
+108 -178
View File
@@ -26,152 +26,146 @@
# connector punctuation: Pc # connector punctuation: Pc
# separators: Zs # separators: Zs
from __future__ import print_function
from settings import PROJECT_DIR
from unicode_c_source import Source
import argparse import argparse
import bisect import bisect
import csv import csv
import itertools import itertools
import os import os
import sys
TOOLS_DIR = os.path.dirname(os.path.abspath(__file__)) RANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h')
PROJECT_DIR = os.path.normpath(os.path.join(TOOLS_DIR, '..'))
C_SOURCE_FILE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h')
parser = argparse.ArgumentParser() def main():
parser = argparse.ArgumentParser()
parser.add_argument('unicode_data', parser.add_argument('unicode_data',
metavar='FILE', metavar='FILE',
action='store', action='store',
help='specify the unicode data file') help='specify the unicode data file')
parser.add_argument('--c-source', parser.add_argument('--c-source',
metavar='FILE', metavar='FILE',
action='store', action='store',
default=C_SOURCE_FILE, default=RANGES_C_SOURCE,
help='specify the output c source (default: %(default)s)') help='specify the output c source (default: %(default)s)')
script_args = parser.parse_args() script_args = parser.parse_args()
def main():
if not os.path.isfile(script_args.unicode_data) or not os.access(script_args.unicode_data, os.R_OK): if not os.path.isfile(script_args.unicode_data) or not os.access(script_args.unicode_data, os.R_OK):
print('The %s file is missing or not readable!' % script_args.unicode_data) print('The %s file is missing or not readable!' % script_args.unicode_data)
sys.exit(1) sys.exit(1)
letters, non_letters, separators = read_categories() letters, non_letters, separators = read_categories(script_args.unicode_data)
letters_list = list(ranges(letters)) letter_tables = split_list(list(ranges(letters)))
letter_interval_sps, letter_interval_lengths, letter_chars = split_list(letters_list) non_letter_tables = split_list(list(ranges(non_letters)))
separator_tables = split_list(list(ranges(separators)))
non_letters_list = list(ranges(non_letters)) c_source = Source(script_args.c_source)
non_letter_interval_sps, non_letter_interval_lengths, non_letter_chars = split_list(non_letters_list)
separator_list = list(ranges(separators)) header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
separator_interval_sps, separator_interval_lengths, separator_chars = split_list(separator_list) " * from %s. Do not edit! */" % os.path.basename(script_args.unicode_data),
""]
source = GenSource() c_source.complete_header("\n".join(header_completion))
letter_interval_sps_desc = """/** c_source.add_table(letter_tables[0],
* Character interval starting points for the unicode letters.
*
* The characters covered by these intervals are from
* the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl
*/"""
source.add_table("uint16_t",
"unicode_letter_interval_sps", "unicode_letter_interval_sps",
letter_interval_sps, "uint16_t",
letter_interval_sps_desc) ("/**\n"
" * Character interval starting points for the unicode letters.\n"
" *\n"
" * The characters covered by these intervals are from\n"
" * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n"
" */"))
letter_interval_lengths_desc = """/** c_source.add_table(letter_tables[1],
* Character lengths for the unicode letters.
*
* The characters covered by these intervals are from
* the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl
*/"""
source.add_table("uint8_t",
"unicode_letter_interval_lengths", "unicode_letter_interval_lengths",
letter_interval_lengths, "uint8_t",
letter_interval_lengths_desc) ("/**\n"
" * Character lengths for the unicode letters.\n"
" *\n"
" * The characters covered by these intervals are from\n"
" * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n"
" */"))
letter_chars_desc = """/** c_source.add_table(letter_tables[2],
* Those unicode letter characters that are not inside any of
* the intervals specified in jerry_unicode_letter_interval_sps array.
*
* The characters are from the following Unicode categories:
* Lu, Ll, Lt, Lm, Lo, Nl
*/"""
source.add_table("uint16_t",
"unicode_letter_chars", "unicode_letter_chars",
letter_chars, "uint16_t",
letter_chars_desc) ("/**\n"
" * Those unicode letter characters that are not inside any of\n"
" * the intervals specified in jerry_unicode_letter_interval_sps array.\n"
" *\n"
" * The characters are from the following Unicode categories:\n"
" * Lu, Ll, Lt, Lm, Lo, Nl\n"
" */"))
non_letter_interval_sps_desc = """/** c_source.add_table(non_letter_tables[0],
* Character interval starting points for non-letter character
* that can be used as a non-first character of an identifier.
*
* The characters covered by these intervals are from
* the following Unicode categories: Nd, Mn, Mc, Pc
*/"""
source.add_table("uint16_t",
"unicode_non_letter_ident_part_interval_sps", "unicode_non_letter_ident_part_interval_sps",
non_letter_interval_sps, "uint16_t",
non_letter_interval_sps_desc) ("/**\n"
" * Character interval starting points for non-letter character\n"
" * that can be used as a non-first character of an identifier.\n"
" *\n"
" * The characters covered by these intervals are from\n"
" * the following Unicode categories: Nd, Mn, Mc, Pc\n"
" */"))
non_letter_interval_lengths_desc = """/** c_source.add_table(non_letter_tables[1],
* Character interval lengths for non-letter character
* that can be used as a non-first character of an identifier.
*
* The characters covered by these intervals are from
* the following Unicode categories: Nd, Mn, Mc, Pc
*/"""
source.add_table("uint8_t",
"unicode_non_letter_ident_part_interval_lengths", "unicode_non_letter_ident_part_interval_lengths",
non_letter_interval_lengths, "uint8_t",
non_letter_interval_lengths_desc) ("/**\n"
" * Character interval lengths for non-letter character\n"
" * that can be used as a non-first character of an identifier.\n"
" *\n"
" * The characters covered by these intervals are from\n"
" * the following Unicode categories: Nd, Mn, Mc, Pc\n"
" */"))
non_letter_chars_desc = """/** c_source.add_table(non_letter_tables[2],
* Those non-letter characters that can be used as a non-first
* character of an identifier and not included in any of the intervals
* specified in jerry_unicode_non_letter_ident_part_interval_sps array.
*
* The characters are from the following Unicode categories:
* Nd, Mn, Mc, Pc
*/"""
source.add_table("uint16_t",
"unicode_non_letter_ident_part_chars", "unicode_non_letter_ident_part_chars",
non_letter_chars, "uint16_t",
non_letter_chars_desc) ("/**\n"
" * Those non-letter characters that can be used as a non-first\n"
" * character of an identifier and not included in any of the intervals\n"
" * specified in jerry_unicode_non_letter_ident_part_interval_sps array.\n"
" *\n"
" * The characters are from the following Unicode categories:\n"
" * Nd, Mn, Mc, Pc\n"
" */"))
separator_interval_sps_desc = """/** c_source.add_table(separator_tables[0],
* Unicode separator character interval starting points from Unicode category: Zs
*/"""
source.add_table("uint16_t",
"unicode_separator_char_interval_sps", "unicode_separator_char_interval_sps",
separator_interval_sps, "uint16_t",
separator_interval_sps_desc) ("/**\n"
" * Unicode separator character interval starting points from Unicode category: Zs\n"
" */"))
separator_interval_lengths_desc = """/** c_source.add_table(separator_tables[1],
* Unicode separator character interval lengths from Unicode category: Zs
*/"""
source.add_table("uint8_t",
"unicode_separator_char_interval_lengths", "unicode_separator_char_interval_lengths",
separator_interval_lengths, "uint8_t",
separator_interval_lengths_desc) ("/**\n"
" * Unicode separator character interval lengths from Unicode category: Zs\n"
" */"))
separator_chars_desc = """/** c_source.add_table(separator_tables[2],
* Unicode separator characters that are not in the
* jerry_unicode_separator_char_intervals array.
*
* Unicode category: Zs
*/"""
source.add_table("uint16_t",
"unicode_separator_chars", "unicode_separator_chars",
separator_chars, "uint16_t",
separator_chars_desc) ("/**\n"
" * Unicode separator characters that are not in the\n"
" * jerry_unicode_separator_char_intervals array.\n"
" *\n"
" * Unicode category: Zs\n"
" */"))
source.write_source() c_source.generate()
def read_categories(): def read_categories(unicode_data_file):
""" """
Read the corresponding unicode values and store them in category lists. Read the corresponding unicode values and store them in category lists.
@@ -186,7 +180,7 @@ def read_categories():
non_letters = [] non_letters = []
separators = [] separators = []
with open(script_args.unicode_data) as unicode_data: with open(unicode_data_file) as unicode_data:
unicode_data_reader = csv.reader(unicode_data, delimiter=';') unicode_data_reader = csv.reader(unicode_data, delimiter=';')
for line in unicode_data_reader: for line in unicode_data_reader:
@@ -228,10 +222,9 @@ def ranges(i):
:return: List of ranges. :return: List of ranges.
""" """
for _, group in itertools.groupby(enumerate(i), lambda q: (q[1] - q[0])):
for a, b in itertools.groupby(enumerate(i), lambda (x, y): y - x): group = list(group)
b = list(b) yield group[0][1], group[-1][1]
yield b[0][1], b[-1][1]
def split_list(category_list): def split_list(category_list):
@@ -241,87 +234,24 @@ def split_list(category_list):
:return: List of interval starting points, interval lengths and single chars :return: List of interval starting points, interval lengths and single chars
""" """
unicode_category_interval_sps = [] interval_sps = []
unicode_category_interval_lengths = [] interval_lengths = []
unicode_category_chars = [] chars = []
for element in category_list: for element in category_list:
interval_length = element[1] - element[0] interval_length = element[1] - element[0]
if interval_length == 0: if interval_length == 0:
unicode_category_chars.append(element[0]) chars.append(element[0])
elif interval_length > 255:
elif (interval_length > 255):
for i in range(element[0], element[1], 256): for i in range(element[0], element[1], 256):
length = 255 if (element[1] - i > 255) else (element[1] - i) length = 255 if (element[1] - i > 255) else (element[1] - i)
unicode_category_interval_sps.append(i) interval_sps.append(i)
unicode_category_interval_lengths.append(length) interval_lengths.append(length)
else: else:
unicode_category_interval_sps.append(element[0]) interval_sps.append(element[0])
unicode_category_interval_lengths.append(element[1] - element[0]) interval_lengths.append(element[1] - element[0])
return unicode_category_interval_sps, unicode_category_interval_lengths, unicode_category_chars return interval_sps, interval_lengths, chars
class GenSource(object):
    """
    Class defines a default generated c source.

    The source is accumulated as a list of text fragments: the license header is
    added on construction, `add_table` appends one C array definition per call and
    `write_source` writes everything to the output file.

    Both `__init__` and `write_source` read the module-level `script_args` object
    (`unicode_data` and `c_source` respectively), so it must exist before use.
    """

    # NOTE(review): the template literals below were recovered from a listing with
    # stripped leading whitespace and no blank lines. The leading space of the comment
    # continuation lines was restored; blank lines and the indent width used by
    # `_format_code` should be confirmed against the checked-in generated header.

    def __init__(self):
        """Start the source with the license header naming this script and the unicode data file."""
        self._data = []

        header = """/* Copyright JS Foundation and other contributors, http://js.foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This file is automatically generated by the {SCRIPT} script
 * from {UNICODES}. Do not edit!
 */
""".format(SCRIPT=os.path.basename(__file__), UNICODES=os.path.basename(script_args.unicode_data))

        self._data.append(header)

    def _regroup(self, l, n):
        """Split the sequence `l` into consecutive slices of at most `n` items."""
        return [l[i:i+n] for i in range(0, len(l), n)]

    def _hex_format(self, ch):
        """Format `ch` (an integer, or a one-character string) as a zero-padded hex literal."""
        if isinstance(ch, str):
            ch = ord(ch)

        return "0x{:04x}".format(ch)

    def _format_code(self, code, indent):
        """
        Format the values of `code` as comma separated hex literals, 10 per line.

        :param code: Iterable of integers (or one-character strings).
        :param indent: Number of indent units put in front of every line.
        :return: The formatted lines joined by newlines; an empty string for empty input.
        """
        # Group by element rather than by character count, so a literal is never
        # split across two lines even if it is wider than four hex digits.
        hex_values = [self._hex_format(ch) for ch in code]
        prefix = ' ' * indent

        # Every line but the last one ends with a comma.
        return ",\n".join(prefix + ", ".join(row) for row in self._regroup(hex_values, 10))

    def add_table(self, type_name, array_name, table, description=""):
        """
        Append a C array definition to the source.

        :param type_name: C element type of the array.
        :param array_name: Name of the array; it is emitted with a `jerry_` prefix.
        :param table: Values of the array.
        :param description: Text placed right before the definition (e.g. a C comment).
        """
        table_str = """{DESC}
static const {TYPE} jerry_{NAME}[] JERRY_CONST_DATA =
{{
{TABLE}
}};
""".format(DESC=description, TYPE=type_name, NAME=array_name, TABLE=self._format_code(table, 1))

        self._data.append(table_str)

    def write_source(self):
        """Write the accumulated fragments to the file named by `script_args.c_source`."""
        with open(script_args.c_source, 'w') as generated_source:
            generated_source.write(''.join(self._data))
if __name__ == "__main__": if __name__ == "__main__":