Improve gen-unicode.py to support unicode ranges (#2944)

This patch fixes #2936.

Co-authored-by: Gabor Loki <loki@inf.u-szeged.hu>
JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik <frobert@inf.u-szeged.hu>
2019-07-05 15:44:56 +02:00
parent e902b870aa
commit 2b8c428694
3 changed files with 133 additions and 66 deletions
@@ -47,14 +47,29 @@ static const uint16_t lit_unicode_letter_interval_sps[] JERRY_ATTR_CONST_DATA =
  0x2145, 0x2160, 0x2c00, 0x2c30, 0x2c60, 0x2ceb, 0x2cf2, 0x2d00, 0x2d30, 0x2d80,
  0x2da0, 0x2da8, 0x2db0, 0x2db8, 0x2dc0, 0x2dc8, 0x2dd0, 0x2dd8, 0x3005, 0x3021,
  0x3031, 0x3038, 0x3041, 0x309d, 0x30a1, 0x30fc, 0x3105, 0x3131, 0x31a0, 0x31f0,
-  0xa000, 0xa100, 0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a,
-  0xa640, 0xa67f, 0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807,
-  0xa80c, 0xa840, 0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6,
-  0xa9fa, 0xaa00, 0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0,
-  0xaaf2, 0xab01, 0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xd7b0,
-  0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a, 0xfb38, 0xfb40,
-  0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70, 0xfe76, 0xff21,
-  0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
+  0x3400, 0x3500, 0x3600, 0x3700, 0x3800, 0x3900, 0x3a00, 0x3b00, 0x3c00, 0x3d00,
+  0x3e00, 0x3f00, 0x4000, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700,
+  0x4800, 0x4900, 0x4a00, 0x4b00, 0x4c00, 0x4d00, 0x4e00, 0x4f00, 0x5000, 0x5100,
+  0x5200, 0x5300, 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5a00, 0x5b00,
+  0x5c00, 0x5d00, 0x5e00, 0x5f00, 0x6000, 0x6100, 0x6200, 0x6300, 0x6400, 0x6500,
+  0x6600, 0x6700, 0x6800, 0x6900, 0x6a00, 0x6b00, 0x6c00, 0x6d00, 0x6e00, 0x6f00,
+  0x7000, 0x7100, 0x7200, 0x7300, 0x7400, 0x7500, 0x7600, 0x7700, 0x7800, 0x7900,
+  0x7a00, 0x7b00, 0x7c00, 0x7d00, 0x7e00, 0x7f00, 0x8000, 0x8100, 0x8200, 0x8300,
+  0x8400, 0x8500, 0x8600, 0x8700, 0x8800, 0x8900, 0x8a00, 0x8b00, 0x8c00, 0x8d00,
+  0x8e00, 0x8f00, 0x9000, 0x9100, 0x9200, 0x9300, 0x9400, 0x9500, 0x9600, 0x9700,
+  0x9800, 0x9900, 0x9a00, 0x9b00, 0x9c00, 0x9d00, 0x9e00, 0x9f00, 0xa000, 0xa100,
+  0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a, 0xa640, 0xa67f,
+  0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807, 0xa80c, 0xa840,
+  0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6, 0xa9fa, 0xaa00,
+  0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0, 0xaaf2, 0xab01,
+  0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xac00, 0xad00, 0xae00,
+  0xaf00, 0xb000, 0xb100, 0xb200, 0xb300, 0xb400, 0xb500, 0xb600, 0xb700, 0xb800,
+  0xb900, 0xba00, 0xbb00, 0xbc00, 0xbd00, 0xbe00, 0xbf00, 0xc000, 0xc100, 0xc200,
+  0xc300, 0xc400, 0xc500, 0xc600, 0xc700, 0xc800, 0xc900, 0xca00, 0xcb00, 0xcc00,
+  0xcd00, 0xce00, 0xcf00, 0xd000, 0xd100, 0xd200, 0xd300, 0xd400, 0xd500, 0xd600,
+  0xd700, 0xd7b0, 0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a,
+  0xfb38, 0xfb40, 0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70,
+  0xfe76, 0xff21, 0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
 };

 /**
@@ -88,14 +103,29 @@ static const uint8_t lit_unicode_letter_interval_lengths[] JERRY_ATTR_CONST_DATA
  0x0004, 0x0028, 0x002e, 0x002e, 0x0084, 0x0003, 0x0001, 0x0025, 0x0037, 0x0016,
  0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0002, 0x0008,
  0x0004, 0x0004, 0x0055, 0x0002, 0x0059, 0x0003, 0x0028, 0x005d, 0x001a, 0x000f,
-  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001,
-  0x002e, 0x001e, 0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003,
-  0x0016, 0x0033, 0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009,
-  0x0004, 0x0028, 0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a,
-  0x0002, 0x0005, 0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x0016,
-  0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c, 0x0004, 0x0001,
-  0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004, 0x0086, 0x0019,
-  0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00b5, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00d5, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001, 0x002e, 0x001e,
+  0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003, 0x0016, 0x0033,
+  0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009, 0x0004, 0x0028,
+  0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a, 0x0002, 0x0005,
+  0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00a3, 0x0016, 0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c,
+  0x0004, 0x0001, 0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004,
+  0x0086, 0x0019, 0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
 };

 /**
@@ -114,8 +144,8 @@ static const uint16_t lit_unicode_letter_chars[] JERRY_ATTR_CONST_DATA =
  0x0e8d, 0x0ea5, 0x0ea7, 0x0ebd, 0x0ec6, 0x0f00, 0x103f, 0x1061, 0x108e, 0x10c7,
  0x10cd, 0x1258, 0x12c0, 0x17d7, 0x17dc, 0x18aa, 0x1aa7, 0x1f59, 0x1f5b, 0x1f5d,
  0x1fbe, 0x2071, 0x207f, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x214e,
-  0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4db5, 0x4e00, 0x9fd5, 0xa8fb, 0xa8fd,
-  0xa9cf, 0xaa7a, 0xaab1, 0xaac0, 0xaac2, 0xac00, 0xd7a3, 0xfb1d, 0xfb3e
+  0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4e00, 0xa8fb, 0xa8fd, 0xa9cf, 0xaa7a,
+  0xaab1, 0xaac0, 0xaac2, 0xac00, 0xfb1d, 0xfb3e
 };

 /**
@@ -0,0 +1,16 @@
+// Copyright JS Foundation and other contributors, http://js.foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+var 测试 = "您好";
+assert(测试 === "您好");
@@ -57,67 +57,87 @@ class UniCodeSource(object):
            generated_source.write("\n".join(self.__header))
            generated_source.write("\n".join(self.__data))

+class UnicodeCategorizer(object):
+    def __init__(self):
+        # unicode categories:      Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs
+        #                          Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
+        # letter:                  Lu Ll Lt Lm Lo Nl
+        # non-letter-ident-part:
+        #   digit:                 Nd
+        #   punctuation mark:      Mn Mc
+        #   connector punctuation: Pc
+        # separators:              Zs
+        self._unicode_categories = {
+            'letters_category' : ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"],
+            'non_letters_category' : ["Nd", "Mn", "Mc", "Pc"],
+            'separators_category' : ["Zs"]
+        }

-# functions for unicode ranges
+        self._categories = {
+            'letters' : [],
+            'non_letters' : [],
+            'separators' : []
+        }

+    def _store_by_category(self, unicode_id, category):
+        """
+        Store the given unicode_id by its category
+        """
+        for target_category in self._categories:
+            if category in self._unicode_categories[target_category + '_category']:
+                self._categories[target_category].append(unicode_id)

-def read_categories(unicode_data_file):
-    """
-    Read the corresponding unicode values and store them in category lists.
+    def read_categories(self, unicode_data_file):
+        """
+        Read the corresponding unicode values and store them in category lists.

-    :return: List of letters, non_letter and separators.
-    """
+        :return: Lists of letters, non_letters and separators.
+        """

-    # unicode categories:      Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
-    # letter:                  Lu Ll Lt Lm Lo Nl
-    # non-letter-indent-part:
-    #   digit:                 Nd
-    #   punctuation mark:      Mn Mc
-    #   connector punctuation: Pc
-    # separators:              Zs
-    letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
-    non_letter_category = ["Nd", "Mn", "Mc", "Pc"]
-    separator_category = ["Zs"]
+        range_start_id = 0

-    letters = []
-    non_letters = []
-    separators = []
+        with open(unicode_data_file) as unicode_data:
+            for line in csv.reader(unicode_data, delimiter=';'):
+                unicode_id = int(line[0], 16)

-    with open(unicode_data_file) as unicode_data:
-        for line in csv.reader(unicode_data, delimiter=';'):
-            unicode_id = int(line[0], 16)
+                # Skip supplementary planes and ascii chars
+                if unicode_id >= 0x10000 or unicode_id < 128:
+                    continue

-            # Skip supplementary planes and ascii chars
-            if unicode_id >= 0x10000 or unicode_id < 128:
-                continue
+                category = line[2]

-            category = line[2]
+                if range_start_id != 0:
+                    while range_start_id <= unicode_id:
+                        self._store_by_category(range_start_id, category)
+                        range_start_id += 1
+                    range_start_id = 0
+                    continue

-            if category in letter_category:
-                letters.append(unicode_id)
-            elif category in non_letter_category:
-                non_letters.append(unicode_id)
-            elif category in separator_category:
-                separators.append(unicode_id)
+                if line[1].startswith('<'):
+                    # Save the start position of the range
+                    range_start_id = unicode_id

-    # This separator char is handled separatly
-    non_breaking_space = 0x00A0
-    if non_breaking_space in separators:
-        separators.remove(int(non_breaking_space))
+                self._store_by_category(unicode_id, category)

-    # These separator chars are not in the unicode data file or not in Zs category
-    mongolian_vowel_separator = 0x180E
-    medium_mathematical_space = 0x205F
-    zero_width_space = 0x200B
+        # This separator char is handled separately
+        separators = self._categories['separators']
+        non_breaking_space = 0x00A0
+        if non_breaking_space in separators:
+            separators.remove(int(non_breaking_space))

-    if mongolian_vowel_separator not in separators:
-        bisect.insort(separators, int(mongolian_vowel_separator))
-    if medium_mathematical_space not in separators:
-        bisect.insort(separators, int(medium_mathematical_space))
-    if zero_width_space not in separators:
-        bisect.insort(separators, int(zero_width_space))
+        # These separator chars are either not in the unicode data file or not in the Zs category
+        mongolian_vowel_separator = 0x180E
+        medium_mathematical_space = 0x205F
+        zero_width_space = 0x200B

-    return letters, non_letters, separators
+        if mongolian_vowel_separator not in separators:
+            bisect.insort(separators, int(mongolian_vowel_separator))
+        if medium_mathematical_space not in separators:
+            bisect.insort(separators, int(medium_mathematical_space))
+        if zero_width_space not in separators:
+            bisect.insort(separators, int(zero_width_space))
+
+        return self._categories['letters'], self._categories['non_letters'], self._categories['separators']


 def group_ranges(i):
@@ -159,7 +179,8 @@ def split_list(category_list):


 def generate_ranges(script_args):
-    letters, non_letters, separators = read_categories(script_args.unicode_data)
+    categorizer = UnicodeCategorizer()
+    letters, non_letters, separators = categorizer.read_categories(script_args.unicode_data)

    letter_tables = split_list(list(group_ranges(letters)))
    non_letter_tables = split_list(list(group_ranges(non_letters)))