From 2b8c42869417dac8795fe7575557af71d9d9008a Mon Sep 17 00:00:00 2001
From: Robert Fancsik <frobert@inf.u-szeged.hu>
Date: Fri, 5 Jul 2019 15:44:56 +0200
Subject: [PATCH] Improve gen-unicode.py to support unicode ranges (#2944)

This patch fixes #2936

Co-authored-by: Gabor Loki loki@inf.u-szeged.hu
JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik frobert@inf.u-szeged.hu
---
 jerry-core/lit/lit-unicode-ranges.inc.h   |  66 ++++++++----
 tests/jerry/regression-test-issue-2936.js |  16 +++
 tools/gen-unicode.py                      | 117 +++++++++++++---------
 3 files changed, 133 insertions(+), 66 deletions(-)
 create mode 100644 tests/jerry/regression-test-issue-2936.js

diff --git a/jerry-core/lit/lit-unicode-ranges.inc.h b/jerry-core/lit/lit-unicode-ranges.inc.h
index b95b3ad60..1749feaf6 100644
--- a/jerry-core/lit/lit-unicode-ranges.inc.h
+++ b/jerry-core/lit/lit-unicode-ranges.inc.h
@@ -47,14 +47,29 @@ static const uint16_t lit_unicode_letter_interval_sps[] JERRY_ATTR_CONST_DATA =
   0x2145, 0x2160, 0x2c00, 0x2c30, 0x2c60, 0x2ceb, 0x2cf2, 0x2d00, 0x2d30, 0x2d80,
   0x2da0, 0x2da8, 0x2db0, 0x2db8, 0x2dc0, 0x2dc8, 0x2dd0, 0x2dd8, 0x3005, 0x3021,
   0x3031, 0x3038, 0x3041, 0x309d, 0x30a1, 0x30fc, 0x3105, 0x3131, 0x31a0, 0x31f0,
-  0xa000, 0xa100, 0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a,
-  0xa640, 0xa67f, 0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807,
-  0xa80c, 0xa840, 0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6,
-  0xa9fa, 0xaa00, 0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0,
-  0xaaf2, 0xab01, 0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xd7b0,
-  0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a, 0xfb38, 0xfb40,
-  0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70, 0xfe76, 0xff21,
-  0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
+  0x3400, 0x3500, 0x3600, 0x3700, 0x3800, 0x3900, 0x3a00, 0x3b00, 0x3c00, 0x3d00,
+  0x3e00, 0x3f00, 0x4000, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700,
+  0x4800, 0x4900, 0x4a00, 0x4b00, 0x4c00, 0x4d00, 0x4e00, 0x4f00, 0x5000, 0x5100,
+  0x5200, 0x5300, 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5a00, 0x5b00,
+  0x5c00, 0x5d00, 0x5e00, 0x5f00, 0x6000, 0x6100, 0x6200, 0x6300, 0x6400, 0x6500,
+  0x6600, 0x6700, 0x6800, 0x6900, 0x6a00, 0x6b00, 0x6c00, 0x6d00, 0x6e00, 0x6f00,
+  0x7000, 0x7100, 0x7200, 0x7300, 0x7400, 0x7500, 0x7600, 0x7700, 0x7800, 0x7900,
+  0x7a00, 0x7b00, 0x7c00, 0x7d00, 0x7e00, 0x7f00, 0x8000, 0x8100, 0x8200, 0x8300,
+  0x8400, 0x8500, 0x8600, 0x8700, 0x8800, 0x8900, 0x8a00, 0x8b00, 0x8c00, 0x8d00,
+  0x8e00, 0x8f00, 0x9000, 0x9100, 0x9200, 0x9300, 0x9400, 0x9500, 0x9600, 0x9700,
+  0x9800, 0x9900, 0x9a00, 0x9b00, 0x9c00, 0x9d00, 0x9e00, 0x9f00, 0xa000, 0xa100,
+  0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a, 0xa640, 0xa67f,
+  0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807, 0xa80c, 0xa840,
+  0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6, 0xa9fa, 0xaa00,
+  0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0, 0xaaf2, 0xab01,
+  0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xac00, 0xad00, 0xae00,
+  0xaf00, 0xb000, 0xb100, 0xb200, 0xb300, 0xb400, 0xb500, 0xb600, 0xb700, 0xb800,
+  0xb900, 0xba00, 0xbb00, 0xbc00, 0xbd00, 0xbe00, 0xbf00, 0xc000, 0xc100, 0xc200,
+  0xc300, 0xc400, 0xc500, 0xc600, 0xc700, 0xc800, 0xc900, 0xca00, 0xcb00, 0xcc00,
+  0xcd00, 0xce00, 0xcf00, 0xd000, 0xd100, 0xd200, 0xd300, 0xd400, 0xd500, 0xd600,
+  0xd700, 0xd7b0, 0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a,
+  0xfb38, 0xfb40, 0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70,
+  0xfe76, 0xff21, 0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
 };
 
 /**
@@ -88,14 +103,29 @@ static const uint8_t lit_unicode_letter_interval_lengths[] JERRY_ATTR_CONST_DATA
   0x0004, 0x0028, 0x002e, 0x002e, 0x0084, 0x0003, 0x0001, 0x0025, 0x0037, 0x0016,
   0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0002, 0x0008,
   0x0004, 0x0004, 0x0055, 0x0002, 0x0059, 0x0003, 0x0028, 0x005d, 0x001a, 0x000f,
-  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001,
-  0x002e, 0x001e, 0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003,
-  0x0016, 0x0033, 0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009,
-  0x0004, 0x0028, 0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a,
-  0x0002, 0x0005, 0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x0016,
-  0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c, 0x0004, 0x0001,
-  0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004, 0x0086, 0x0019,
-  0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00b5, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00d5, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001, 0x002e, 0x001e,
+  0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003, 0x0016, 0x0033,
+  0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009, 0x0004, 0x0028,
+  0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a, 0x0002, 0x0005,
+  0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00a3, 0x0016, 0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c,
+  0x0004, 0x0001, 0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004,
+  0x0086, 0x0019, 0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
 };
 
 /**
@@ -114,8 +144,8 @@ static const uint16_t lit_unicode_letter_chars[] JERRY_ATTR_CONST_DATA =
   0x0e8d, 0x0ea5, 0x0ea7, 0x0ebd, 0x0ec6, 0x0f00, 0x103f, 0x1061, 0x108e, 0x10c7,
   0x10cd, 0x1258, 0x12c0, 0x17d7, 0x17dc, 0x18aa, 0x1aa7, 0x1f59, 0x1f5b, 0x1f5d,
   0x1fbe, 0x2071, 0x207f, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x214e,
-  0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4db5, 0x4e00, 0x9fd5, 0xa8fb, 0xa8fd,
-  0xa9cf, 0xaa7a, 0xaab1, 0xaac0, 0xaac2, 0xac00, 0xd7a3, 0xfb1d, 0xfb3e
+  0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4e00, 0xa8fb, 0xa8fd, 0xa9cf, 0xaa7a,
+  0xaab1, 0xaac0, 0xaac2, 0xac00, 0xfb1d, 0xfb3e
 };
 
 /**
diff --git a/tests/jerry/regression-test-issue-2936.js b/tests/jerry/regression-test-issue-2936.js
new file mode 100644
index 000000000..e9e588fbf
--- /dev/null
+++ b/tests/jerry/regression-test-issue-2936.js
@@ -0,0 +1,16 @@
+// Copyright JS Foundation and other contributors, http://js.foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+var 测试 = "您好";
+assert(测试 === "您好");
diff --git a/tools/gen-unicode.py b/tools/gen-unicode.py
index 283c79ba4..34f8ace96 100755
--- a/tools/gen-unicode.py
+++ b/tools/gen-unicode.py
@@ -57,67 +57,87 @@ class UniCodeSource(object):
             generated_source.write("\n".join(self.__header))
             generated_source.write("\n".join(self.__data))
 
+class UnicodeCategorizer(object):
+    def __init__(self):
+        # unicode categories:      Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs
+        #                          Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
+        # letter:                  Lu Ll Lt Lm Lo Nl
+        # non-letter-ident-part:
+        #   digit:                 Nd
+        #   combining mark:        Mn Mc
+        #   connector punctuation: Pc
+        # separators:              Zs
+        self._unicode_categories = {
+            'letters_category' : ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"],
+            'non_letters_category' : ["Nd", "Mn", "Mc", "Pc"],
+            'separators_category' : ["Zs"]
+        }
 
-# functions for unicode ranges
+        self._categories = {
+            'letters' : [],
+            'non_letters' : [],
+            'separators' : []
+        }
 
+    def _store_by_category(self, unicode_id, category):
+        """
+        Append unicode_id to each list in self._categories whose '<name>_category' set contains category
+        """
+        for target_category in self._categories:
+            if category in self._unicode_categories[target_category + '_category']:
+                self._categories[target_category].append(unicode_id)
 
-def read_categories(unicode_data_file):
-    """
-    Read the corresponding unicode values and store them in category lists.
+    def read_categories(self, unicode_data_file):
+        """
+        Read the corresponding unicode values and store them in category lists.
 
-    :return: List of letters, non_letter and separators.
-    """
+        :return: List of letters, non_letter and separators.
+        """
 
-    # unicode categories:      Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
-    # letter:                  Lu Ll Lt Lm Lo Nl
-    # non-letter-indent-part:
-    #   digit:                 Nd
-    #   punctuation mark:      Mn Mc
-    #   connector punctuation: Pc
-    # separators:              Zs
-    letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
-    non_letter_category = ["Nd", "Mn", "Mc", "Pc"]
-    separator_category = ["Zs"]
+        range_start_id = 0
 
-    letters = []
-    non_letters = []
-    separators = []
+        with open(unicode_data_file) as unicode_data:
+            for line in csv.reader(unicode_data, delimiter=';'):
+                unicode_id = int(line[0], 16)
 
-    with open(unicode_data_file) as unicode_data:
-        for line in csv.reader(unicode_data, delimiter=';'):
-            unicode_id = int(line[0], 16)
+                # Skip supplementary planes and ascii chars
+                if unicode_id >= 0x10000 or unicode_id < 128:
+                    continue
 
-            # Skip supplementary planes and ascii chars
-            if unicode_id >= 0x10000 or unicode_id < 128:
-                continue
+                category = line[2]
 
-            category = line[2]
+                if range_start_id != 0:
+                    while range_start_id <= unicode_id:
+                        self._store_by_category(range_start_id, category)
+                        range_start_id += 1
+                    range_start_id = 0
+                    continue
 
-            if category in letter_category:
-                letters.append(unicode_id)
-            elif category in non_letter_category:
-                non_letters.append(unicode_id)
-            elif category in separator_category:
-                separators.append(unicode_id)
+                if line[1].endswith(', First>'):
+                    # Range start: this id is stored below, so the expansion resumes at the next id
+                    range_start_id = unicode_id + 1
 
-    # This separator char is handled separatly
-    non_breaking_space = 0x00A0
-    if non_breaking_space in separators:
-        separators.remove(int(non_breaking_space))
+                self._store_by_category(unicode_id, category)
 
-    # These separator chars are not in the unicode data file or not in Zs category
-    mongolian_vowel_separator = 0x180E
-    medium_mathematical_space = 0x205F
-    zero_width_space = 0x200B
+        # This separator char is handled separately
+        separators = self._categories['separators']
+        non_breaking_space = 0x00A0
+        if non_breaking_space in separators:
+            separators.remove(int(non_breaking_space))
 
-    if mongolian_vowel_separator not in separators:
-        bisect.insort(separators, int(mongolian_vowel_separator))
-    if medium_mathematical_space not in separators:
-        bisect.insort(separators, int(medium_mathematical_space))
-    if zero_width_space not in separators:
-        bisect.insort(separators, int(zero_width_space))
+        # These separator chars are not in the unicode data file or not in Zs category
+        mongolian_vowel_separator = 0x180E
+        medium_mathematical_space = 0x205F
+        zero_width_space = 0x200B
 
-    return letters, non_letters, separators
+        if mongolian_vowel_separator not in separators:
+            bisect.insort(separators, int(mongolian_vowel_separator))
+        if medium_mathematical_space not in separators:
+            bisect.insort(separators, int(medium_mathematical_space))
+        if zero_width_space not in separators:
+            bisect.insort(separators, int(zero_width_space))
+
+        return self._categories['letters'], self._categories['non_letters'], self._categories['separators']
 
 
 def group_ranges(i):
@@ -159,7 +179,8 @@ def split_list(category_list):
 
 
 def generate_ranges(script_args):
-    letters, non_letters, separators = read_categories(script_args.unicode_data)
+    categorizer = UnicodeCategorizer()
+    letters, non_letters, separators = categorizer.read_categories(script_args.unicode_data)
 
     letter_tables = split_list(list(group_ranges(letters)))
     non_letter_tables = split_list(list(group_ranges(non_letters)))