From 321215fdbb6a3074fe0b508e5dd256a1d8de0c6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?D=C3=A1niel=20B=C3=A1tyai?= <daniel.batyai@h-lab.eu>
Date: Mon, 20 Jul 2020 15:51:43 +0200
Subject: [PATCH] Update RegExp unicode mode case folding to conform to the
 standard (#4004)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai daniel.batyai@h-lab.eu
---
 .../ecma/operations/ecma-regexp-object.c      |  37 +++--
 jerry-core/lit/lit-char-helpers.c             |  51 +++++++
 jerry-core/lit/lit-char-helpers.h             |   5 +
 jerry-core/lit/lit-unicode-conversions.inc.h  |  69 ++++-----
 jerry-core/lit/lit-unicode-folding.inc.h      |  65 +++++++++
 tests/jerry/es.next/regexp-unicode.js         |   5 +
 tests/test262-es6-excludelist.xml             |   1 -
 tools/gen-unicode.py                          | 133 +++++++++++++-----
 tools/pylint/pylintrc                         |   2 +-
 9 files changed, 284 insertions(+), 84 deletions(-)
 create mode 100644 jerry-core/lit/lit-unicode-folding.inc.h

diff --git a/jerry-core/ecma/operations/ecma-regexp-object.c b/jerry-core/ecma/operations/ecma-regexp-object.c
index 49c5dfc93..aa8c4c09a 100644
--- a/jerry-core/ecma/operations/ecma-regexp-object.c
+++ b/jerry-core/ecma/operations/ecma-regexp-object.c
@@ -403,30 +403,43 @@ lit_code_point_t
 ecma_regexp_canonicalize_char (lit_code_point_t ch, /**< character */
                                bool unicode) /**< unicode */
 {
-  if (JERRY_LIKELY (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX))
+#if ENABLED (JERRY_ESNEXT)
+  if (unicode)
   {
-    if (ch >= LIT_CHAR_LOWERCASE_A && ch <= LIT_CHAR_LOWERCASE_Z)
+    /* In unicode mode the mappings contained in the CaseFolding.txt file should be used to canonicalize the character.
+     * These mappings generally correspond to the lowercase variant of the character, however there are some
+     * differences. In some cases the uppercase variant is used, in others the lowercase of the uppercase character is
+     * used, and there are also cases where the character has no case folding mapping even though it has upper/lower
+     * variants. Since lowercasing is the most common this is used as the default behaviour, and characters with
+     * differing behaviours are encoded in lookup tables. */
+
+    if (lit_char_fold_to_upper (ch))
     {
-      return (ecma_char_t) (ch - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
+      ch = lit_char_to_upper_case (ch, NULL);
+      JERRY_ASSERT (ch != LIT_MULTIPLE_CU);
+    }
+
+    if (lit_char_fold_to_lower (ch))
+    {
+      ch = lit_char_to_lower_case (ch, NULL);
+      JERRY_ASSERT (ch != LIT_MULTIPLE_CU);
     }
 
     return ch;
   }
+#endif /* ENABLED (JERRY_ESNEXT) */
 
+  JERRY_UNUSED (unicode);
   lit_code_point_t cu = lit_char_to_upper_case (ch, NULL);
 
-  if (cu == LIT_MULTIPLE_CU)
+  if (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX
+      || (cu > LIT_UTF8_1_BYTE_CODE_POINT_MAX
+          && cu != LIT_MULTIPLE_CU))
   {
-    return ch;
+    return cu;
   }
 
-  if (cu <= LIT_UTF8_1_BYTE_CODE_POINT_MAX && !unicode)
-  {
-    /* 6. */
-    return ch;
-  }
-
-  return cu;
+  return ch;
 } /* ecma_regexp_canonicalize_char */
 
 /**
diff --git a/jerry-core/lit/lit-char-helpers.c b/jerry-core/lit/lit-char-helpers.c
index 9e808827b..8474b8777 100644
--- a/jerry-core/lit/lit-char-helpers.c
+++ b/jerry-core/lit/lit-char-helpers.c
@@ -23,6 +23,9 @@
 #if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
 #include "lit-unicode-conversions.inc.h"
 #include "lit-unicode-conversions-sup.inc.h"
+#if ENABLED (JERRY_ESNEXT)
+#include "lit-unicode-folding.inc.h"
+#endif /* ENABLED (JERRY_ESNEXT) */
 #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
 
 #define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof ((array)[0]))
@@ -914,3 +917,51 @@ lit_char_to_upper_case (lit_code_point_t cp, /**< code point */
   return cp;
 #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
 } /* lit_char_to_upper_case */
+
+#if ENABLED (JERRY_ESNEXT)
+/**
+ * Look up whether the character should be folded to the lowercase variant.
+ *
+ * @return true, if character should be lowercased (always for code points above LIT_UTF16_CODE_UNIT_MAX)
+ *         false, otherwise
+ */
+bool
+lit_char_fold_to_lower (lit_code_point_t cp) /**< code point */
+{
+#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
+  return (cp > LIT_UTF16_CODE_UNIT_MAX
+          || (!lit_search_char_in_interval_array ((ecma_char_t) cp,
+                                                  lit_unicode_folding_skip_to_lower_interval_starts,
+                                                  lit_unicode_folding_skip_to_lower_interval_lengths,
+                                                  NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_interval_starts))
+              && !lit_search_char_in_array ((ecma_char_t) cp,
+                                            lit_unicode_folding_skip_to_lower_chars,
+                                            NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_chars))));
+#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
+  return true;
+#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
+} /* lit_char_fold_to_lower */
+
+/**
+ * Look up whether the character should be folded to the uppercase variant.
+ *
+ * @return true, if character should be uppercased (never for code points above LIT_UTF16_CODE_UNIT_MAX)
+ *         false, otherwise
+ */
+bool
+lit_char_fold_to_upper (lit_code_point_t cp) /**< code point */
+{
+#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
+  return (cp <= LIT_UTF16_CODE_UNIT_MAX
+          && (lit_search_char_in_interval_array ((ecma_char_t) cp,
+                                                  lit_unicode_folding_to_upper_interval_starts,
+                                                  lit_unicode_folding_to_upper_interval_lengths,
+                                                  NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_interval_starts))
+              || lit_search_char_in_array ((ecma_char_t) cp,
+                                           lit_unicode_folding_to_upper_chars,
+                                           NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_chars))));
+#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
+  return false;
+#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
+} /* lit_char_fold_to_upper */
+#endif /* ENABLED (JERRY_ESNEXT) */
diff --git a/jerry-core/lit/lit-char-helpers.h b/jerry-core/lit/lit-char-helpers.h
index 49eae214d..2a39f22de 100644
--- a/jerry-core/lit/lit-char-helpers.h
+++ b/jerry-core/lit/lit-char-helpers.h
@@ -248,4 +248,9 @@ bool lit_char_is_word_char (lit_code_point_t c);
 lit_code_point_t lit_char_to_lower_case (lit_code_point_t cp, ecma_stringbuilder_t *builder_p);
 lit_code_point_t lit_char_to_upper_case (lit_code_point_t cp, ecma_stringbuilder_t *builder_p);
 
+#if ENABLED (JERRY_ESNEXT)
+bool lit_char_fold_to_lower (lit_code_point_t cp);
+bool lit_char_fold_to_upper (lit_code_point_t cp);
+#endif /* ENABLED (JERRY_ESNEXT) */
+
 #endif /* !LIT_CHAR_HELPERS_H */
diff --git a/jerry-core/lit/lit-unicode-conversions.inc.h b/jerry-core/lit/lit-unicode-conversions.inc.h
index bf4287d9a..b100576a1 100644
--- a/jerry-core/lit/lit-unicode-conversions.inc.h
+++ b/jerry-core/lit/lit-unicode-conversions.inc.h
@@ -96,68 +96,61 @@ static const uint8_t lit_unicode_upper_case_special_range_lengths[] JERRY_ATTR_C
 /* Contains start points of lowercase ranges. */
 static const uint16_t lit_unicode_lower_case_ranges[] JERRY_ATTR_CONST_DATA =
 {
-  0x1e96, 0x1e96, 0x1f80, 0x1f80, 0x1f88, 0x1f80, 0x1f90, 0x1f90, 0x1f98, 0x1f90,
-  0x1fa0, 0x1fa0, 0x1fa8, 0x1fa0, 0x1fb2, 0x1fb2, 0x1fb6, 0x1fb6, 0x1fc2, 0x1fc2,
-  0x1fc6, 0x1fc6, 0x1fd2, 0x1fd2, 0x1fd6, 0x1fd6, 0x1fe2, 0x1fe2, 0x1fe6, 0x1fe6,
-  0x1ff2, 0x1ff2, 0x1ff6, 0x1ff6, 0xfb00, 0xfb00, 0xfb13, 0xfb13
+  0x1f88, 0x1f80, 0x1f98, 0x1f90, 0x1fa8, 0x1fa0
 };
 
 /* Interval lengths for start points in `lower_case_ranges` table. */
 static const uint8_t lit_unicode_lower_case_range_lengths[] JERRY_ATTR_CONST_DATA =
 {
-  0x0005, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0003, 0x0002, 0x0003,
-  0x0002, 0x0002, 0x0002, 0x0003, 0x0002, 0x0003, 0x0002, 0x0007, 0x0005
+  0x0008, 0x0008, 0x0008
 };
 
 /* The remaining lowercase conversions. The lowercase variant can be one-to-three character long. */
 static const uint16_t lit_unicode_lower_case_conversions[] JERRY_ATTR_CONST_DATA =
 {
-  0x00df, 0x00df, 0x0149, 0x0149, 0x01c5, 0x01c6, 0x01c8, 0x01c9, 0x01cb, 0x01cc,
-  0x01f0, 0x01f0, 0x01f2, 0x01f3, 0x0390, 0x0390, 0x03b0, 0x03b0, 0x03f4, 0x03b8,
-  0x0587, 0x0587, 0x1e9e, 0x00df, 0x1f50, 0x1f50, 0x1f52, 0x1f52, 0x1f54, 0x1f54,
-  0x1f56, 0x1f56, 0x1fbc, 0x1fb3, 0x1fcc, 0x1fc3, 0x1ffc, 0x1ff3, 0x2126, 0x03c9,
+  0x01c5, 0x01c6, 0x01c8, 0x01c9, 0x01cb, 0x01cc, 0x01f2, 0x01f3, 0x03f4, 0x03b8,
+  0x1e9e, 0x00df, 0x1fbc, 0x1fb3, 0x1fcc, 0x1fc3, 0x1ffc, 0x1ff3, 0x2126, 0x03c9,
   0x212a, 0x006b, 0x212b, 0x00e5, 0x0130, 0x0069, 0x0307
 };
 
 /* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */
 static const uint8_t lit_unicode_lower_case_conversion_counters[] JERRY_ATTR_CONST_DATA =
 {
-  0x0016, 0x0001, 0x0000
+  0x000c, 0x0001, 0x0000
 };
 
 /* The remaining uppercase conversions. The uppercase variant can be one-to-three character long. */
 static const uint16_t lit_unicode_upper_case_conversions[] JERRY_ATTR_CONST_DATA =
 {
-  0x00b5, 0x039c, 0x0130, 0x0130, 0x0131, 0x0049, 0x017f, 0x0053, 0x01c5, 0x01c4,
-  0x01c8, 0x01c7, 0x01cb, 0x01ca, 0x01f2, 0x01f1, 0x0345, 0x0399, 0x03c2, 0x03a3,
-  0x03d0, 0x0392, 0x03d1, 0x0398, 0x03d5, 0x03a6, 0x03d6, 0x03a0, 0x03f0, 0x039a,
-  0x03f1, 0x03a1, 0x03f5, 0x0395, 0x1c80, 0x0412, 0x1c81, 0x0414, 0x1c82, 0x041e,
-  0x1c83, 0x0421, 0x1c84, 0x0422, 0x1c85, 0x0422, 0x1c86, 0x042a, 0x1c87, 0x0462,
-  0x1c88, 0xa64a, 0x1e9b, 0x1e60, 0x1fbe, 0x0399, 0x00df, 0x0053, 0x0053, 0x0149,
-  0x02bc, 0x004e, 0x01f0, 0x004a, 0x030c, 0x0587, 0x0535, 0x0552, 0x1e96, 0x0048,
-  0x0331, 0x1e97, 0x0054, 0x0308, 0x1e98, 0x0057, 0x030a, 0x1e99, 0x0059, 0x030a,
-  0x1e9a, 0x0041, 0x02be, 0x1f50, 0x03a5, 0x0313, 0x1f87, 0x1f0f, 0x0399, 0x1f8f,
-  0x1f0f, 0x0399, 0x1f97, 0x1f2f, 0x0399, 0x1f9f, 0x1f2f, 0x0399, 0x1fa7, 0x1f6f,
-  0x0399, 0x1faf, 0x1f6f, 0x0399, 0x1fb2, 0x1fba, 0x0399, 0x1fb3, 0x0391, 0x0399,
-  0x1fb4, 0x0386, 0x0399, 0x1fb6, 0x0391, 0x0342, 0x1fbc, 0x0391, 0x0399, 0x1fc2,
-  0x1fca, 0x0399, 0x1fc3, 0x0397, 0x0399, 0x1fc4, 0x0389, 0x0399, 0x1fc6, 0x0397,
-  0x0342, 0x1fcc, 0x0397, 0x0399, 0x1fd6, 0x0399, 0x0342, 0x1fe4, 0x03a1, 0x0313,
-  0x1fe6, 0x03a5, 0x0342, 0x1ff2, 0x1ffa, 0x0399, 0x1ff3, 0x03a9, 0x0399, 0x1ff4,
-  0x038f, 0x0399, 0x1ff6, 0x03a9, 0x0342, 0x1ffc, 0x03a9, 0x0399, 0xfb00, 0x0046,
-  0x0046, 0xfb01, 0x0046, 0x0049, 0xfb02, 0x0046, 0x004c, 0xfb05, 0x0053, 0x0054,
-  0xfb06, 0x0053, 0x0054, 0xfb13, 0x0544, 0x0546, 0xfb14, 0x0544, 0x0535, 0xfb15,
-  0x0544, 0x053b, 0xfb16, 0x054e, 0x0546, 0xfb17, 0x0544, 0x053d, 0x0390, 0x0399,
-  0x0308, 0x0301, 0x03b0, 0x03a5, 0x0308, 0x0301, 0x1f52, 0x03a5, 0x0313, 0x0300,
-  0x1f54, 0x03a5, 0x0313, 0x0301, 0x1f56, 0x03a5, 0x0313, 0x0342, 0x1fb7, 0x0391,
-  0x0342, 0x0399, 0x1fc7, 0x0397, 0x0342, 0x0399, 0x1fd2, 0x0399, 0x0308, 0x0300,
-  0x1fd3, 0x0399, 0x0308, 0x0301, 0x1fd7, 0x0399, 0x0308, 0x0342, 0x1fe2, 0x03a5,
-  0x0308, 0x0300, 0x1fe3, 0x03a5, 0x0308, 0x0301, 0x1fe7, 0x03a5, 0x0308, 0x0342,
-  0x1ff7, 0x03a9, 0x0342, 0x0399, 0xfb03, 0x0046, 0x0046, 0x0049, 0xfb04, 0x0046,
-  0x0046, 0x004c
+  0x00b5, 0x039c, 0x0131, 0x0049, 0x017f, 0x0053, 0x01c5, 0x01c4, 0x01c8, 0x01c7,
+  0x01cb, 0x01ca, 0x01f2, 0x01f1, 0x0345, 0x0399, 0x03c2, 0x03a3, 0x03d0, 0x0392,
+  0x03d1, 0x0398, 0x03d5, 0x03a6, 0x03d6, 0x03a0, 0x03f0, 0x039a, 0x03f1, 0x03a1,
+  0x03f5, 0x0395, 0x1c80, 0x0412, 0x1c81, 0x0414, 0x1c82, 0x041e, 0x1c83, 0x0421,
+  0x1c84, 0x0422, 0x1c85, 0x0422, 0x1c86, 0x042a, 0x1c87, 0x0462, 0x1c88, 0xa64a,
+  0x1e9b, 0x1e60, 0x1fbe, 0x0399, 0x00df, 0x0053, 0x0053, 0x0149, 0x02bc, 0x004e,
+  0x01f0, 0x004a, 0x030c, 0x0587, 0x0535, 0x0552, 0x1e96, 0x0048, 0x0331, 0x1e97,
+  0x0054, 0x0308, 0x1e98, 0x0057, 0x030a, 0x1e99, 0x0059, 0x030a, 0x1e9a, 0x0041,
+  0x02be, 0x1f50, 0x03a5, 0x0313, 0x1f87, 0x1f0f, 0x0399, 0x1f8f, 0x1f0f, 0x0399,
+  0x1f97, 0x1f2f, 0x0399, 0x1f9f, 0x1f2f, 0x0399, 0x1fa7, 0x1f6f, 0x0399, 0x1faf,
+  0x1f6f, 0x0399, 0x1fb2, 0x1fba, 0x0399, 0x1fb3, 0x0391, 0x0399, 0x1fb4, 0x0386,
+  0x0399, 0x1fb6, 0x0391, 0x0342, 0x1fbc, 0x0391, 0x0399, 0x1fc2, 0x1fca, 0x0399,
+  0x1fc3, 0x0397, 0x0399, 0x1fc4, 0x0389, 0x0399, 0x1fc6, 0x0397, 0x0342, 0x1fcc,
+  0x0397, 0x0399, 0x1fd6, 0x0399, 0x0342, 0x1fe4, 0x03a1, 0x0313, 0x1fe6, 0x03a5,
+  0x0342, 0x1ff2, 0x1ffa, 0x0399, 0x1ff3, 0x03a9, 0x0399, 0x1ff4, 0x038f, 0x0399,
+  0x1ff6, 0x03a9, 0x0342, 0x1ffc, 0x03a9, 0x0399, 0xfb00, 0x0046, 0x0046, 0xfb01,
+  0x0046, 0x0049, 0xfb02, 0x0046, 0x004c, 0xfb05, 0x0053, 0x0054, 0xfb06, 0x0053,
+  0x0054, 0xfb13, 0x0544, 0x0546, 0xfb14, 0x0544, 0x0535, 0xfb15, 0x0544, 0x053b,
+  0xfb16, 0x054e, 0x0546, 0xfb17, 0x0544, 0x053d, 0x0390, 0x0399, 0x0308, 0x0301,
+  0x03b0, 0x03a5, 0x0308, 0x0301, 0x1f52, 0x03a5, 0x0313, 0x0300, 0x1f54, 0x03a5,
+  0x0313, 0x0301, 0x1f56, 0x03a5, 0x0313, 0x0342, 0x1fb7, 0x0391, 0x0342, 0x0399,
+  0x1fc7, 0x0397, 0x0342, 0x0399, 0x1fd2, 0x0399, 0x0308, 0x0300, 0x1fd3, 0x0399,
+  0x0308, 0x0301, 0x1fd7, 0x0399, 0x0308, 0x0342, 0x1fe2, 0x03a5, 0x0308, 0x0300,
+  0x1fe3, 0x03a5, 0x0308, 0x0301, 0x1fe7, 0x03a5, 0x0308, 0x0342, 0x1ff7, 0x03a9,
+  0x0342, 0x0399, 0xfb03, 0x0046, 0x0046, 0x0049, 0xfb04, 0x0046, 0x0046, 0x004c
 };
 
 /* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. */
 static const uint8_t lit_unicode_upper_case_conversion_counters[] JERRY_ATTR_CONST_DATA =
 {
-  0x001c, 0x002c, 0x0010
+  0x001b, 0x002c, 0x0010
 };
diff --git a/jerry-core/lit/lit-unicode-folding.inc.h b/jerry-core/lit/lit-unicode-folding.inc.h
new file mode 100644
index 000000000..5c4965b0d
--- /dev/null
+++ b/jerry-core/lit/lit-unicode-folding.inc.h
@@ -0,0 +1,65 @@
+/* Copyright JS Foundation and other contributors, http://js.foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This file is automatically generated by the gen-unicode.py script
+ * from the CaseFolding.txt file. Do not edit! */
+
+/**
+ * Character interval starting points for folding_skip_to_lower.
+ */
+static const uint16_t lit_unicode_folding_skip_to_lower_interval_starts[] JERRY_ATTR_CONST_DATA =
+{
+  0x13a0, 0x13f8, 0xab70
+};
+
+/**
+ * Character interval lengths for folding_skip_to_lower.
+ */
+static const uint8_t lit_unicode_folding_skip_to_lower_interval_lengths[] JERRY_ATTR_CONST_DATA =
+{
+  0x0055, 0x0005, 0x004f
+};
+
+/**
+ * Non-interval characters for folding_skip_to_lower.
+ */
+static const uint16_t lit_unicode_folding_skip_to_lower_chars[] JERRY_ATTR_CONST_DATA =
+{
+  0x0130
+};
+
+/**
+ * Character interval starting points for folding_to_upper.
+ */
+static const uint16_t lit_unicode_folding_to_upper_interval_starts[] JERRY_ATTR_CONST_DATA =
+{
+  0x03d0, 0x03d5, 0x03f0, 0x13f8, 0x1c80, 0xab70
+};
+
+/**
+ * Character interval lengths for folding_to_upper.
+ */
+static const uint8_t lit_unicode_folding_to_upper_interval_lengths[] JERRY_ATTR_CONST_DATA =
+{
+  0x0001, 0x0001, 0x0001, 0x0005, 0x0008, 0x004f
+};
+
+/**
+ * Non-interval characters for folding_to_upper.
+ */
+static const uint16_t lit_unicode_folding_to_upper_chars[] JERRY_ATTR_CONST_DATA =
+{
+  0x00b5, 0x017f, 0x0345, 0x03c2, 0x03f5, 0x1e9b, 0x1fbe
+};
diff --git a/tests/jerry/es.next/regexp-unicode.js b/tests/jerry/es.next/regexp-unicode.js
index 60ac33e83..58f6e60b8 100644
--- a/tests/jerry/es.next/regexp-unicode.js
+++ b/tests/jerry/es.next/regexp-unicode.js
@@ -359,3 +359,8 @@ try {
 } catch (e) {
   assert (e instanceof SyntaxError);
 }
+
+assert(/\w/iu.test("ſ"));
+assert(/\w/iu.test("\u212a"));
+assert(/k/iu.test("\u212a"));
+assert(/\u{10c90}/iu.test("\u{10cd0}"));
diff --git a/tests/test262-es6-excludelist.xml b/tests/test262-es6-excludelist.xml
index 6b2de92b8..4df24f868 100644
--- a/tests/test262-es6-excludelist.xml
+++ b/tests/test262-es6-excludelist.xml
@@ -338,7 +338,6 @@
   <test id="language/expressions/tagged-template/cache-identical-source-new-function.js"><reason></reason></test>
   <test id="language/expressions/tagged-template/constructor-invocation.js"><reason></reason></test>
   <test id="language/expressions/template-literal/invalid-legacy-octal-escape-sequence.js"><reason></reason></test>
-  <test id="language/literals/regexp/u-case-mapping.js"><reason></reason></test>
   <test id="language/literals/string/7.8.4-1-s.js"><reason></reason></test>
   <test id="language/module-code/export-unresolvable.js"><reason></reason></test>
   <test id="language/statements/class/definition/methods.js"><reason></reason></test>
diff --git a/tools/gen-unicode.py b/tools/gen-unicode.py
index 804c0ff73..884830642 100755
--- a/tools/gen-unicode.py
+++ b/tools/gen-unicode.py
@@ -27,10 +27,18 @@ from gen_c_source import LICENSE, format_code
 from settings import PROJECT_DIR
 
 
+UNICODE_DATA_FILE = 'UnicodeData.txt'
+SPECIAL_CASING_FILE = 'SpecialCasing.txt'
+DERIVED_PROPS_FILE = 'DerivedCoreProperties.txt'
+PROP_LIST_FILE = 'PropList.txt'
+CASE_FOLDING_FILE = 'CaseFolding.txt'
+
 RANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h')
 RANGES_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges-sup.inc.h')
 CONVERSIONS_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions.inc.h')
 CONVERSIONS_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions-sup.inc.h')
+FOLDING_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-folding.inc.h')
+FOLDING_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-folding-sup.inc.h')
 
 UNICODE_PLANE_TYPE_BASIC = 0
 UNICODE_PLANE_TYPE_SUPPLEMENTARY = 1
@@ -266,11 +274,14 @@ class UnicodeBasicCategorizer(object):
                 if not self.in_range(letter_id) or condition_list:
                     continue
 
+                original_letter = parse_unicode_sequence(line[0])
                 small_letter = parse_unicode_sequence(line[1])
                 capital_letter = parse_unicode_sequence(line[3])
 
-                lower_case_mapping[letter_id] = small_letter
-                upper_case_mapping[letter_id] = capital_letter
+                if small_letter != original_letter:
+                    lower_case_mapping[letter_id] = small_letter
+                if capital_letter != original_letter:
+                    upper_case_mapping[letter_id] = capital_letter
 
         return lower_case_mapping, upper_case_mapping
 
@@ -292,12 +303,13 @@ def generate_ranges(script_args, plane_type):
         categorizer = UnicodeBasicCategorizer()
 
     header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
-                         " * from %s. Do not edit! */" % os.path.basename(script_args.derived_core_properties),
+                         " * from %s. Do not edit! */" % (DERIVED_PROPS_FILE),
                          ""]
 
     c_source.complete_header("\n".join(header_completion))
 
-    units = categorizer.read_units(script_args.derived_core_properties, ["ID_Start", "ID_Continue"])
+    derived_props_path = os.path.join(script_args.unicode_dir, DERIVED_PROPS_FILE)
+    units = categorizer.read_units(derived_props_path, ["ID_Start", "ID_Continue"])
 
     units["ID_Continue"] = sorted(set(units["ID_Continue"]).union(categorizer.extra_id_continue_units)
                                   - set(units["ID_Start"]))
@@ -305,7 +317,9 @@ def generate_ranges(script_args, plane_type):
     for category, unit in units.items():
         c_source.add_range(category, categorizer.create_tables(unit))
 
-    white_space_units = categorizer.read_units(script_args.prop_list, ["White_Space"], ["Zs"])["White_Space"]
+    prop_list_path = os.path.join(script_args.unicode_dir, PROP_LIST_FILE)
+
+    white_space_units = categorizer.read_units(prop_list_path, ["White_Space"], ["Zs"])["White_Space"]
 
     c_source.add_whitepace_range("White_Space", categorizer, white_space_units)
 
@@ -314,6 +328,19 @@ def generate_ranges(script_args, plane_type):
 
 # functions for unicode conversions
 
+def make_char(hex_val):
+    """
+    Create a unicode character from a hex value
+
+    :param hex_val: Hex value of the character.
+    :return: Unicode character corresponding to the value.
+    """
+
+    try:
+        return unichr(hex_val)
+    except NameError:
+        return chr(hex_val)
+
 
 def parse_unicode_sequence(raw_data):
     """
@@ -331,10 +358,7 @@ def parse_unicode_sequence(raw_data):
 
         # Convert it to unicode code point (from hex value without 0x prefix)
         hex_val = int(unicode_char, 16)
-        try:
-            result += unichr(hex_val)
-        except NameError:
-            result += chr(hex_val)
+        result += make_char(hex_val)
 
     return result
 
@@ -637,17 +661,17 @@ def generate_conversions(script_args, plane_type):
         c_source = UnicodeBasicSource(CONVERSIONS_C_SOURCE)
         categorizer = UnicodeBasicCategorizer()
 
-    unicode_file = os.path.basename(script_args.unicode_data)
-    spec_casing_file = os.path.basename(script_args.special_casing)
-
     header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
-                         " * from %s and %s files. Do not edit! */" % (unicode_file, spec_casing_file),
+                         " * from %s and %s files. Do not edit! */" % (UNICODE_DATA_FILE, SPECIAL_CASING_FILE),
                          ""]
 
     c_source.complete_header("\n".join(header_completion))
 
+    unicode_data_path = os.path.join(script_args.unicode_dir, UNICODE_DATA_FILE)
+    special_casing_path = os.path.join(script_args.unicode_dir, SPECIAL_CASING_FILE)
+
     # Read the corresponding unicode values of lower and upper case letters and store these in tables
-    lower_case, upper_case = categorizer.read_case_mappings(script_args.unicode_data, script_args.special_casing)
+    lower_case, upper_case = categorizer.read_case_mappings(unicode_data_path, special_casing_path)
 
     c_source.add_conversion_range("character_case",
                                   extract_ranges(lower_case, upper_case),
@@ -702,34 +726,76 @@ def generate_conversions(script_args, plane_type):
     c_source.generate()
 
 
+def generate_folding(script_args, plane_type):
+    if plane_type == UNICODE_PLANE_TYPE_SUPPLEMENTARY:
+        c_source = UnicodeSupplementarySource(FOLDING_SUP_C_SOURCE)
+        categorizer = UnicodeSupplementaryCategorizer()
+    else:
+        c_source = UnicodeBasicSource(FOLDING_C_SOURCE)
+        categorizer = UnicodeBasicCategorizer()
+
+    header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
+                         " * from the %s file. Do not edit! */" % (CASE_FOLDING_FILE),
+                         ""]
+
+    c_source.complete_header("\n".join(header_completion))
+
+    unicode_data_path = os.path.join(script_args.unicode_dir, UNICODE_DATA_FILE)
+    special_casing_path = os.path.join(script_args.unicode_dir, SPECIAL_CASING_FILE)
+    case_folding_path = os.path.join(script_args.unicode_dir, CASE_FOLDING_FILE)
+
+    # Read the corresponding unicode values of lower and upper case letters and store these in tables
+    lower_case, upper_case = categorizer.read_case_mappings(unicode_data_path, special_casing_path)
+
+    folding = {}
+    # Collect the simple (S) and common (C) foldings of the code points accepted by categorizer.in_range
+    with open(case_folding_path, 'r') as case_folding:
+        case_folding_re = re.compile(r'(?P<code_point>[^;]*);\s*(?P<type>[^;]*);\s*(?P<folding>[^;]*);')
+        for line in case_folding:
+            match = case_folding_re.match(line)
+            if match and match.group('type') in ('S', 'C'):
+                code_point = int(match.group('code_point'), 16)
+
+                if categorizer.in_range(code_point):
+                    folding[code_point] = parse_unicode_sequence(match.group('folding'))
+
+    should_to_upper = []
+    should_skip_to_lower = []
+    # A code point that has a lowercase mapping but no S/C folding must not be lowercased when folding
+    for code_point in lower_case:
+        if code_point not in folding:
+            should_skip_to_lower.append(code_point)
+    # A code point whose folding differs from its lowercase mapping is marked for uppercasing instead
+    for code_point, folded in folding.items():
+        if lower_case.get(code_point, make_char(code_point)) != folded:
+            should_to_upper.append(code_point)
+            # If the uppercase mapping already equals the folding, the result must not be lowercased again
+            if upper_case.get(code_point, '') == folded:
+                should_skip_to_lower.append(code_point)
+
+    c_source.add_range('folding_skip_to_lower', categorizer.create_tables(should_skip_to_lower))
+    c_source.add_range('folding_to_upper', categorizer.create_tables(should_to_upper))
+
+    c_source.generate()
+
+
 # entry point
 
 
 def main():
     parser = argparse.ArgumentParser(description='lit-unicode-{conversions,ranges}-{sup}.inc.h generator',
                                      epilog='''
-                                        The input files:
-                                            - UnicodeData.txt
-                                            - SpecialCasing.txt
-                                            - DerivedCoreProperties.txt
-                                            - PropList.txt
-                                        must be retrieved from
-                                        http://www.unicode.org/Public/<VERSION>/ucd/.
+                                        The input data must be retrieved from
+                                        http://www.unicode.org/Public/<VERSION>/ucd/UCD.zip.
                                         The last known good version is 13.0.0.
                                         ''')
-    def check_file(path):
-        if not os.path.isfile(path) or not os.access(path, os.R_OK):
-            raise argparse.ArgumentTypeError('The %s file is missing or not readable!' % path)
+    def check_dir(path):
+        if not os.path.isdir(path) or not os.access(path, os.R_OK):
+            raise argparse.ArgumentTypeError('The %s directory does not exist or is not readable!' % path)
         return path
 
-    parser.add_argument('--unicode-data', metavar='FILE', action='store', required=True,
-                        type=check_file, help='specify the unicode data file')
-    parser.add_argument('--special-casing', metavar='FILE', action='store', required=True,
-                        type=check_file, help='specify the special casing file')
-    parser.add_argument('--prop-list', metavar='FILE', action='store', required=True,
-                        type=check_file, help='specify the prop list file')
-    parser.add_argument('--derived-core-properties', metavar='FILE', action='store', required=True,
-                        type=check_file, help='specify the DerivedCodeProperties file')
+    parser.add_argument('--unicode-dir', metavar='DIR', action='store', required=True,
+                        type=check_dir, help='specify the unicode data directory')
 
     script_args = parser.parse_args()
 
@@ -737,6 +803,9 @@ def main():
     generate_ranges(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY)
     generate_conversions(script_args, UNICODE_PLANE_TYPE_BASIC)
     generate_conversions(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY)
+    generate_folding(script_args, UNICODE_PLANE_TYPE_BASIC)
+    # There are currently no code points in the supplementary planes that require special folding
+    # generate_folding(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY)
 
 
 if __name__ == "__main__":
diff --git a/tools/pylint/pylintrc b/tools/pylint/pylintrc
index 8a436182e..b277c0bf7 100644
--- a/tools/pylint/pylintrc
+++ b/tools/pylint/pylintrc
@@ -310,7 +310,7 @@ max-args=6
 ignored-argument-names=_.*
 
 # Maximum number of locals for function / method body
-max-locals=15
+max-locals=20
 
 # Maximum number of return / yield for function / method body
 max-returns=6