From 321215fdbb6a3074fe0b508e5dd256a1d8de0c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1niel=20B=C3=A1tyai?= Date: Mon, 20 Jul 2020 15:51:43 +0200 Subject: [PATCH] Update RegExp unicode mode case folding to conform to the standard (#4004) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai daniel.batyai@h-lab.eu --- .../ecma/operations/ecma-regexp-object.c | 37 +++-- jerry-core/lit/lit-char-helpers.c | 51 +++++++ jerry-core/lit/lit-char-helpers.h | 5 + jerry-core/lit/lit-unicode-conversions.inc.h | 69 ++++----- jerry-core/lit/lit-unicode-folding.inc.h | 65 +++++++++ tests/jerry/es.next/regexp-unicode.js | 5 + tests/test262-es6-excludelist.xml | 1 - tools/gen-unicode.py | 133 +++++++++++++----- tools/pylint/pylintrc | 2 +- 9 files changed, 284 insertions(+), 84 deletions(-) create mode 100644 jerry-core/lit/lit-unicode-folding.inc.h diff --git a/jerry-core/ecma/operations/ecma-regexp-object.c b/jerry-core/ecma/operations/ecma-regexp-object.c index 49c5dfc93..aa8c4c09a 100644 --- a/jerry-core/ecma/operations/ecma-regexp-object.c +++ b/jerry-core/ecma/operations/ecma-regexp-object.c @@ -403,30 +403,43 @@ lit_code_point_t ecma_regexp_canonicalize_char (lit_code_point_t ch, /**< character */ bool unicode) /**< unicode */ { - if (JERRY_LIKELY (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)) +#if ENABLED (JERRY_ESNEXT) + if (unicode) { - if (ch >= LIT_CHAR_LOWERCASE_A && ch <= LIT_CHAR_LOWERCASE_Z) + /* In unicode mode the mappings contained in the CaseFolding.txt file should be used to canonicalize the character. + * These mappings generally correspond to the lowercase variant of the character, however there are some + * differences. In some cases the uppercase variant is used, in others the lowercase of the uppercase character is + * used, and there are also cases where the character has no case folding mapping even though it has upper/lower + * variants. 
Since lowercasing is the most common this is used as the default behaviour, and characters with + * differing behaviours are encoded in lookup tables. */ + + if (lit_char_fold_to_upper (ch)) { - return (ecma_char_t) (ch - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A)); + ch = lit_char_to_upper_case (ch, NULL); + JERRY_ASSERT (ch != LIT_MULTIPLE_CU); + } + + if (lit_char_fold_to_lower (ch)) + { + ch = lit_char_to_lower_case (ch, NULL); + JERRY_ASSERT (ch != LIT_MULTIPLE_CU); } return ch; } +#endif /* !ENABLED (JERRY_ESNEXT) */ + JERRY_UNUSED (unicode); lit_code_point_t cu = lit_char_to_upper_case (ch, NULL); - if (cu == LIT_MULTIPLE_CU) + if (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX + || (cu > LIT_UTF8_1_BYTE_CODE_POINT_MAX + && cu != LIT_MULTIPLE_CU)) { - return ch; + return cu; } - if (cu <= LIT_UTF8_1_BYTE_CODE_POINT_MAX && !unicode) - { - /* 6. */ - return ch; - } - - return cu; + return ch; } /* ecma_regexp_canonicalize_char */ /** diff --git a/jerry-core/lit/lit-char-helpers.c b/jerry-core/lit/lit-char-helpers.c index 9e808827b..8474b8777 100644 --- a/jerry-core/lit/lit-char-helpers.c +++ b/jerry-core/lit/lit-char-helpers.c @@ -23,6 +23,9 @@ #if ENABLED (JERRY_UNICODE_CASE_CONVERSION) #include "lit-unicode-conversions.inc.h" #include "lit-unicode-conversions-sup.inc.h" +#if ENABLED (JERRY_ESNEXT) +#include "lit-unicode-folding.inc.h" +#endif /* ENABLED (JERRY_ESNEXT) */ #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ #define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof ((array)[0])) @@ -914,3 +917,51 @@ lit_char_to_upper_case (lit_code_point_t cp, /**< code point */ return cp; #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ } /* lit_char_to_upper_case */ + +#if ENABLED (JERRY_ESNEXT) +/* + * Look up whether the character should be folded to the lowercase variant. 
+ * + * @return true, if character should be lowercased + * false, otherwise + */ +bool +lit_char_fold_to_lower (lit_code_point_t cp) /**< code point */ +{ +#if ENABLED (JERRY_UNICODE_CASE_CONVERSION) + return (cp > LIT_UTF16_CODE_UNIT_MAX + || (!lit_search_char_in_interval_array ((ecma_char_t) cp, + lit_unicode_folding_skip_to_lower_interval_starts, + lit_unicode_folding_skip_to_lower_interval_lengths, + NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_interval_starts)) + && !lit_search_char_in_array ((ecma_char_t) cp, + lit_unicode_folding_skip_to_lower_chars, + NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_chars)))); +#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ + return true; +#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ +} /* lit_char_fold_to_lower */ + +/* + * Look up whether the character should be folded to the uppercase variant. + * + * @return true, if character should be uppercased + * false, otherwise + */ +bool +lit_char_fold_to_upper (lit_code_point_t cp) /**< code point */ +{ +#if ENABLED (JERRY_UNICODE_CASE_CONVERSION) + return (cp <= LIT_UTF16_CODE_UNIT_MAX + && (lit_search_char_in_interval_array ((ecma_char_t) cp, + lit_unicode_folding_to_upper_interval_starts, + lit_unicode_folding_to_upper_interval_lengths, + NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_interval_starts)) + || lit_search_char_in_array ((ecma_char_t) cp, + lit_unicode_folding_to_upper_chars, + NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_chars)))); +#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ + return false; +#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ +} /* lit_char_fold_to_upper */ +#endif /* ENABLED (JERRY_ESNEXT) */ diff --git a/jerry-core/lit/lit-char-helpers.h b/jerry-core/lit/lit-char-helpers.h index 49eae214d..2a39f22de 100644 --- a/jerry-core/lit/lit-char-helpers.h +++ b/jerry-core/lit/lit-char-helpers.h @@ -248,4 +248,9 @@ bool lit_char_is_word_char (lit_code_point_t c); lit_code_point_t lit_char_to_lower_case (lit_code_point_t 
cp, ecma_stringbuilder_t *builder_p); lit_code_point_t lit_char_to_upper_case (lit_code_point_t cp, ecma_stringbuilder_t *builder_p); +#if ENABLED (JERRY_ESNEXT) +bool lit_char_fold_to_lower (lit_code_point_t cp); +bool lit_char_fold_to_upper (lit_code_point_t cp); +#endif /* ENABLED (JERRY_ESNEXT) */ + #endif /* !LIT_CHAR_HELPERS_H */ diff --git a/jerry-core/lit/lit-unicode-conversions.inc.h b/jerry-core/lit/lit-unicode-conversions.inc.h index bf4287d9a..b100576a1 100644 --- a/jerry-core/lit/lit-unicode-conversions.inc.h +++ b/jerry-core/lit/lit-unicode-conversions.inc.h @@ -96,68 +96,61 @@ static const uint8_t lit_unicode_upper_case_special_range_lengths[] JERRY_ATTR_C /* Contains start points of lowercase ranges. */ static const uint16_t lit_unicode_lower_case_ranges[] JERRY_ATTR_CONST_DATA = { - 0x1e96, 0x1e96, 0x1f80, 0x1f80, 0x1f88, 0x1f80, 0x1f90, 0x1f90, 0x1f98, 0x1f90, - 0x1fa0, 0x1fa0, 0x1fa8, 0x1fa0, 0x1fb2, 0x1fb2, 0x1fb6, 0x1fb6, 0x1fc2, 0x1fc2, - 0x1fc6, 0x1fc6, 0x1fd2, 0x1fd2, 0x1fd6, 0x1fd6, 0x1fe2, 0x1fe2, 0x1fe6, 0x1fe6, - 0x1ff2, 0x1ff2, 0x1ff6, 0x1ff6, 0xfb00, 0xfb00, 0xfb13, 0xfb13 + 0x1f88, 0x1f80, 0x1f98, 0x1f90, 0x1fa8, 0x1fa0 }; /* Interval lengths for start points in `lower_case_ranges` table. */ static const uint8_t lit_unicode_lower_case_range_lengths[] JERRY_ATTR_CONST_DATA = { - 0x0005, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0003, 0x0002, 0x0003, - 0x0002, 0x0002, 0x0002, 0x0003, 0x0002, 0x0003, 0x0002, 0x0007, 0x0005 + 0x0008, 0x0008, 0x0008 }; /* The remaining lowercase conversions. The lowercase variant can be one-to-three character long. 
*/ static const uint16_t lit_unicode_lower_case_conversions[] JERRY_ATTR_CONST_DATA = { - 0x00df, 0x00df, 0x0149, 0x0149, 0x01c5, 0x01c6, 0x01c8, 0x01c9, 0x01cb, 0x01cc, - 0x01f0, 0x01f0, 0x01f2, 0x01f3, 0x0390, 0x0390, 0x03b0, 0x03b0, 0x03f4, 0x03b8, - 0x0587, 0x0587, 0x1e9e, 0x00df, 0x1f50, 0x1f50, 0x1f52, 0x1f52, 0x1f54, 0x1f54, - 0x1f56, 0x1f56, 0x1fbc, 0x1fb3, 0x1fcc, 0x1fc3, 0x1ffc, 0x1ff3, 0x2126, 0x03c9, + 0x01c5, 0x01c6, 0x01c8, 0x01c9, 0x01cb, 0x01cc, 0x01f2, 0x01f3, 0x03f4, 0x03b8, + 0x1e9e, 0x00df, 0x1fbc, 0x1fb3, 0x1fcc, 0x1fc3, 0x1ffc, 0x1ff3, 0x2126, 0x03c9, 0x212a, 0x006b, 0x212b, 0x00e5, 0x0130, 0x0069, 0x0307 }; /* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */ static const uint8_t lit_unicode_lower_case_conversion_counters[] JERRY_ATTR_CONST_DATA = { - 0x0016, 0x0001, 0x0000 + 0x000c, 0x0001, 0x0000 }; /* The remaining uppercase conversions. The uppercase variant can be one-to-three character long. */ static const uint16_t lit_unicode_upper_case_conversions[] JERRY_ATTR_CONST_DATA = { - 0x00b5, 0x039c, 0x0130, 0x0130, 0x0131, 0x0049, 0x017f, 0x0053, 0x01c5, 0x01c4, - 0x01c8, 0x01c7, 0x01cb, 0x01ca, 0x01f2, 0x01f1, 0x0345, 0x0399, 0x03c2, 0x03a3, - 0x03d0, 0x0392, 0x03d1, 0x0398, 0x03d5, 0x03a6, 0x03d6, 0x03a0, 0x03f0, 0x039a, - 0x03f1, 0x03a1, 0x03f5, 0x0395, 0x1c80, 0x0412, 0x1c81, 0x0414, 0x1c82, 0x041e, - 0x1c83, 0x0421, 0x1c84, 0x0422, 0x1c85, 0x0422, 0x1c86, 0x042a, 0x1c87, 0x0462, - 0x1c88, 0xa64a, 0x1e9b, 0x1e60, 0x1fbe, 0x0399, 0x00df, 0x0053, 0x0053, 0x0149, - 0x02bc, 0x004e, 0x01f0, 0x004a, 0x030c, 0x0587, 0x0535, 0x0552, 0x1e96, 0x0048, - 0x0331, 0x1e97, 0x0054, 0x0308, 0x1e98, 0x0057, 0x030a, 0x1e99, 0x0059, 0x030a, - 0x1e9a, 0x0041, 0x02be, 0x1f50, 0x03a5, 0x0313, 0x1f87, 0x1f0f, 0x0399, 0x1f8f, - 0x1f0f, 0x0399, 0x1f97, 0x1f2f, 0x0399, 0x1f9f, 0x1f2f, 0x0399, 0x1fa7, 0x1f6f, - 0x0399, 0x1faf, 0x1f6f, 0x0399, 0x1fb2, 0x1fba, 0x0399, 0x1fb3, 0x0391, 0x0399, - 0x1fb4, 0x0386, 0x0399, 0x1fb6, 0x0391, 
0x0342, 0x1fbc, 0x0391, 0x0399, 0x1fc2, - 0x1fca, 0x0399, 0x1fc3, 0x0397, 0x0399, 0x1fc4, 0x0389, 0x0399, 0x1fc6, 0x0397, - 0x0342, 0x1fcc, 0x0397, 0x0399, 0x1fd6, 0x0399, 0x0342, 0x1fe4, 0x03a1, 0x0313, - 0x1fe6, 0x03a5, 0x0342, 0x1ff2, 0x1ffa, 0x0399, 0x1ff3, 0x03a9, 0x0399, 0x1ff4, - 0x038f, 0x0399, 0x1ff6, 0x03a9, 0x0342, 0x1ffc, 0x03a9, 0x0399, 0xfb00, 0x0046, - 0x0046, 0xfb01, 0x0046, 0x0049, 0xfb02, 0x0046, 0x004c, 0xfb05, 0x0053, 0x0054, - 0xfb06, 0x0053, 0x0054, 0xfb13, 0x0544, 0x0546, 0xfb14, 0x0544, 0x0535, 0xfb15, - 0x0544, 0x053b, 0xfb16, 0x054e, 0x0546, 0xfb17, 0x0544, 0x053d, 0x0390, 0x0399, - 0x0308, 0x0301, 0x03b0, 0x03a5, 0x0308, 0x0301, 0x1f52, 0x03a5, 0x0313, 0x0300, - 0x1f54, 0x03a5, 0x0313, 0x0301, 0x1f56, 0x03a5, 0x0313, 0x0342, 0x1fb7, 0x0391, - 0x0342, 0x0399, 0x1fc7, 0x0397, 0x0342, 0x0399, 0x1fd2, 0x0399, 0x0308, 0x0300, - 0x1fd3, 0x0399, 0x0308, 0x0301, 0x1fd7, 0x0399, 0x0308, 0x0342, 0x1fe2, 0x03a5, - 0x0308, 0x0300, 0x1fe3, 0x03a5, 0x0308, 0x0301, 0x1fe7, 0x03a5, 0x0308, 0x0342, - 0x1ff7, 0x03a9, 0x0342, 0x0399, 0xfb03, 0x0046, 0x0046, 0x0049, 0xfb04, 0x0046, - 0x0046, 0x004c + 0x00b5, 0x039c, 0x0131, 0x0049, 0x017f, 0x0053, 0x01c5, 0x01c4, 0x01c8, 0x01c7, + 0x01cb, 0x01ca, 0x01f2, 0x01f1, 0x0345, 0x0399, 0x03c2, 0x03a3, 0x03d0, 0x0392, + 0x03d1, 0x0398, 0x03d5, 0x03a6, 0x03d6, 0x03a0, 0x03f0, 0x039a, 0x03f1, 0x03a1, + 0x03f5, 0x0395, 0x1c80, 0x0412, 0x1c81, 0x0414, 0x1c82, 0x041e, 0x1c83, 0x0421, + 0x1c84, 0x0422, 0x1c85, 0x0422, 0x1c86, 0x042a, 0x1c87, 0x0462, 0x1c88, 0xa64a, + 0x1e9b, 0x1e60, 0x1fbe, 0x0399, 0x00df, 0x0053, 0x0053, 0x0149, 0x02bc, 0x004e, + 0x01f0, 0x004a, 0x030c, 0x0587, 0x0535, 0x0552, 0x1e96, 0x0048, 0x0331, 0x1e97, + 0x0054, 0x0308, 0x1e98, 0x0057, 0x030a, 0x1e99, 0x0059, 0x030a, 0x1e9a, 0x0041, + 0x02be, 0x1f50, 0x03a5, 0x0313, 0x1f87, 0x1f0f, 0x0399, 0x1f8f, 0x1f0f, 0x0399, + 0x1f97, 0x1f2f, 0x0399, 0x1f9f, 0x1f2f, 0x0399, 0x1fa7, 0x1f6f, 0x0399, 0x1faf, + 0x1f6f, 0x0399, 0x1fb2, 0x1fba, 0x0399, 0x1fb3, 
0x0391, 0x0399, 0x1fb4, 0x0386, + 0x0399, 0x1fb6, 0x0391, 0x0342, 0x1fbc, 0x0391, 0x0399, 0x1fc2, 0x1fca, 0x0399, + 0x1fc3, 0x0397, 0x0399, 0x1fc4, 0x0389, 0x0399, 0x1fc6, 0x0397, 0x0342, 0x1fcc, + 0x0397, 0x0399, 0x1fd6, 0x0399, 0x0342, 0x1fe4, 0x03a1, 0x0313, 0x1fe6, 0x03a5, + 0x0342, 0x1ff2, 0x1ffa, 0x0399, 0x1ff3, 0x03a9, 0x0399, 0x1ff4, 0x038f, 0x0399, + 0x1ff6, 0x03a9, 0x0342, 0x1ffc, 0x03a9, 0x0399, 0xfb00, 0x0046, 0x0046, 0xfb01, + 0x0046, 0x0049, 0xfb02, 0x0046, 0x004c, 0xfb05, 0x0053, 0x0054, 0xfb06, 0x0053, + 0x0054, 0xfb13, 0x0544, 0x0546, 0xfb14, 0x0544, 0x0535, 0xfb15, 0x0544, 0x053b, + 0xfb16, 0x054e, 0x0546, 0xfb17, 0x0544, 0x053d, 0x0390, 0x0399, 0x0308, 0x0301, + 0x03b0, 0x03a5, 0x0308, 0x0301, 0x1f52, 0x03a5, 0x0313, 0x0300, 0x1f54, 0x03a5, + 0x0313, 0x0301, 0x1f56, 0x03a5, 0x0313, 0x0342, 0x1fb7, 0x0391, 0x0342, 0x0399, + 0x1fc7, 0x0397, 0x0342, 0x0399, 0x1fd2, 0x0399, 0x0308, 0x0300, 0x1fd3, 0x0399, + 0x0308, 0x0301, 0x1fd7, 0x0399, 0x0308, 0x0342, 0x1fe2, 0x03a5, 0x0308, 0x0300, + 0x1fe3, 0x03a5, 0x0308, 0x0301, 0x1fe7, 0x03a5, 0x0308, 0x0342, 0x1ff7, 0x03a9, + 0x0342, 0x0399, 0xfb03, 0x0046, 0x0046, 0x0049, 0xfb04, 0x0046, 0x0046, 0x004c }; /* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. */ static const uint8_t lit_unicode_upper_case_conversion_counters[] JERRY_ATTR_CONST_DATA = { - 0x001c, 0x002c, 0x0010 + 0x001b, 0x002c, 0x0010 }; diff --git a/jerry-core/lit/lit-unicode-folding.inc.h b/jerry-core/lit/lit-unicode-folding.inc.h new file mode 100644 index 000000000..5c4965b0d --- /dev/null +++ b/jerry-core/lit/lit-unicode-folding.inc.h @@ -0,0 +1,65 @@ +/* Copyright JS Foundation and other contributors, http://js.foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This file is automatically generated by the gen-unicode.py script + * from the CaseFolding.txt file. Do not edit! */ + +/** + * Character interval starting points for folding_skip_to_lower. + */ +static const uint16_t lit_unicode_folding_skip_to_lower_interval_starts[] JERRY_ATTR_CONST_DATA = +{ + 0x13a0, 0x13f8, 0xab70 +}; + +/** + * Character interval lengths for folding_skip_to_lower. + */ +static const uint8_t lit_unicode_folding_skip_to_lower_interval_lengths[] JERRY_ATTR_CONST_DATA = +{ + 0x0055, 0x0005, 0x004f +}; + +/** + * Non-interval characters for folding_skip_to_lower. + */ +static const uint16_t lit_unicode_folding_skip_to_lower_chars[] JERRY_ATTR_CONST_DATA = +{ + 0x0130 +}; + +/** + * Character interval starting points for folding_to_upper. + */ +static const uint16_t lit_unicode_folding_to_upper_interval_starts[] JERRY_ATTR_CONST_DATA = +{ + 0x03d0, 0x03d5, 0x03f0, 0x13f8, 0x1c80, 0xab70 +}; + +/** + * Character interval lengths for folding_to_upper. + */ +static const uint8_t lit_unicode_folding_to_upper_interval_lengths[] JERRY_ATTR_CONST_DATA = +{ + 0x0001, 0x0001, 0x0001, 0x0005, 0x0008, 0x004f +}; + +/** + * Non-interval characters for folding_to_upper. 
+ */ +static const uint16_t lit_unicode_folding_to_upper_chars[] JERRY_ATTR_CONST_DATA = +{ + 0x00b5, 0x017f, 0x0345, 0x03c2, 0x03f5, 0x1e9b, 0x1fbe +}; diff --git a/tests/jerry/es.next/regexp-unicode.js b/tests/jerry/es.next/regexp-unicode.js index 60ac33e83..58f6e60b8 100644 --- a/tests/jerry/es.next/regexp-unicode.js +++ b/tests/jerry/es.next/regexp-unicode.js @@ -359,3 +359,8 @@ try { } catch (e) { assert (e instanceof SyntaxError); } + +assert(/\w/iu.test("ſ")); +assert(/\w/iu.test("\u212a")); +assert(/k/iu.test("\u212a")); +assert(/\u{10c90}/iu.test("\u{10cd0}")); diff --git a/tests/test262-es6-excludelist.xml b/tests/test262-es6-excludelist.xml index 6b2de92b8..4df24f868 100644 --- a/tests/test262-es6-excludelist.xml +++ b/tests/test262-es6-excludelist.xml @@ -338,7 +338,6 @@ - diff --git a/tools/gen-unicode.py b/tools/gen-unicode.py index 804c0ff73..884830642 100755 --- a/tools/gen-unicode.py +++ b/tools/gen-unicode.py @@ -27,10 +27,18 @@ from gen_c_source import LICENSE, format_code from settings import PROJECT_DIR +UNICODE_DATA_FILE = 'UnicodeData.txt' +SPECIAL_CASING_FILE = 'SpecialCasing.txt' +DERIVED_PROPS_FILE = 'DerivedCoreProperties.txt' +PROP_LIST_FILE = 'PropList.txt' +CASE_FOLDING_FILE = 'CaseFolding.txt' + RANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h') RANGES_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges-sup.inc.h') CONVERSIONS_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions.inc.h') CONVERSIONS_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions-sup.inc.h') +FOLDING_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-folding.inc.h') +FOLDING_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-folding-sup.inc.h') UNICODE_PLANE_TYPE_BASIC = 0 UNICODE_PLANE_TYPE_SUPPLEMENTARY = 1 @@ -266,11 +274,14 @@ class UnicodeBasicCategorizer(object): if not self.in_range(letter_id) or condition_list: 
continue + original_letter = parse_unicode_sequence(line[0]) small_letter = parse_unicode_sequence(line[1]) capital_letter = parse_unicode_sequence(line[3]) - lower_case_mapping[letter_id] = small_letter - upper_case_mapping[letter_id] = capital_letter + if small_letter != original_letter: + lower_case_mapping[letter_id] = small_letter + if capital_letter != original_letter: + upper_case_mapping[letter_id] = capital_letter return lower_case_mapping, upper_case_mapping @@ -292,12 +303,13 @@ def generate_ranges(script_args, plane_type): categorizer = UnicodeBasicCategorizer() header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__), - " * from %s. Do not edit! */" % os.path.basename(script_args.derived_core_properties), + " * from %s. Do not edit! */" % (DERIVED_PROPS_FILE), ""] c_source.complete_header("\n".join(header_completion)) - units = categorizer.read_units(script_args.derived_core_properties, ["ID_Start", "ID_Continue"]) + derived_props_path = os.path.join(script_args.unicode_dir, DERIVED_PROPS_FILE) + units = categorizer.read_units(derived_props_path, ["ID_Start", "ID_Continue"]) units["ID_Continue"] = sorted(set(units["ID_Continue"]).union(categorizer.extra_id_continue_units) - set(units["ID_Start"])) @@ -305,7 +317,9 @@ def generate_ranges(script_args, plane_type): for category, unit in units.items(): c_source.add_range(category, categorizer.create_tables(unit)) - white_space_units = categorizer.read_units(script_args.prop_list, ["White_Space"], ["Zs"])["White_Space"] + prop_list_path = os.path.join(script_args.unicode_dir, PROP_LIST_FILE) + + white_space_units = categorizer.read_units(prop_list_path, ["White_Space"], ["Zs"])["White_Space"] c_source.add_whitepace_range("White_Space", categorizer, white_space_units) @@ -314,6 +328,19 @@ def generate_ranges(script_args, plane_type): # functions for unicode conversions +def make_char(hex_val): + """ + Create a unicode character from a hex value + + :param 
hex_val: Hex value of the character. + :return: Unicode character corresponding to the value. + """ + + try: + return unichr(hex_val) + except NameError: + return chr(hex_val) + def parse_unicode_sequence(raw_data): """ @@ -331,10 +358,7 @@ def parse_unicode_sequence(raw_data): # Convert it to unicode code point (from hex value without 0x prefix) hex_val = int(unicode_char, 16) - try: - result += unichr(hex_val) - except NameError: - result += chr(hex_val) + result += make_char(hex_val) return result @@ -637,17 +661,17 @@ def generate_conversions(script_args, plane_type): c_source = UnicodeBasicSource(CONVERSIONS_C_SOURCE) categorizer = UnicodeBasicCategorizer() - unicode_file = os.path.basename(script_args.unicode_data) - spec_casing_file = os.path.basename(script_args.special_casing) - header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__), - " * from %s and %s files. Do not edit! */" % (unicode_file, spec_casing_file), + " * from %s and %s files. Do not edit! 
*/" % (UNICODE_DATA_FILE, SPECIAL_CASING_FILE), ""] c_source.complete_header("\n".join(header_completion)) + unicode_data_path = os.path.join(script_args.unicode_dir, UNICODE_DATA_FILE) + special_casing_path = os.path.join(script_args.unicode_dir, SPECIAL_CASING_FILE) + # Read the corresponding unicode values of lower and upper case letters and store these in tables - lower_case, upper_case = categorizer.read_case_mappings(script_args.unicode_data, script_args.special_casing) + lower_case, upper_case = categorizer.read_case_mappings(unicode_data_path, special_casing_path) c_source.add_conversion_range("character_case", extract_ranges(lower_case, upper_case), @@ -702,34 +726,76 @@ def generate_conversions(script_args, plane_type): c_source.generate() +def generate_folding(script_args, plane_type): + if plane_type == UNICODE_PLANE_TYPE_SUPPLEMENTARY: + c_source = UnicodeSupplementarySource(FOLDING_SUP_C_SOURCE) + categorizer = UnicodeSupplementaryCategorizer() + else: + c_source = UnicodeBasicSource(FOLDING_C_SOURCE) + categorizer = UnicodeBasicCategorizer() + + header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__), + " * from the %s file. Do not edit! 
*/" % (CASE_FOLDING_FILE), + ""] + + c_source.complete_header("\n".join(header_completion)) + + unicode_data_path = os.path.join(script_args.unicode_dir, UNICODE_DATA_FILE) + special_casing_path = os.path.join(script_args.unicode_dir, SPECIAL_CASING_FILE) + case_folding_path = os.path.join(script_args.unicode_dir, CASE_FOLDING_FILE) + + # Read the corresponding unicode values of lower and upper case letters and store these in tables + lower_case, upper_case = categorizer.read_case_mappings(unicode_data_path, special_casing_path) + + folding = {} + + with open(case_folding_path, 'r') as case_folding: + case_folding_re = re.compile(r'(?P<code_point>[^;]*);\s*(?P<type>[^;]*);\s*(?P<folding>[^;]*);') + for line in case_folding: + match = case_folding_re.match(line) + if match and match.group('type') in ('S', 'C'): + code_point = int(match.group('code_point'), 16) + + if categorizer.in_range(code_point): + folding[code_point] = parse_unicode_sequence(match.group('folding')) + + should_to_upper = [] + should_skip_to_lower = [] + + for code_point in lower_case: + if code_point not in folding: + should_skip_to_lower.append(code_point) + + for code_point, folded in folding.items(): + if lower_case.get(code_point, make_char(code_point)) != folded: + should_to_upper.append(code_point) + + if upper_case.get(code_point, '') == folded: + should_skip_to_lower.append(code_point) + + c_source.add_range('folding_skip_to_lower', categorizer.create_tables(should_skip_to_lower)) + c_source.add_range('folding_to_upper', categorizer.create_tables(should_to_upper)) + + c_source.generate() + + # entry point def main(): parser = argparse.ArgumentParser(description='lit-unicode-{conversions,ranges}-{sup}.inc.h generator', epilog=''' - The input files: - - UnicodeData.txt - - SpecialCasing.txt - - DerivedCoreProperties.txt - - PropList.txt - must be retrieved from - http://www.unicode.org/Public/<version>/ucd/. + The input data must be retrieved from + http://www.unicode.org/Public/<version>/ucd/UCD.zip. 
The last known good version is 13.0.0. ''') - def check_file(path): - if not os.path.isfile(path) or not os.access(path, os.R_OK): - raise argparse.ArgumentTypeError('The %s file is missing or not readable!' % path) + def check_dir(path): + if not os.path.isdir(path) or not os.access(path, os.R_OK): + raise argparse.ArgumentTypeError('The %s directory does not exist or is not readable!' % path) return path - parser.add_argument('--unicode-data', metavar='FILE', action='store', required=True, - type=check_file, help='specify the unicode data file') - parser.add_argument('--special-casing', metavar='FILE', action='store', required=True, - type=check_file, help='specify the special casing file') - parser.add_argument('--prop-list', metavar='FILE', action='store', required=True, - type=check_file, help='specify the prop list file') - parser.add_argument('--derived-core-properties', metavar='FILE', action='store', required=True, - type=check_file, help='specify the DerivedCodeProperties file') + parser.add_argument('--unicode-dir', metavar='DIR', action='store', required=True, + type=check_dir, help='specify the unicode data directory') script_args = parser.parse_args() @@ -737,6 +803,9 @@ def main(): generate_ranges(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY) generate_conversions(script_args, UNICODE_PLANE_TYPE_BASIC) generate_conversions(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY) + generate_folding(script_args, UNICODE_PLANE_TYPE_BASIC) + # There are currently no code points in the supplementary planes that require special folding + # generate_folding(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY) if __name__ == "__main__": diff --git a/tools/pylint/pylintrc b/tools/pylint/pylintrc index 8a436182e..b277c0bf7 100644 --- a/tools/pylint/pylintrc +++ b/tools/pylint/pylintrc @@ -310,7 +310,7 @@ max-args=6 ignored-argument-names=_.* # Maximum number of locals for function / method body -max-locals=15 +max-locals=20 # Maximum number of return / yield for function / 
method body max-returns=6