Update RegExp unicode mode case folding to conform to the standard (#4004)
JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai daniel.batyai@h-lab.eu
This commit is contained in:
@@ -403,30 +403,43 @@ lit_code_point_t
|
||||
ecma_regexp_canonicalize_char (lit_code_point_t ch, /**< character */
|
||||
bool unicode) /**< unicode */
|
||||
{
|
||||
if (JERRY_LIKELY (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX))
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
if (unicode)
|
||||
{
|
||||
if (ch >= LIT_CHAR_LOWERCASE_A && ch <= LIT_CHAR_LOWERCASE_Z)
|
||||
/* In unicode mode the mappings contained in the CaseFolding.txt file should be used to canonicalize the character.
|
||||
* These mappings generally correspond to the lowercase variant of the character, however there are some
|
||||
* differences. In some cases the uppercase variant is used, in others the lowercase of the uppercase character is
|
||||
* used, and there are also cases where the character has no case folding mapping even though it has upper/lower
|
||||
* variants. Since lowercasing is the most common this is used as the default behaviour, and characters with
|
||||
* differing behaviours are encoded in lookup tables. */
|
||||
|
||||
if (lit_char_fold_to_upper (ch))
|
||||
{
|
||||
return (ecma_char_t) (ch - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
|
||||
ch = lit_char_to_upper_case (ch, NULL);
|
||||
JERRY_ASSERT (ch != LIT_MULTIPLE_CU);
|
||||
}
|
||||
|
||||
if (lit_char_fold_to_lower (ch))
|
||||
{
|
||||
ch = lit_char_to_lower_case (ch, NULL);
|
||||
JERRY_ASSERT (ch != LIT_MULTIPLE_CU);
|
||||
}
|
||||
|
||||
return ch;
|
||||
}
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
|
||||
JERRY_UNUSED (unicode);
|
||||
lit_code_point_t cu = lit_char_to_upper_case (ch, NULL);
|
||||
|
||||
if (cu == LIT_MULTIPLE_CU)
|
||||
if (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX
|
||||
|| (cu > LIT_UTF8_1_BYTE_CODE_POINT_MAX
|
||||
&& cu != LIT_MULTIPLE_CU))
|
||||
{
|
||||
return ch;
|
||||
return cu;
|
||||
}
|
||||
|
||||
if (cu <= LIT_UTF8_1_BYTE_CODE_POINT_MAX && !unicode)
|
||||
{
|
||||
/* 6. */
|
||||
return ch;
|
||||
}
|
||||
|
||||
return cu;
|
||||
return ch;
|
||||
} /* ecma_regexp_canonicalize_char */
|
||||
|
||||
/**
|
||||
|
||||
@@ -23,6 +23,9 @@
|
||||
#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
|
||||
#include "lit-unicode-conversions.inc.h"
|
||||
#include "lit-unicode-conversions-sup.inc.h"
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
#include "lit-unicode-folding.inc.h"
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
|
||||
|
||||
#define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof ((array)[0]))
|
||||
@@ -914,3 +917,51 @@ lit_char_to_upper_case (lit_code_point_t cp, /**< code point */
|
||||
return cp;
|
||||
#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
|
||||
} /* lit_char_to_upper_case */
|
||||
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
/*
|
||||
* Look up whether the character should be folded to the lowercase variant.
|
||||
*
|
||||
* @return true, if character should be lowercased
|
||||
* false, otherwise
|
||||
*/
|
||||
bool
|
||||
lit_char_fold_to_lower (lit_code_point_t cp) /**< code point */
|
||||
{
|
||||
#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
|
||||
return (cp > LIT_UTF16_CODE_UNIT_MAX
|
||||
|| (!lit_search_char_in_interval_array ((ecma_char_t) cp,
|
||||
lit_unicode_folding_skip_to_lower_interval_starts,
|
||||
lit_unicode_folding_skip_to_lower_interval_lengths,
|
||||
NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_interval_starts))
|
||||
&& !lit_search_char_in_array ((ecma_char_t) cp,
|
||||
lit_unicode_folding_skip_to_lower_chars,
|
||||
NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_chars))));
|
||||
#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
|
||||
return true;
|
||||
#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
|
||||
} /* lit_char_fold_to_lower */
|
||||
|
||||
/*
|
||||
* Look up whether the character should be folded to the uppercase variant.
|
||||
*
|
||||
* @return true, if character should be uppercased
|
||||
* false, otherwise
|
||||
*/
|
||||
bool
|
||||
lit_char_fold_to_upper (lit_code_point_t cp) /**< code point */
|
||||
{
|
||||
#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
|
||||
return (cp <= LIT_UTF16_CODE_UNIT_MAX
|
||||
&& (lit_search_char_in_interval_array ((ecma_char_t) cp,
|
||||
lit_unicode_folding_to_upper_interval_starts,
|
||||
lit_unicode_folding_to_upper_interval_lengths,
|
||||
NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_interval_starts))
|
||||
|| lit_search_char_in_array ((ecma_char_t) cp,
|
||||
lit_unicode_folding_to_upper_chars,
|
||||
NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_chars))));
|
||||
#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
|
||||
return false;
|
||||
#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
|
||||
} /* lit_char_fold_to_upper */
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
|
||||
@@ -248,4 +248,9 @@ bool lit_char_is_word_char (lit_code_point_t c);
|
||||
lit_code_point_t lit_char_to_lower_case (lit_code_point_t cp, ecma_stringbuilder_t *builder_p);
|
||||
lit_code_point_t lit_char_to_upper_case (lit_code_point_t cp, ecma_stringbuilder_t *builder_p);
|
||||
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
bool lit_char_fold_to_lower (lit_code_point_t cp);
|
||||
bool lit_char_fold_to_upper (lit_code_point_t cp);
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
|
||||
#endif /* !LIT_CHAR_HELPERS_H */
|
||||
|
||||
@@ -96,68 +96,61 @@ static const uint8_t lit_unicode_upper_case_special_range_lengths[] JERRY_ATTR_C
|
||||
/* Contains start points of lowercase ranges. */
|
||||
static const uint16_t lit_unicode_lower_case_ranges[] JERRY_ATTR_CONST_DATA =
|
||||
{
|
||||
0x1e96, 0x1e96, 0x1f80, 0x1f80, 0x1f88, 0x1f80, 0x1f90, 0x1f90, 0x1f98, 0x1f90,
|
||||
0x1fa0, 0x1fa0, 0x1fa8, 0x1fa0, 0x1fb2, 0x1fb2, 0x1fb6, 0x1fb6, 0x1fc2, 0x1fc2,
|
||||
0x1fc6, 0x1fc6, 0x1fd2, 0x1fd2, 0x1fd6, 0x1fd6, 0x1fe2, 0x1fe2, 0x1fe6, 0x1fe6,
|
||||
0x1ff2, 0x1ff2, 0x1ff6, 0x1ff6, 0xfb00, 0xfb00, 0xfb13, 0xfb13
|
||||
0x1f88, 0x1f80, 0x1f98, 0x1f90, 0x1fa8, 0x1fa0
|
||||
};
|
||||
|
||||
/* Interval lengths for start points in `lower_case_ranges` table. */
|
||||
static const uint8_t lit_unicode_lower_case_range_lengths[] JERRY_ATTR_CONST_DATA =
|
||||
{
|
||||
0x0005, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0003, 0x0002, 0x0003,
|
||||
0x0002, 0x0002, 0x0002, 0x0003, 0x0002, 0x0003, 0x0002, 0x0007, 0x0005
|
||||
0x0008, 0x0008, 0x0008
|
||||
};
|
||||
|
||||
/* The remaining lowercase conversions. The lowercase variant can be one-to-three character long. */
|
||||
static const uint16_t lit_unicode_lower_case_conversions[] JERRY_ATTR_CONST_DATA =
|
||||
{
|
||||
0x00df, 0x00df, 0x0149, 0x0149, 0x01c5, 0x01c6, 0x01c8, 0x01c9, 0x01cb, 0x01cc,
|
||||
0x01f0, 0x01f0, 0x01f2, 0x01f3, 0x0390, 0x0390, 0x03b0, 0x03b0, 0x03f4, 0x03b8,
|
||||
0x0587, 0x0587, 0x1e9e, 0x00df, 0x1f50, 0x1f50, 0x1f52, 0x1f52, 0x1f54, 0x1f54,
|
||||
0x1f56, 0x1f56, 0x1fbc, 0x1fb3, 0x1fcc, 0x1fc3, 0x1ffc, 0x1ff3, 0x2126, 0x03c9,
|
||||
0x01c5, 0x01c6, 0x01c8, 0x01c9, 0x01cb, 0x01cc, 0x01f2, 0x01f3, 0x03f4, 0x03b8,
|
||||
0x1e9e, 0x00df, 0x1fbc, 0x1fb3, 0x1fcc, 0x1fc3, 0x1ffc, 0x1ff3, 0x2126, 0x03c9,
|
||||
0x212a, 0x006b, 0x212b, 0x00e5, 0x0130, 0x0069, 0x0307
|
||||
};
|
||||
|
||||
/* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */
|
||||
static const uint8_t lit_unicode_lower_case_conversion_counters[] JERRY_ATTR_CONST_DATA =
|
||||
{
|
||||
0x0016, 0x0001, 0x0000
|
||||
0x000c, 0x0001, 0x0000
|
||||
};
|
||||
|
||||
/* The remaining uppercase conversions. The uppercase variant can be one-to-three character long. */
|
||||
static const uint16_t lit_unicode_upper_case_conversions[] JERRY_ATTR_CONST_DATA =
|
||||
{
|
||||
0x00b5, 0x039c, 0x0130, 0x0130, 0x0131, 0x0049, 0x017f, 0x0053, 0x01c5, 0x01c4,
|
||||
0x01c8, 0x01c7, 0x01cb, 0x01ca, 0x01f2, 0x01f1, 0x0345, 0x0399, 0x03c2, 0x03a3,
|
||||
0x03d0, 0x0392, 0x03d1, 0x0398, 0x03d5, 0x03a6, 0x03d6, 0x03a0, 0x03f0, 0x039a,
|
||||
0x03f1, 0x03a1, 0x03f5, 0x0395, 0x1c80, 0x0412, 0x1c81, 0x0414, 0x1c82, 0x041e,
|
||||
0x1c83, 0x0421, 0x1c84, 0x0422, 0x1c85, 0x0422, 0x1c86, 0x042a, 0x1c87, 0x0462,
|
||||
0x1c88, 0xa64a, 0x1e9b, 0x1e60, 0x1fbe, 0x0399, 0x00df, 0x0053, 0x0053, 0x0149,
|
||||
0x02bc, 0x004e, 0x01f0, 0x004a, 0x030c, 0x0587, 0x0535, 0x0552, 0x1e96, 0x0048,
|
||||
0x0331, 0x1e97, 0x0054, 0x0308, 0x1e98, 0x0057, 0x030a, 0x1e99, 0x0059, 0x030a,
|
||||
0x1e9a, 0x0041, 0x02be, 0x1f50, 0x03a5, 0x0313, 0x1f87, 0x1f0f, 0x0399, 0x1f8f,
|
||||
0x1f0f, 0x0399, 0x1f97, 0x1f2f, 0x0399, 0x1f9f, 0x1f2f, 0x0399, 0x1fa7, 0x1f6f,
|
||||
0x0399, 0x1faf, 0x1f6f, 0x0399, 0x1fb2, 0x1fba, 0x0399, 0x1fb3, 0x0391, 0x0399,
|
||||
0x1fb4, 0x0386, 0x0399, 0x1fb6, 0x0391, 0x0342, 0x1fbc, 0x0391, 0x0399, 0x1fc2,
|
||||
0x1fca, 0x0399, 0x1fc3, 0x0397, 0x0399, 0x1fc4, 0x0389, 0x0399, 0x1fc6, 0x0397,
|
||||
0x0342, 0x1fcc, 0x0397, 0x0399, 0x1fd6, 0x0399, 0x0342, 0x1fe4, 0x03a1, 0x0313,
|
||||
0x1fe6, 0x03a5, 0x0342, 0x1ff2, 0x1ffa, 0x0399, 0x1ff3, 0x03a9, 0x0399, 0x1ff4,
|
||||
0x038f, 0x0399, 0x1ff6, 0x03a9, 0x0342, 0x1ffc, 0x03a9, 0x0399, 0xfb00, 0x0046,
|
||||
0x0046, 0xfb01, 0x0046, 0x0049, 0xfb02, 0x0046, 0x004c, 0xfb05, 0x0053, 0x0054,
|
||||
0xfb06, 0x0053, 0x0054, 0xfb13, 0x0544, 0x0546, 0xfb14, 0x0544, 0x0535, 0xfb15,
|
||||
0x0544, 0x053b, 0xfb16, 0x054e, 0x0546, 0xfb17, 0x0544, 0x053d, 0x0390, 0x0399,
|
||||
0x0308, 0x0301, 0x03b0, 0x03a5, 0x0308, 0x0301, 0x1f52, 0x03a5, 0x0313, 0x0300,
|
||||
0x1f54, 0x03a5, 0x0313, 0x0301, 0x1f56, 0x03a5, 0x0313, 0x0342, 0x1fb7, 0x0391,
|
||||
0x0342, 0x0399, 0x1fc7, 0x0397, 0x0342, 0x0399, 0x1fd2, 0x0399, 0x0308, 0x0300,
|
||||
0x1fd3, 0x0399, 0x0308, 0x0301, 0x1fd7, 0x0399, 0x0308, 0x0342, 0x1fe2, 0x03a5,
|
||||
0x0308, 0x0300, 0x1fe3, 0x03a5, 0x0308, 0x0301, 0x1fe7, 0x03a5, 0x0308, 0x0342,
|
||||
0x1ff7, 0x03a9, 0x0342, 0x0399, 0xfb03, 0x0046, 0x0046, 0x0049, 0xfb04, 0x0046,
|
||||
0x0046, 0x004c
|
||||
0x00b5, 0x039c, 0x0131, 0x0049, 0x017f, 0x0053, 0x01c5, 0x01c4, 0x01c8, 0x01c7,
|
||||
0x01cb, 0x01ca, 0x01f2, 0x01f1, 0x0345, 0x0399, 0x03c2, 0x03a3, 0x03d0, 0x0392,
|
||||
0x03d1, 0x0398, 0x03d5, 0x03a6, 0x03d6, 0x03a0, 0x03f0, 0x039a, 0x03f1, 0x03a1,
|
||||
0x03f5, 0x0395, 0x1c80, 0x0412, 0x1c81, 0x0414, 0x1c82, 0x041e, 0x1c83, 0x0421,
|
||||
0x1c84, 0x0422, 0x1c85, 0x0422, 0x1c86, 0x042a, 0x1c87, 0x0462, 0x1c88, 0xa64a,
|
||||
0x1e9b, 0x1e60, 0x1fbe, 0x0399, 0x00df, 0x0053, 0x0053, 0x0149, 0x02bc, 0x004e,
|
||||
0x01f0, 0x004a, 0x030c, 0x0587, 0x0535, 0x0552, 0x1e96, 0x0048, 0x0331, 0x1e97,
|
||||
0x0054, 0x0308, 0x1e98, 0x0057, 0x030a, 0x1e99, 0x0059, 0x030a, 0x1e9a, 0x0041,
|
||||
0x02be, 0x1f50, 0x03a5, 0x0313, 0x1f87, 0x1f0f, 0x0399, 0x1f8f, 0x1f0f, 0x0399,
|
||||
0x1f97, 0x1f2f, 0x0399, 0x1f9f, 0x1f2f, 0x0399, 0x1fa7, 0x1f6f, 0x0399, 0x1faf,
|
||||
0x1f6f, 0x0399, 0x1fb2, 0x1fba, 0x0399, 0x1fb3, 0x0391, 0x0399, 0x1fb4, 0x0386,
|
||||
0x0399, 0x1fb6, 0x0391, 0x0342, 0x1fbc, 0x0391, 0x0399, 0x1fc2, 0x1fca, 0x0399,
|
||||
0x1fc3, 0x0397, 0x0399, 0x1fc4, 0x0389, 0x0399, 0x1fc6, 0x0397, 0x0342, 0x1fcc,
|
||||
0x0397, 0x0399, 0x1fd6, 0x0399, 0x0342, 0x1fe4, 0x03a1, 0x0313, 0x1fe6, 0x03a5,
|
||||
0x0342, 0x1ff2, 0x1ffa, 0x0399, 0x1ff3, 0x03a9, 0x0399, 0x1ff4, 0x038f, 0x0399,
|
||||
0x1ff6, 0x03a9, 0x0342, 0x1ffc, 0x03a9, 0x0399, 0xfb00, 0x0046, 0x0046, 0xfb01,
|
||||
0x0046, 0x0049, 0xfb02, 0x0046, 0x004c, 0xfb05, 0x0053, 0x0054, 0xfb06, 0x0053,
|
||||
0x0054, 0xfb13, 0x0544, 0x0546, 0xfb14, 0x0544, 0x0535, 0xfb15, 0x0544, 0x053b,
|
||||
0xfb16, 0x054e, 0x0546, 0xfb17, 0x0544, 0x053d, 0x0390, 0x0399, 0x0308, 0x0301,
|
||||
0x03b0, 0x03a5, 0x0308, 0x0301, 0x1f52, 0x03a5, 0x0313, 0x0300, 0x1f54, 0x03a5,
|
||||
0x0313, 0x0301, 0x1f56, 0x03a5, 0x0313, 0x0342, 0x1fb7, 0x0391, 0x0342, 0x0399,
|
||||
0x1fc7, 0x0397, 0x0342, 0x0399, 0x1fd2, 0x0399, 0x0308, 0x0300, 0x1fd3, 0x0399,
|
||||
0x0308, 0x0301, 0x1fd7, 0x0399, 0x0308, 0x0342, 0x1fe2, 0x03a5, 0x0308, 0x0300,
|
||||
0x1fe3, 0x03a5, 0x0308, 0x0301, 0x1fe7, 0x03a5, 0x0308, 0x0342, 0x1ff7, 0x03a9,
|
||||
0x0342, 0x0399, 0xfb03, 0x0046, 0x0046, 0x0049, 0xfb04, 0x0046, 0x0046, 0x004c
|
||||
};
|
||||
|
||||
/* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. */
|
||||
static const uint8_t lit_unicode_upper_case_conversion_counters[] JERRY_ATTR_CONST_DATA =
|
||||
{
|
||||
0x001c, 0x002c, 0x0010
|
||||
0x001b, 0x002c, 0x0010
|
||||
};
|
||||
|
||||
@@ -0,0 +1,65 @@
|
||||
/* Copyright JS Foundation and other contributors, http://js.foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* This file is automatically generated by the gen-unicode.py script
|
||||
* from the CaseFolding.txt file. Do not edit! */
|
||||
|
||||
/**
|
||||
* Character interval starting points for folding_skip_to_lower.
|
||||
*/
|
||||
static const uint16_t lit_unicode_folding_skip_to_lower_interval_starts[] JERRY_ATTR_CONST_DATA =
|
||||
{
|
||||
0x13a0, 0x13f8, 0xab70
|
||||
};
|
||||
|
||||
/**
|
||||
* Character interval lengths for folding_skip_to_lower.
|
||||
*/
|
||||
static const uint8_t lit_unicode_folding_skip_to_lower_interval_lengths[] JERRY_ATTR_CONST_DATA =
|
||||
{
|
||||
0x0055, 0x0005, 0x004f
|
||||
};
|
||||
|
||||
/**
|
||||
* Non-interval characters for folding_skip_to_lower.
|
||||
*/
|
||||
static const uint16_t lit_unicode_folding_skip_to_lower_chars[] JERRY_ATTR_CONST_DATA =
|
||||
{
|
||||
0x0130
|
||||
};
|
||||
|
||||
/**
|
||||
* Character interval starting points for folding_to_upper.
|
||||
*/
|
||||
static const uint16_t lit_unicode_folding_to_upper_interval_starts[] JERRY_ATTR_CONST_DATA =
|
||||
{
|
||||
0x03d0, 0x03d5, 0x03f0, 0x13f8, 0x1c80, 0xab70
|
||||
};
|
||||
|
||||
/**
|
||||
* Character interval lengths for folding_to_upper.
|
||||
*/
|
||||
static const uint8_t lit_unicode_folding_to_upper_interval_lengths[] JERRY_ATTR_CONST_DATA =
|
||||
{
|
||||
0x0001, 0x0001, 0x0001, 0x0005, 0x0008, 0x004f
|
||||
};
|
||||
|
||||
/**
|
||||
* Non-interval characters for folding_to_upper.
|
||||
*/
|
||||
static const uint16_t lit_unicode_folding_to_upper_chars[] JERRY_ATTR_CONST_DATA =
|
||||
{
|
||||
0x00b5, 0x017f, 0x0345, 0x03c2, 0x03f5, 0x1e9b, 0x1fbe
|
||||
};
|
||||
Reference in New Issue
Block a user