diff --git a/jerry-core/lit/lit-char-helpers.c b/jerry-core/lit/lit-char-helpers.c index 1cc15fa52..8c42f4aa8 100644 --- a/jerry-core/lit/lit-char-helpers.c +++ b/jerry-core/lit/lit-char-helpers.c @@ -136,9 +136,11 @@ lit_char_is_white_space (ecma_char_t c) /**< code unit */ { return (c == LIT_CHAR_NBSP || c == LIT_CHAR_BOM - || (c >= unicode_separator_char_interv_sps[0] - && c <= unicode_separator_char_interv_sps[0] + unicode_separator_char_interv_lens[0]) - || search_char_in_char_array (c, unicode_separator_chars, NUM_OF_ELEMENTS (unicode_separator_chars))); + || (c >= jerry_unicode_separator_char_interval_sps[0] + && c <= jerry_unicode_separator_char_interval_sps[0] + jerry_unicode_separator_char_interval_lengths[0]) + || search_char_in_char_array (c, + jerry_unicode_separator_chars, + NUM_OF_ELEMENTS (jerry_unicode_separator_chars))); } } /* lit_char_is_white_space */ @@ -178,9 +180,11 @@ lit_char_is_line_terminator (ecma_char_t c) /**< code unit */ static bool lit_char_is_unicode_letter (ecma_char_t c) /**< code unit */ { - return (search_char_in_interval_array (c, unicode_letter_interv_sps, unicode_letter_interv_lens, - NUM_OF_ELEMENTS (unicode_letter_interv_sps)) - || search_char_in_char_array (c, unicode_letter_chars, NUM_OF_ELEMENTS (unicode_letter_chars))); + return (search_char_in_interval_array (c, + jerry_unicode_letter_interval_sps, + jerry_unicode_letter_interval_lengths, + NUM_OF_ELEMENTS (jerry_unicode_letter_interval_sps)) + || search_char_in_char_array (c, jerry_unicode_letter_chars, NUM_OF_ELEMENTS (jerry_unicode_letter_chars))); } /* lit_char_is_unicode_letter */ /** @@ -200,11 +204,13 @@ lit_char_is_unicode_letter (ecma_char_t c) /**< code unit */ static bool lit_char_is_unicode_non_letter_ident_part (ecma_char_t c) /**< code unit */ { - return (search_char_in_interval_array (c, unicode_non_letter_ident_part_interv_sps, - unicode_non_letter_ident_part_interv_lens, - NUM_OF_ELEMENTS (unicode_non_letter_ident_part_interv_sps)) - || 
search_char_in_char_array (c, unicode_non_letter_ident_part_chars, - NUM_OF_ELEMENTS (unicode_non_letter_ident_part_chars))); + return (search_char_in_interval_array (c, + jerry_unicode_non_letter_ident_part_interval_sps, + jerry_unicode_non_letter_ident_part_interval_lengths, + NUM_OF_ELEMENTS (jerry_unicode_non_letter_ident_part_interval_sps)) + || search_char_in_char_array (c, + jerry_unicode_non_letter_ident_part_chars, + NUM_OF_ELEMENTS (jerry_unicode_non_letter_ident_part_chars))); } /* lit_char_is_unicode_non_letter_ident_part */ /** diff --git a/jerry-core/lit/lit-unicode-ranges.inc.h b/jerry-core/lit/lit-unicode-ranges.inc.h index ee142aa21..b322357ac 100644 --- a/jerry-core/lit/lit-unicode-ranges.inc.h +++ b/jerry-core/lit/lit-unicode-ranges.inc.h @@ -12,169 +12,152 @@ * See the License for the specific language governing permissions and * limitations under the License. * - * - * Unicode characters and ranges generated by tools/print-unicode-ranges.sh - * from UnicodeData-3.0.0.txt. - * See also: - * http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt - * http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html + * This file is automatically generated by the unicode_ranges.py script + * from UnicodeData-3.0.0.txt. Do not edit! */ -#ifndef LIT_UNICODE_RANGES_INC_H_ -#define LIT_UNICODE_RANGES_INC_H_ - /** * Character interval starting points for the unicode letters. 
* - * The characters covered by these intervalse are from + * The characters covered by these intervals are from * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl */ -static const uint16_t unicode_letter_interv_sps[] JERRY_CONST_DATA = +static const uint16_t jerry_unicode_letter_interval_sps[] JERRY_CONST_DATA = { -/* - * these are handled separetely - * 0x0041, len 25 - * 0x0061, len 25 - */ - 0x00C0, 0x00D8, 0XF8, 0X1F8, 0x0222, 0x0250, 0x02B0, 0x02BB, - 0x02D0, 0x02E0, 0x0388, 0x038E, 0x03A3, 0x03D0, 0x03DA, 0x0400, 0x048C, 0x04C7, - 0x04CB, 0x04D0, 0x04F8, 0x0531, 0x0561, 0x05D0, 0x05F0, 0x0621, 0x0640, 0x0671, - 0x06E5, 0x06FA, 0x0712, 0x0780, 0x0905, 0x0958, 0x0985, 0x098F, 0x0993, 0x09AA, - 0x09B6, 0x09DC, 0x09DF, 0x09F0, 0x0A05, 0x0A0F, 0x0A13, 0x0A2A, 0x0A32, 0x0A35, - 0x0A38, 0x0A59, 0x0A72, 0x0A85, 0x0A8F, 0x0A93, 0x0AAA, 0x0AB2, 0x0AB5, 0x0B05, - 0x0B0F, 0x0B13, 0x0B2A, 0x0B32, 0x0B36, 0x0B5C, 0x0B5F, 0x0B85, 0x0B8E, 0x0B92, - 0x0B99, 0x0B9E, 0x0BA3, 0x0BA8, 0x0BAE, 0x0BB7, 0x0C05, 0x0C0E, 0x0C12, 0x0C2A, - 0x0C35, 0x0C60, 0x0C85, 0x0C8E, 0x0C92, 0x0CAA, 0x0CB5, 0x0CE0, 0x0D05, 0x0D0E, - 0x0D12, 0x0D2A, 0x0D60, 0x0D85, 0x0D9A, 0x0DB3, 0x0DC0, 0x0E01, 0x0E32, 0x0E40, - 0x0E81, 0x0E87, 0x0E94, 0x0E99, 0x0EA1, 0x0EAA, 0x0EAD, 0x0EB2, 0x0EC0, 0x0EDC, - 0x0F40, 0x0F49, 0x0F88, 0x1000, 0x1023, 0x1029, 0x1050, 0x10A0, 0x10D0, 0x1100, - 0x115F, 0x11A8, 0x1200, 0x1208, 0x124A, 0x1250, 0x125A, 0x1260, 0x128A, 0x1290, - 0x12B2, 0x12B8, 0x12C2, 0x12C8, 0x12D0, 0x12D8, 0x12F0, 0x1312, 0x1318, 0x1320, - 0x1348, 0x13A0, 0X1401, 0X1501, 0X1601, 0x166F, 0x1681, 0x16A0, 0x1780, 0x1820, - 0x1880, 0x1E00, 0x1EA0, 0x1F00, 0x1F18, 0x1F20, 0x1F48, 0x1F50, 0x1F5F, 0x1F80, - 0x1FB6, 0x1FC2, 0x1FC6, 0x1FD0, 0x1FD6, 0x1FE0, 0x1FF2, 0x1FF6, 0x210A, 0x2119, - 0x212A, 0x212F, 0x2133, 0x2160, 0x3005, 0x3021, 0x3031, 0x3038, 0x3041, 0x309D, - 0x30A1, 0x30FC, 0x3105, 0x3131, 0x31A0, 0XA000, 0XA100, 0XA200, 0XA300, 0XA400, - 0XF900, 0XFA00, 0xFB00, 0xFB13, 0xFB1F, 0xFB2A, 
0xFB38, 0xFB40, 0xFB43, 0xFB46, - 0XFBD3, 0XFCD3, 0xFD50, 0xFD92, 0xFDF0, 0xFE70, 0xFE76, 0xFF21, 0xFF41, 0xFF66, - 0xFFC2, 0xFFCA, 0xFFD2, 0xFFDA + 0x00c0, 0x00d8, 0x00f8, 0x01f8, 0x0222, 0x0250, 0x02b0, 0x02bb, 0x02d0, 0x02e0, + 0x0388, 0x038e, 0x03a3, 0x03d0, 0x03da, 0x0400, 0x048c, 0x04c7, 0x04cb, 0x04d0, + 0x04f8, 0x0531, 0x0561, 0x05d0, 0x05f0, 0x0621, 0x0640, 0x0671, 0x06e5, 0x06fa, + 0x0712, 0x0780, 0x0905, 0x0958, 0x0985, 0x098f, 0x0993, 0x09aa, 0x09b6, 0x09dc, + 0x09df, 0x09f0, 0x0a05, 0x0a0f, 0x0a13, 0x0a2a, 0x0a32, 0x0a35, 0x0a38, 0x0a59, + 0x0a72, 0x0a85, 0x0a8f, 0x0a93, 0x0aaa, 0x0ab2, 0x0ab5, 0x0b05, 0x0b0f, 0x0b13, + 0x0b2a, 0x0b32, 0x0b36, 0x0b5c, 0x0b5f, 0x0b85, 0x0b8e, 0x0b92, 0x0b99, 0x0b9e, + 0x0ba3, 0x0ba8, 0x0bae, 0x0bb7, 0x0c05, 0x0c0e, 0x0c12, 0x0c2a, 0x0c35, 0x0c60, + 0x0c85, 0x0c8e, 0x0c92, 0x0caa, 0x0cb5, 0x0ce0, 0x0d05, 0x0d0e, 0x0d12, 0x0d2a, + 0x0d60, 0x0d85, 0x0d9a, 0x0db3, 0x0dc0, 0x0e01, 0x0e32, 0x0e40, 0x0e81, 0x0e87, + 0x0e94, 0x0e99, 0x0ea1, 0x0eaa, 0x0ead, 0x0eb2, 0x0ec0, 0x0edc, 0x0f40, 0x0f49, + 0x0f88, 0x1000, 0x1023, 0x1029, 0x1050, 0x10a0, 0x10d0, 0x1100, 0x115f, 0x11a8, + 0x1200, 0x1208, 0x124a, 0x1250, 0x125a, 0x1260, 0x128a, 0x1290, 0x12b2, 0x12b8, + 0x12c2, 0x12c8, 0x12d0, 0x12d8, 0x12f0, 0x1312, 0x1318, 0x1320, 0x1348, 0x13a0, + 0x1401, 0x1501, 0x1601, 0x166f, 0x1681, 0x16a0, 0x1780, 0x1820, 0x1880, 0x1e00, + 0x1ea0, 0x1f00, 0x1f18, 0x1f20, 0x1f48, 0x1f50, 0x1f5f, 0x1f80, 0x1fb6, 0x1fc2, + 0x1fc6, 0x1fd0, 0x1fd6, 0x1fe0, 0x1ff2, 0x1ff6, 0x210a, 0x2119, 0x212a, 0x212f, + 0x2133, 0x2160, 0x3005, 0x3021, 0x3031, 0x3038, 0x3041, 0x309d, 0x30a1, 0x30fc, + 0x3105, 0x3131, 0x31a0, 0xa000, 0xa100, 0xa200, 0xa300, 0xa400, 0xf900, 0xfa00, + 0xfb00, 0xfb13, 0xfb1f, 0xfb2a, 0xfb38, 0xfb40, 0xfb43, 0xfb46, 0xfbd3, 0xfcd3, + 0xfd50, 0xfd92, 0xfdf0, 0xfe70, 0xfe76, 0xff21, 0xff41, 0xff66, 0xffc2, 0xffca, + 0xffd2, 0xffda }; /** * Character lengths for the unicode letters. 
* - * The characters covered by these intervalse are from + * The characters covered by these intervals are from * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl */ -static const uint8_t unicode_letter_interv_lens[] JERRY_CONST_DATA = +static const uint8_t jerry_unicode_letter_interval_lengths[] JERRY_CONST_DATA = { - 22, 30, 255, 39, 17, 93, 8, 6, - 1, 4, 2, 19, 43, 7, 25, 129, 56, 1, - 1, 37, 1, 37, 38, 26, 2, 25, 10, 98, - 1, 2, 26, 37, 52, 9, 7, 1, 21, 6, - 3, 1, 2, 1, 5, 1, 21, 6, 1, 1, - 1, 3, 2, 6, 2, 21, 6, 1, 4, 7, - 1, 21, 6, 1, 3, 1, 2, 5, 2, 3, - 1, 1, 1, 2, 7, 2, 7, 2, 22, 9, - 4, 1, 7, 2, 22, 9, 4, 1, 7, 2, - 22, 15, 1, 17, 23, 8, 6, 47, 1, 6, - 1, 1, 3, 6, 2, 1, 3, 1, 4, 1, - 7, 33, 3, 33, 4, 1, 5, 37, 38, 89, - 67, 81, 6, 62, 3, 6, 3, 38, 3, 30, - 3, 6, 3, 6, 6, 22, 30, 3, 6, 38, - 18, 84, 255, 255, 107, 7, 25, 74, 51, 87, - 40, 155, 89, 21, 5, 37, 5, 7, 30, 52, - 6, 2, 6, 3, 5, 12, 2, 6, 9, 4, - 3, 2, 6, 35, 2, 8, 4, 2, 83, 1, - 89, 2, 39, 93, 23, 255, 255, 255, 255, 140, - 255, 45, 6, 4, 9, 12, 4, 1, 1, 107, - 255, 106, 63, 53, 11, 2, 134, 25, 25, 88, - 5, 5, 5, 2 + 0x0016, 0x001e, 0x00ff, 0x0027, 0x0011, 0x005d, 0x0008, 0x0006, 0x0001, 0x0004, + 0x0002, 0x0013, 0x002b, 0x0007, 0x0019, 0x0081, 0x0038, 0x0001, 0x0001, 0x0025, + 0x0001, 0x0025, 0x0026, 0x001a, 0x0002, 0x0019, 0x000a, 0x0062, 0x0001, 0x0002, + 0x001a, 0x0025, 0x0034, 0x0009, 0x0007, 0x0001, 0x0015, 0x0006, 0x0003, 0x0001, + 0x0002, 0x0001, 0x0005, 0x0001, 0x0015, 0x0006, 0x0001, 0x0001, 0x0001, 0x0003, + 0x0002, 0x0006, 0x0002, 0x0015, 0x0006, 0x0001, 0x0004, 0x0007, 0x0001, 0x0015, + 0x0006, 0x0001, 0x0003, 0x0001, 0x0002, 0x0005, 0x0002, 0x0003, 0x0001, 0x0001, + 0x0001, 0x0002, 0x0007, 0x0002, 0x0007, 0x0002, 0x0016, 0x0009, 0x0004, 0x0001, + 0x0007, 0x0002, 0x0016, 0x0009, 0x0004, 0x0001, 0x0007, 0x0002, 0x0016, 0x000f, + 0x0001, 0x0011, 0x0017, 0x0008, 0x0006, 0x002f, 0x0001, 0x0006, 0x0001, 0x0001, + 0x0003, 0x0006, 0x0002, 0x0001, 0x0003, 0x0001, 0x0004, 0x0001, 
0x0007, 0x0021, + 0x0003, 0x0021, 0x0004, 0x0001, 0x0005, 0x0025, 0x0026, 0x0059, 0x0043, 0x0051, + 0x0006, 0x003e, 0x0003, 0x0006, 0x0003, 0x0026, 0x0003, 0x001e, 0x0003, 0x0006, + 0x0003, 0x0006, 0x0006, 0x0016, 0x001e, 0x0003, 0x0006, 0x0026, 0x0012, 0x0054, + 0x00ff, 0x00ff, 0x006b, 0x0007, 0x0019, 0x004a, 0x0033, 0x0057, 0x0028, 0x009b, + 0x0059, 0x0015, 0x0005, 0x0025, 0x0005, 0x0007, 0x001e, 0x0034, 0x0006, 0x0002, + 0x0006, 0x0003, 0x0005, 0x000c, 0x0002, 0x0006, 0x0009, 0x0004, 0x0003, 0x0002, + 0x0006, 0x0023, 0x0002, 0x0008, 0x0004, 0x0002, 0x0053, 0x0001, 0x0059, 0x0002, + 0x0027, 0x005d, 0x0017, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x008c, 0x00ff, 0x002d, + 0x0006, 0x0004, 0x0009, 0x000c, 0x0004, 0x0001, 0x0001, 0x006b, 0x00ff, 0x006a, + 0x003f, 0x0035, 0x000b, 0x0002, 0x0086, 0x0019, 0x0019, 0x0058, 0x0005, 0x0005, + 0x0005, 0x0002 }; /** * Those unicode letter characters that are not inside any of - * the intervals specified in unicode_letter_intervals array. + * the intervals specified in jerry_unicode_letter_interval_sps array. 
* * The characters are from the following Unicode categories: * Lu, Ll, Lt, Lm, Lo, Nl */ -static const uint16_t unicode_letter_chars[] JERRY_CONST_DATA = +static const uint16_t jerry_unicode_letter_chars[] JERRY_CONST_DATA = { - 0x00AA, 0x00B5, 0x00BA, 0x02EE, 0x037A, 0x0386, 0x038C, 0x0559, 0x06D5, 0x0710, - 0x093D, 0x0950, 0x09B2, 0x0A5E, 0x0A8D, 0x0ABD, 0x0AD0, 0x0AE0, 0x0B3D, 0x0B9C, - 0x0CDE, 0x0DBD, 0x0E84, 0x0E8A, 0x0E8D, 0x0EA5, 0x0EA7, 0x0EBD, 0x0EC6, 0x0F00, - 0x1248, 0x1258, 0x1288, 0x12B0, 0x12C0, 0x1310, 0x1F59, 0x1F5B, 0x1F5D, 0x1FBE, - 0x207F, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x3400, 0x4DB5, 0x4E00, - 0x9FA5, 0xAC00, 0xD7A3, 0xFB1D, 0xFB3E, 0xFE74 + 0x00aa, 0x00b5, 0x00ba, 0x02ee, 0x037a, 0x0386, 0x038c, 0x0559, 0x06d5, 0x0710, + 0x093d, 0x0950, 0x09b2, 0x0a5e, 0x0a8d, 0x0abd, 0x0ad0, 0x0ae0, 0x0b3d, 0x0b9c, + 0x0cde, 0x0dbd, 0x0e84, 0x0e8a, 0x0e8d, 0x0ea5, 0x0ea7, 0x0ebd, 0x0ec6, 0x0f00, + 0x1248, 0x1258, 0x1288, 0x12b0, 0x12c0, 0x1310, 0x1f59, 0x1f5b, 0x1f5d, 0x1fbe, + 0x207f, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x3400, 0x4db5, 0x4e00, + 0x9fa5, 0xac00, 0xd7a3, 0xfb1d, 0xfb3e, 0xfe74 }; /** * Character interval starting points for non-letter character * that can be used as a non-first character of an identifier. 
* - * The characters covered by these intervalse are from + * The characters covered by these intervals are from * the following Unicode categories: Nd, Mn, Mc, Pc */ -static const uint16_t unicode_non_letter_ident_part_interv_sps[] JERRY_CONST_DATA = +static const uint16_t jerry_unicode_non_letter_ident_part_interval_sps[] JERRY_CONST_DATA = { -/* - * decimal digits: handled separately - * 0x0030, len: 9 - */ - 0x0300, 0x0360, 0x0483, 0x0591, 0x05A3, 0x05BB, 0x05C1, 0x064B, 0x0660, - 0x06D6, 0x06DF, 0x06E7, 0x06EA, 0x06F0, 0x0730, 0x07A6, 0x0901, 0x093E, 0x0951, - 0x0962, 0x0966, 0x0981, 0x09BE, 0x09C7, 0x09CB, 0x09E2, 0x09E6, 0x0A3E, 0x0A47, - 0x0A4B, 0x0A66, 0x0A81, 0x0ABE, 0x0AC7, 0x0ACB, 0x0AE6, 0x0B01, 0x0B3E, 0x0B47, - 0x0B4B, 0x0B56, 0x0B66, 0x0B82, 0x0BBE, 0x0BC6, 0x0BCA, 0x0BE7, 0x0C01, 0x0C3E, - 0x0C46, 0x0C4A, 0x0C55, 0x0C66, 0x0C82, 0x0CBE, 0x0CC6, 0x0CCA, 0x0CD5, 0x0CE6, - 0x0D02, 0x0D3E, 0x0D46, 0x0D4A, 0x0D66, 0x0D82, 0x0DCF, 0x0DD8, 0x0DF2, 0x0E34, - 0x0E47, 0x0E50, 0x0EB4, 0x0EBB, 0x0EC8, 0x0ED0, 0x0F18, 0x0F20, 0x0F3E, 0x0F71, - 0x0F86, 0x0F90, 0x0F99, 0x102C, 0x1036, 0x1040, 0x1056, 0x1369, 0x17B4, 0x17E0, - 0x1810, 0x203F, 0x20D0, 0x302A, 0x3099, 0xFE20, 0xFE33, 0xFE4D, 0xFF10 + 0x0300, 0x0360, 0x0483, 0x0591, 0x05a3, 0x05bb, 0x05c1, 0x064b, 0x0660, 0x06d6, + 0x06df, 0x06e7, 0x06ea, 0x06f0, 0x0730, 0x07a6, 0x0901, 0x093e, 0x0951, 0x0962, + 0x0966, 0x0981, 0x09be, 0x09c7, 0x09cb, 0x09e2, 0x09e6, 0x0a3e, 0x0a47, 0x0a4b, + 0x0a66, 0x0a81, 0x0abe, 0x0ac7, 0x0acb, 0x0ae6, 0x0b01, 0x0b3e, 0x0b47, 0x0b4b, + 0x0b56, 0x0b66, 0x0b82, 0x0bbe, 0x0bc6, 0x0bca, 0x0be7, 0x0c01, 0x0c3e, 0x0c46, + 0x0c4a, 0x0c55, 0x0c66, 0x0c82, 0x0cbe, 0x0cc6, 0x0cca, 0x0cd5, 0x0ce6, 0x0d02, + 0x0d3e, 0x0d46, 0x0d4a, 0x0d66, 0x0d82, 0x0dcf, 0x0dd8, 0x0df2, 0x0e34, 0x0e47, + 0x0e50, 0x0eb4, 0x0ebb, 0x0ec8, 0x0ed0, 0x0f18, 0x0f20, 0x0f3e, 0x0f71, 0x0f86, + 0x0f90, 0x0f99, 0x102c, 0x1036, 0x1040, 0x1056, 0x1369, 0x17b4, 0x17e0, 0x1810, + 0x203f, 0x20d0, 0x302a, 0x3099, 0xfe20, 
0xfe33, 0xfe4d, 0xff10 }; /** * Character interval lengths for non-letter character * that can be used as a non-first character of an identifier. * - * The characters covered by these intervalse are from + * The characters covered by these intervals are from * the following Unicode categories: Nd, Mn, Mc, Pc */ -static const uint8_t unicode_non_letter_ident_part_interv_lens[] = +static const uint8_t jerry_unicode_non_letter_ident_part_interval_lengths[] JERRY_CONST_DATA = { - 78, 2, 3, 16, 22, 2, 1, 10, 9, - 6, 5, 1, 3, 9, 26, 10, 2, 15, 3, - 1, 9, 2, 6, 1, 2, 1, 9, 4, 1, - 2, 11, 2, 7, 2, 2, 9, 2, 5, 1, - 2, 1, 9, 1, 4, 2, 3, 8, 2, 6, - 2, 3, 1, 9, 1, 6, 2, 3, 1, 9, - 1, 5, 2, 3, 9, 1, 5, 7, 1, 6, - 7, 9, 5, 1, 5, 9, 1, 9, 1, 19, - 1, 7, 35, 6, 3, 9, 3, 8, 31, 9, - 9, 1, 12, 5, 1, 3, 1, 2, 9 + 0x004e, 0x0002, 0x0003, 0x0010, 0x0016, 0x0002, 0x0001, 0x000a, 0x0009, 0x0006, + 0x0005, 0x0001, 0x0003, 0x0009, 0x001a, 0x000a, 0x0002, 0x000f, 0x0003, 0x0001, + 0x0009, 0x0002, 0x0006, 0x0001, 0x0002, 0x0001, 0x0009, 0x0004, 0x0001, 0x0002, + 0x000b, 0x0002, 0x0007, 0x0002, 0x0002, 0x0009, 0x0002, 0x0005, 0x0001, 0x0002, + 0x0001, 0x0009, 0x0001, 0x0004, 0x0002, 0x0003, 0x0008, 0x0002, 0x0006, 0x0002, + 0x0003, 0x0001, 0x0009, 0x0001, 0x0006, 0x0002, 0x0003, 0x0001, 0x0009, 0x0001, + 0x0005, 0x0002, 0x0003, 0x0009, 0x0001, 0x0005, 0x0007, 0x0001, 0x0006, 0x0007, + 0x0009, 0x0005, 0x0001, 0x0005, 0x0009, 0x0001, 0x0009, 0x0001, 0x0013, 0x0001, + 0x0007, 0x0023, 0x0006, 0x0003, 0x0009, 0x0003, 0x0008, 0x001f, 0x0009, 0x0009, + 0x0001, 0x000c, 0x0005, 0x0001, 0x0003, 0x0001, 0x0002, 0x0009 }; /** * Those non-letter characters that can be used as a non-first * character of an identifier and not included in any of the intervals - * specified in unicode_non_letter_ident_part_intervals array. + * specified in jerry_unicode_non_letter_ident_part_interval_sps array. 
* * The characters are from the following Unicode categories: * Nd, Mn, Mc, Pc */ -static const uint16_t unicode_non_letter_ident_part_chars[] = +static const uint16_t jerry_unicode_non_letter_ident_part_chars[] JERRY_CONST_DATA = { - 0x005F, 0x05BF, 0x05C4, 0x0670, 0x0711, 0x093C, 0x09BC, 0x09D7, 0x0A02, 0x0A3C, - 0x0ABC, 0x0B3C, 0x0BD7, 0x0D57, 0x0DCA, 0x0DD6, 0x0E31, 0x0EB1, 0x0F35, 0x0F37, - 0x0F39, 0x0FC6, 0x18A9, 0x20E1, 0x30FB, 0xFB1E, 0xFF3F, 0xFF65 + 0x05bf, 0x05c4, 0x0670, 0x0711, 0x093c, 0x09bc, 0x09d7, 0x0a02, 0x0a3c, 0x0abc, + 0x0b3c, 0x0bd7, 0x0d57, 0x0dca, 0x0dd6, 0x0e31, 0x0eb1, 0x0f35, 0x0f37, 0x0f39, + 0x0fc6, 0x18a9, 0x20e1, 0x30fb, 0xfb1e, 0xff3f, 0xff65 }; - /** - * Unicode separator character interval strting points from Unicode category: Zs + * Unicode separator character interval starting points from Unicode category: Zs */ -static const uint16_t unicode_separator_char_interv_sps[] = +static const uint16_t jerry_unicode_separator_char_interval_sps[] JERRY_CONST_DATA = { 0x2000 }; @@ -182,29 +165,19 @@ static const uint16_t unicode_separator_char_interv_sps[] = /** * Unicode separator character interval lengths from Unicode category: Zs */ -static const uint8_t unicode_separator_char_interv_lens[] = +static const uint8_t jerry_unicode_separator_char_interval_lengths[] JERRY_CONST_DATA = { - 11 + 0x000b }; /** * Unicode separator characters that are not in the - * unicode_separator_char_intervals array. + * jerry_unicode_separator_char_intervals array. 
* * Unicode category: Zs */ -static const uint16_t unicode_separator_chars[] = +static const uint16_t jerry_unicode_separator_chars[] JERRY_CONST_DATA = { - /* - * these two chars are handled separatly @see lit_char_is_space_separator - * 0x0020, space - * 0x00A0, non-braking space - */ - 0x1680, \ - 0x180E, /* manually added */ \ - 0x202F, /* manually added */ \ - 0x205F, \ - 0x3000 + 0x1680, 0x180e, 0x202f, 0x205f, 0x3000 }; -#endif diff --git a/tools/print-unicode-ranges.sh b/tools/print-unicode-ranges.sh deleted file mode 100755 index b4e719f23..000000000 --- a/tools/print-unicode-ranges.sh +++ /dev/null @@ -1,186 +0,0 @@ -#!/bin/bash - -# Copyright JS Foundation and other contributors, http://js.foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# -# http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt -# - -# unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So -# letter: Lu Ll Lt Lm Lo Nl -# non-letter-indent-part: -# digit: Nd -# punctuation mark: Mn Mc -# connector punctuation: Pc -# separators: Zs - -if [ $# -le 4 ]; then - echo "useage: print-unicode-ranges.sh <-i y sp|y len|n> <-cat letters|non-let-indent-parts|separators>" - echo " -i: y sp - print interval starting points" - echo " y len - print interval lengths" - echo " n - print individual characters" - echo " -cat: whether print letters|non-let-indent-parts|separators category" - exit 1 -fi - -STARTING_POINT="len" - -UNICODE_DATA_PATH="$1" -shift - -while [ $# -gt 0 ]; do - if [ $1 == "-i" ]; then - shift - PRINT_INTERVALS="$1" - if [ $PRINT_INTERVALS == "y" ]; then - shift - STARTING_POINT="$1" - echo $STARTING_POINT - fi - elif [ $1 == "-cat" ]; then - shift - CATEGORY="$1" - echo $CATEGORY - fi - shift -done - -awk -v desired_category="$CATEGORY" \ -'BEGIN \ - { \ - FS=";"; OFS=";" \ - } \ - { \ - cat=$3; \ - if (desired_category == "letters" && (cat == "Lu" || cat == "Ll" || cat == "Lt" || cat == "Lm" || cat == "Lo" || cat == "Nl")) \ - { \ - print "0x"$1, $2, $3; \ - } \ - else if (desired_category == "non-let-indent-parts" && (cat == "Nd" || cat == "Mn" || cat == "Mc" || cat == "Pc")) \ - { \ - print "0x"$1, $2, $3; \ - } \ - else if (desired_category == "separators" && cat == "Zs") \ - { \ - print "0x"$1, $2, $3; \ - } \ - }' $UNICODE_DATA_PATH \ -| gawk --non-decimal-data -v print_intervals="$PRINT_INTERVALS" -v sp="$STARTING_POINT" \ -'BEGIN \ - { \ - FS = ";"; \ - OFS = ";"; \ - is_in_range = 0; \ - print_count = 0; \ - } \ - \ - function print_Nl() \ - { \ - ++print_count; \ - if (print_count == 10) \ - { \ - printf "\n"; \ - print_count = 0; \ - } \ - } \ - \ - function output_next_range () \ - { \ - if (range_begin != range_prev && print_intervals=="y") \ - { \ - 
i1 = strtonum(range_begin); \ - i2 = strtonum(range_prev); \ - len = i2 - i1; \ - # if the length of an interval is > 255 have to spilt it into 255-lenth ones - if (len > 255) \ - { \ - numOfSubintervals = (len / 255); # more precisely number of subintervals - 1 \ - for (i = 1; i <= numOfSubintervals; ++i) \ - { \ - if (sp == "sp") \ - { \ - printf "0X%X, ", i1; \ - print_Nl(); \ - } - else \ - { \ - printf "%d, ", 255; \ - print_Nl(); \ - } \ - i1 = i1 + 256; # next interval begins on the ending of the previous + 1 \ - } \ - if (sp == "sp") \ - { \ - printf "0X%X, ", i1; \ - print_Nl(); \ - } \ - else \ - { \ - printf "%d, ", len % 255 - (i-1); \ - print_Nl(); \ - } \ - } \ - else \ - { \ - if (sp == "sp") \ - { \ - printf "%s, ", range_begin; \ - print_Nl(); \ - } \ - else \ - { \ - printf "%d, ", len; \ - print_Nl(); \ - } \ - } \ - } \ - else if (range_begin == range_prev && print_intervals != "y")\ - { \ - printf "%s, ", range_begin; \ - print_Nl(); \ - } \ - } \ - \ - { \ - if (is_in_range == 0) \ - { \ - is_in_range = 1; \ - range_begin = $1; \ - range_prev = $1; \ - range_begin_name = $2; \ - range_prev_name = $2; \ - } \ - else \ - { \ - if (range_prev + 1 == $1) \ - { \ - range_prev = $1; \ - range_prev_name = $2 - } \ - else \ - { \ - output_next_range(); \ - range_begin = $1; \ - range_prev=$1; \ - range_begin_name = $2; \ - range_prev_name = $2; \ - } \ - } \ - } \ - \ -END \ - { \ - output_next_range(); \ - }' diff --git a/tools/unicode_ranges.py b/tools/unicode_ranges.py new file mode 100644 index 000000000..0b9f9f72b --- /dev/null +++ b/tools/unicode_ranges.py @@ -0,0 +1,328 @@ +#!/usr/bin/env python + +# Copyright JS Foundation and other contributors, http://js.foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt
+#
+
+# unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
+# letter:             Lu Ll Lt Lm Lo Nl
+# non-letter-indent-part:
+#   digit:                 Nd
+#   punctuation mark:      Mn Mc
+#   connector punctuation: Pc
+# separators:         Zs
+
+import argparse
+import bisect
+import csv
+import itertools
+import os
+import sys
+
+TOOLS_DIR = os.path.dirname(os.path.abspath(__file__))
+PROJECT_DIR = os.path.normpath(os.path.join(TOOLS_DIR, '..'))
+C_SOURCE_FILE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h')
+
+# The interval length tables are emitted as uint8_t, so one interval can
+# describe at most MAX_INTERVAL_LENGTH + 1 consecutive code points.
+MAX_INTERVAL_LENGTH = 255
+
+# Number of hex values emitted on one line of a generated table.
+VALUES_PER_LINE = 10
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('unicode_data',
+                    metavar='FILE',
+                    action='store',
+                    help='specify the unicode data file')
+
+parser.add_argument('--c-source',
+                    metavar='FILE',
+                    action='store',
+                    default=C_SOURCE_FILE,
+                    help='specify the output c source (default: %(default)s)')
+
+# Parsed command line arguments. Filled in by main(): parsing is deferred so
+# that importing this module does not consume sys.argv or terminate the
+# importing process.
+script_args = None
+
+
+def main():
+    """
+    Parse the command line, read the unicode data file and write the
+    generated C tables to the output file.
+
+    Exits with status 1 if the unicode data file is missing or not readable.
+    """
+
+    global script_args
+    script_args = parser.parse_args()
+
+    if not os.path.isfile(script_args.unicode_data) or not os.access(script_args.unicode_data, os.R_OK):
+        print('The %s file is missing or not readable!' % script_args.unicode_data)
+        sys.exit(1)
+
+    letters, non_letters, separators = read_categories()
+
+    letters_list = list(ranges(letters))
+    letter_interval_sps, letter_interval_lengths, letter_chars = split_list(letters_list)
+
+    non_letters_list = list(ranges(non_letters))
+    non_letter_interval_sps, non_letter_interval_lengths, non_letter_chars = split_list(non_letters_list)
+
+    separator_list = list(ranges(separators))
+    separator_interval_sps, separator_interval_lengths, separator_chars = split_list(separator_list)
+
+    source = GenSource()
+
+    letter_interval_sps_desc = """/**
+ * Character interval starting points for the unicode letters.
+ *
+ * The characters covered by these intervals are from
+ * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl
+ */"""
+    source.add_table("uint16_t",
+                     "unicode_letter_interval_sps",
+                     letter_interval_sps,
+                     letter_interval_sps_desc)
+
+    letter_interval_lengths_desc = """/**
+ * Character lengths for the unicode letters.
+ *
+ * The characters covered by these intervals are from
+ * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl
+ */"""
+    source.add_table("uint8_t",
+                     "unicode_letter_interval_lengths",
+                     letter_interval_lengths,
+                     letter_interval_lengths_desc)
+
+    letter_chars_desc = """/**
+ * Those unicode letter characters that are not inside any of
+ * the intervals specified in jerry_unicode_letter_interval_sps array.
+ *
+ * The characters are from the following Unicode categories:
+ * Lu, Ll, Lt, Lm, Lo, Nl
+ */"""
+    source.add_table("uint16_t",
+                     "unicode_letter_chars",
+                     letter_chars,
+                     letter_chars_desc)
+
+    non_letter_interval_sps_desc = """/**
+ * Character interval starting points for non-letter character
+ * that can be used as a non-first character of an identifier.
+ *
+ * The characters covered by these intervals are from
+ * the following Unicode categories: Nd, Mn, Mc, Pc
+ */"""
+    source.add_table("uint16_t",
+                     "unicode_non_letter_ident_part_interval_sps",
+                     non_letter_interval_sps,
+                     non_letter_interval_sps_desc)
+
+    non_letter_interval_lengths_desc = """/**
+ * Character interval lengths for non-letter character
+ * that can be used as a non-first character of an identifier.
+ *
+ * The characters covered by these intervals are from
+ * the following Unicode categories: Nd, Mn, Mc, Pc
+ */"""
+    source.add_table("uint8_t",
+                     "unicode_non_letter_ident_part_interval_lengths",
+                     non_letter_interval_lengths,
+                     non_letter_interval_lengths_desc)
+
+    non_letter_chars_desc = """/**
+ * Those non-letter characters that can be used as a non-first
+ * character of an identifier and not included in any of the intervals
+ * specified in jerry_unicode_non_letter_ident_part_interval_sps array.
+ *
+ * The characters are from the following Unicode categories:
+ * Nd, Mn, Mc, Pc
+ */"""
+    source.add_table("uint16_t",
+                     "unicode_non_letter_ident_part_chars",
+                     non_letter_chars,
+                     non_letter_chars_desc)
+
+    separator_interval_sps_desc = """/**
+ * Unicode separator character interval starting points from Unicode category: Zs
+ */"""
+    source.add_table("uint16_t",
+                     "unicode_separator_char_interval_sps",
+                     separator_interval_sps,
+                     separator_interval_sps_desc)
+
+    separator_interval_lengths_desc = """/**
+ * Unicode separator character interval lengths from Unicode category: Zs
+ */"""
+    source.add_table("uint8_t",
+                     "unicode_separator_char_interval_lengths",
+                     separator_interval_lengths,
+                     separator_interval_lengths_desc)
+
+    separator_chars_desc = """/**
+ * Unicode separator characters that are not in the
+ * jerry_unicode_separator_char_intervals array.
+ *
+ * Unicode category: Zs
+ */"""
+    source.add_table("uint16_t",
+                     "unicode_separator_chars",
+                     separator_chars,
+                     separator_chars_desc)
+
+    source.write_source()
+
+
+def read_categories():
+    """
+    Read the corresponding unicode values and store them in category lists.
+
+    The input file is taken from script_args.unicode_data. Code points below
+    128 and code points at or above 0x10000 are ignored.
+
+    :return: List of letters, non_letter and separators.
+    """
+
+    letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
+    non_letter_category = ["Nd", "Mn", "Mc", "Pc"]
+    separator_category = ["Zs"]
+
+    letters = []
+    non_letters = []
+    separators = []
+
+    # NOTE(review): every row is treated as a single code point. If the data
+    # file describes a large block with a '<..., First>' / '<..., Last>' pair
+    # of rows, only the two end points are collected here - confirm whether
+    # the code points in between are meant to be covered.
+    with open(script_args.unicode_data) as unicode_data:
+        unicode_data_reader = csv.reader(unicode_data, delimiter=';')
+
+        for line in unicode_data_reader:
+            # csv yields an empty list for a blank line
+            if not line:
+                continue
+
+            unicode_id = int(line[0], 16)
+
+            # Skip supplementary planes and ascii chars
+            if unicode_id >= 0x10000 or unicode_id < 128:
+                continue
+
+            category = line[2]
+
+            if category in letter_category:
+                letters.append(unicode_id)
+            elif category in non_letter_category:
+                non_letters.append(unicode_id)
+            elif category in separator_category:
+                separators.append(unicode_id)
+
+    # This separator char is handled separately
+    non_breaking_space = 0x00A0
+    if non_breaking_space in separators:
+        separators.remove(non_breaking_space)
+
+    # These separator chars are not in UnicodeData-3.0.0.txt or not in Zs category
+    mongolian_vowel_separator = 0x180E
+    medium_mathematical_space = 0x205F
+
+    # insort keeps the list ordered, assuming the rows were read in
+    # increasing code point order
+    if mongolian_vowel_separator not in separators:
+        bisect.insort(separators, mongolian_vowel_separator)
+    if medium_mathematical_space not in separators:
+        bisect.insort(separators, medium_mathematical_space)
+
+    return letters, non_letters, separators
+
+
+def ranges(i):
+    """
+    Convert an increasing list of integers into a range list
+
+    Consecutive values share the same (value - position) difference, which is
+    used as the grouping key.
+
+    :return: List of ranges, each one a (first, last) pair with both ends inclusive.
+    """
+
+    # A one-argument lambda is used instead of tuple parameter unpacking,
+    # which is a syntax error in Python 3.
+    for _, group in itertools.groupby(enumerate(i), lambda pair: pair[1] - pair[0]):
+        group = list(group)
+        yield group[0][1], group[-1][1]
+
+
+def split_list(category_list):
+    """
+    Split list of ranges into intervals and single char lists.
+
+    An interval is stored as a starting point and a length, where the length
+    is the distance between the first and the last character. Ranges longer
+    than MAX_INTERVAL_LENGTH are cut into several intervals.
+
+    :param category_list: Increasing list of inclusive (first, last) ranges.
+    :return: List of interval starting points, interval lengths and single chars
+    """
+
+    unicode_category_interval_sps = []
+    unicode_category_interval_lengths = []
+    unicode_category_chars = []
+
+    for first, last in category_list:
+        if first == last:
+            unicode_category_chars.append(first)
+            continue
+
+        # Cut off full sized intervals while the rest does not fit into one
+        while last - first > MAX_INTERVAL_LENGTH:
+            unicode_category_interval_sps.append(first)
+            unicode_category_interval_lengths.append(MAX_INTERVAL_LENGTH)
+            first += MAX_INTERVAL_LENGTH + 1
+
+        if first == last:
+            # The range ended exactly one character after a full interval:
+            # keep that character instead of dropping it
+            unicode_category_chars.append(first)
+        else:
+            unicode_category_interval_sps.append(first)
+            unicode_category_interval_lengths.append(last - first)
+
+    return unicode_category_interval_sps, unicode_category_interval_lengths, unicode_category_chars
+
+
+class GenSource(object):
+    """Class defines a default generated c source."""
+
+    def __init__(self):
+        # Pieces of the output file, in the order they are written
+        self._data = []
+
+        header = """/* Copyright JS Foundation and other contributors, http://js.foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This file is automatically generated by the {SCRIPT} script
+ * from {UNICODES}. Do not edit!
+ */
+
+""".format(SCRIPT=os.path.basename(__file__), UNICODES=os.path.basename(script_args.unicode_data))
+
+        self._data.append(header)
+
+    def _regroup(self, l, n):
+        """Cut the l sequence into consecutive pieces of at most n elements."""
+        return [l[i:i+n] for i in range(0, len(l), n)]
+
+    def _hex_format(self, ch):
+        """Format a character or an integer as a 0x prefixed hex number of at least four digits."""
+        if isinstance(ch, str):
+            ch = ord(ch)
+
+        return "0x{:04x}".format(ch)
+
+    def _format_code(self, code, indent):
+        """
+        Format the values of a table as comma separated hex numbers.
+
+        :return: Lines of VALUES_PER_LINE numbers, each line indented by indent spaces.
+        """
+        # convert all characters to hex format
+        converted_code = [self._hex_format(ch) for ch in code]
+        # group by values rather than by text width, so the line breaks do
+        # not depend on how many digits a value is printed with
+        lines = [(' ' * indent) + ", ".join(group)
+                 for group in self._regroup(converted_code, VALUES_PER_LINE)]
+        return ",\n".join(lines)
+
+    def add_table(self, type_name, array_name, table, description=""):
+        """Append a 'static const' array definition named jerry_<array_name> to the output."""
+        table_str = """{DESC}
+static const {TYPE} jerry_{NAME}[] JERRY_CONST_DATA =
+{{
+{TABLE}
+}};
+
+""".format(DESC=description, TYPE=type_name, NAME=array_name, TABLE=self._format_code(table, 1))
+
+        self._data.append(table_str)
+
+    def write_source(self):
+        """Write everything collected so far to the script_args.c_source file."""
+        with open(script_args.c_source, 'w') as generated_source:
+            generated_source.write(''.join(self._data))
+
+
+if __name__ == "__main__":
+    main()