Rewrite the generator script of unicode ranges. (#1583)

The script now generates the source file directly, instead of requiring the tables to be copied and pasted manually.

JerryScript-DCO-1.0-Signed-off-by: Robert Sipka rsipka.uszeged@partner.samsung.com
This commit is contained in:
Robert Sipka
2017-02-16 18:31:30 +01:00
committed by Dániel Bátyai
parent 799726aa42
commit 188dc46fe0
4 changed files with 439 additions and 318 deletions
+17 -11
View File
@@ -136,9 +136,11 @@ lit_char_is_white_space (ecma_char_t c) /**< code unit */
{
return (c == LIT_CHAR_NBSP
|| c == LIT_CHAR_BOM
|| (c >= unicode_separator_char_interv_sps[0]
&& c <= unicode_separator_char_interv_sps[0] + unicode_separator_char_interv_lens[0])
|| search_char_in_char_array (c, unicode_separator_chars, NUM_OF_ELEMENTS (unicode_separator_chars)));
|| (c >= jerry_unicode_separator_char_interval_sps[0]
&& c <= jerry_unicode_separator_char_interval_sps[0] + jerry_unicode_separator_char_interval_lengths[0])
|| search_char_in_char_array (c,
jerry_unicode_separator_chars,
NUM_OF_ELEMENTS (jerry_unicode_separator_chars)));
}
} /* lit_char_is_white_space */
@@ -178,9 +180,11 @@ lit_char_is_line_terminator (ecma_char_t c) /**< code unit */
static bool
lit_char_is_unicode_letter (ecma_char_t c) /**< code unit */
{
return (search_char_in_interval_array (c, unicode_letter_interv_sps, unicode_letter_interv_lens,
NUM_OF_ELEMENTS (unicode_letter_interv_sps))
|| search_char_in_char_array (c, unicode_letter_chars, NUM_OF_ELEMENTS (unicode_letter_chars)));
return (search_char_in_interval_array (c,
jerry_unicode_letter_interval_sps,
jerry_unicode_letter_interval_lengths,
NUM_OF_ELEMENTS (jerry_unicode_letter_interval_sps))
|| search_char_in_char_array (c, jerry_unicode_letter_chars, NUM_OF_ELEMENTS (jerry_unicode_letter_chars)));
} /* lit_char_is_unicode_letter */
/**
@@ -200,11 +204,13 @@ lit_char_is_unicode_letter (ecma_char_t c) /**< code unit */
static bool
lit_char_is_unicode_non_letter_ident_part (ecma_char_t c) /**< code unit */
{
return (search_char_in_interval_array (c, unicode_non_letter_ident_part_interv_sps,
unicode_non_letter_ident_part_interv_lens,
NUM_OF_ELEMENTS (unicode_non_letter_ident_part_interv_sps))
|| search_char_in_char_array (c, unicode_non_letter_ident_part_chars,
NUM_OF_ELEMENTS (unicode_non_letter_ident_part_chars)));
return (search_char_in_interval_array (c,
jerry_unicode_non_letter_ident_part_interval_sps,
jerry_unicode_non_letter_ident_part_interval_lengths,
NUM_OF_ELEMENTS (jerry_unicode_non_letter_ident_part_interval_sps))
|| search_char_in_char_array (c,
jerry_unicode_non_letter_ident_part_chars,
NUM_OF_ELEMENTS (jerry_unicode_non_letter_ident_part_chars)));
} /* lit_char_is_unicode_non_letter_ident_part */
/**
+94 -121
View File
@@ -12,169 +12,152 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
* Unicode characters and ranges generated by tools/print-unicode-ranges.sh
* from UnicodeData-3.0.0.txt.
* See also:
* http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt
* http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
* This file is automatically generated by the unicode_ranges.py script
* from UnicodeData-3.0.0.txt. Do not edit!
*/
#ifndef LIT_UNICODE_RANGES_INC_H_
#define LIT_UNICODE_RANGES_INC_H_
/**
* Character interval starting points for the unicode letters.
*
* The characters covered by these intervalse are from
* The characters covered by these intervals are from
* the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl
*/
static const uint16_t unicode_letter_interv_sps[] JERRY_CONST_DATA =
static const uint16_t jerry_unicode_letter_interval_sps[] JERRY_CONST_DATA =
{
/*
* these are handled separetely
* 0x0041, len 25
* 0x0061, len 25
*/
0x00C0, 0x00D8, 0XF8, 0X1F8, 0x0222, 0x0250, 0x02B0, 0x02BB,
0x02D0, 0x02E0, 0x0388, 0x038E, 0x03A3, 0x03D0, 0x03DA, 0x0400, 0x048C, 0x04C7,
0x04CB, 0x04D0, 0x04F8, 0x0531, 0x0561, 0x05D0, 0x05F0, 0x0621, 0x0640, 0x0671,
0x06E5, 0x06FA, 0x0712, 0x0780, 0x0905, 0x0958, 0x0985, 0x098F, 0x0993, 0x09AA,
0x09B6, 0x09DC, 0x09DF, 0x09F0, 0x0A05, 0x0A0F, 0x0A13, 0x0A2A, 0x0A32, 0x0A35,
0x0A38, 0x0A59, 0x0A72, 0x0A85, 0x0A8F, 0x0A93, 0x0AAA, 0x0AB2, 0x0AB5, 0x0B05,
0x0B0F, 0x0B13, 0x0B2A, 0x0B32, 0x0B36, 0x0B5C, 0x0B5F, 0x0B85, 0x0B8E, 0x0B92,
0x0B99, 0x0B9E, 0x0BA3, 0x0BA8, 0x0BAE, 0x0BB7, 0x0C05, 0x0C0E, 0x0C12, 0x0C2A,
0x0C35, 0x0C60, 0x0C85, 0x0C8E, 0x0C92, 0x0CAA, 0x0CB5, 0x0CE0, 0x0D05, 0x0D0E,
0x0D12, 0x0D2A, 0x0D60, 0x0D85, 0x0D9A, 0x0DB3, 0x0DC0, 0x0E01, 0x0E32, 0x0E40,
0x0E81, 0x0E87, 0x0E94, 0x0E99, 0x0EA1, 0x0EAA, 0x0EAD, 0x0EB2, 0x0EC0, 0x0EDC,
0x0F40, 0x0F49, 0x0F88, 0x1000, 0x1023, 0x1029, 0x1050, 0x10A0, 0x10D0, 0x1100,
0x115F, 0x11A8, 0x1200, 0x1208, 0x124A, 0x1250, 0x125A, 0x1260, 0x128A, 0x1290,
0x12B2, 0x12B8, 0x12C2, 0x12C8, 0x12D0, 0x12D8, 0x12F0, 0x1312, 0x1318, 0x1320,
0x1348, 0x13A0, 0X1401, 0X1501, 0X1601, 0x166F, 0x1681, 0x16A0, 0x1780, 0x1820,
0x1880, 0x1E00, 0x1EA0, 0x1F00, 0x1F18, 0x1F20, 0x1F48, 0x1F50, 0x1F5F, 0x1F80,
0x1FB6, 0x1FC2, 0x1FC6, 0x1FD0, 0x1FD6, 0x1FE0, 0x1FF2, 0x1FF6, 0x210A, 0x2119,
0x212A, 0x212F, 0x2133, 0x2160, 0x3005, 0x3021, 0x3031, 0x3038, 0x3041, 0x309D,
0x30A1, 0x30FC, 0x3105, 0x3131, 0x31A0, 0XA000, 0XA100, 0XA200, 0XA300, 0XA400,
0XF900, 0XFA00, 0xFB00, 0xFB13, 0xFB1F, 0xFB2A, 0xFB38, 0xFB40, 0xFB43, 0xFB46,
0XFBD3, 0XFCD3, 0xFD50, 0xFD92, 0xFDF0, 0xFE70, 0xFE76, 0xFF21, 0xFF41, 0xFF66,
0xFFC2, 0xFFCA, 0xFFD2, 0xFFDA
0x00c0, 0x00d8, 0x00f8, 0x01f8, 0x0222, 0x0250, 0x02b0, 0x02bb, 0x02d0, 0x02e0,
0x0388, 0x038e, 0x03a3, 0x03d0, 0x03da, 0x0400, 0x048c, 0x04c7, 0x04cb, 0x04d0,
0x04f8, 0x0531, 0x0561, 0x05d0, 0x05f0, 0x0621, 0x0640, 0x0671, 0x06e5, 0x06fa,
0x0712, 0x0780, 0x0905, 0x0958, 0x0985, 0x098f, 0x0993, 0x09aa, 0x09b6, 0x09dc,
0x09df, 0x09f0, 0x0a05, 0x0a0f, 0x0a13, 0x0a2a, 0x0a32, 0x0a35, 0x0a38, 0x0a59,
0x0a72, 0x0a85, 0x0a8f, 0x0a93, 0x0aaa, 0x0ab2, 0x0ab5, 0x0b05, 0x0b0f, 0x0b13,
0x0b2a, 0x0b32, 0x0b36, 0x0b5c, 0x0b5f, 0x0b85, 0x0b8e, 0x0b92, 0x0b99, 0x0b9e,
0x0ba3, 0x0ba8, 0x0bae, 0x0bb7, 0x0c05, 0x0c0e, 0x0c12, 0x0c2a, 0x0c35, 0x0c60,
0x0c85, 0x0c8e, 0x0c92, 0x0caa, 0x0cb5, 0x0ce0, 0x0d05, 0x0d0e, 0x0d12, 0x0d2a,
0x0d60, 0x0d85, 0x0d9a, 0x0db3, 0x0dc0, 0x0e01, 0x0e32, 0x0e40, 0x0e81, 0x0e87,
0x0e94, 0x0e99, 0x0ea1, 0x0eaa, 0x0ead, 0x0eb2, 0x0ec0, 0x0edc, 0x0f40, 0x0f49,
0x0f88, 0x1000, 0x1023, 0x1029, 0x1050, 0x10a0, 0x10d0, 0x1100, 0x115f, 0x11a8,
0x1200, 0x1208, 0x124a, 0x1250, 0x125a, 0x1260, 0x128a, 0x1290, 0x12b2, 0x12b8,
0x12c2, 0x12c8, 0x12d0, 0x12d8, 0x12f0, 0x1312, 0x1318, 0x1320, 0x1348, 0x13a0,
0x1401, 0x1501, 0x1601, 0x166f, 0x1681, 0x16a0, 0x1780, 0x1820, 0x1880, 0x1e00,
0x1ea0, 0x1f00, 0x1f18, 0x1f20, 0x1f48, 0x1f50, 0x1f5f, 0x1f80, 0x1fb6, 0x1fc2,
0x1fc6, 0x1fd0, 0x1fd6, 0x1fe0, 0x1ff2, 0x1ff6, 0x210a, 0x2119, 0x212a, 0x212f,
0x2133, 0x2160, 0x3005, 0x3021, 0x3031, 0x3038, 0x3041, 0x309d, 0x30a1, 0x30fc,
0x3105, 0x3131, 0x31a0, 0xa000, 0xa100, 0xa200, 0xa300, 0xa400, 0xf900, 0xfa00,
0xfb00, 0xfb13, 0xfb1f, 0xfb2a, 0xfb38, 0xfb40, 0xfb43, 0xfb46, 0xfbd3, 0xfcd3,
0xfd50, 0xfd92, 0xfdf0, 0xfe70, 0xfe76, 0xff21, 0xff41, 0xff66, 0xffc2, 0xffca,
0xffd2, 0xffda
};
/**
* Character lengths for the unicode letters.
*
* The characters covered by these intervalse are from
* The characters covered by these intervals are from
* the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl
*/
static const uint8_t unicode_letter_interv_lens[] JERRY_CONST_DATA =
static const uint8_t jerry_unicode_letter_interval_lengths[] JERRY_CONST_DATA =
{
22, 30, 255, 39, 17, 93, 8, 6,
1, 4, 2, 19, 43, 7, 25, 129, 56, 1,
1, 37, 1, 37, 38, 26, 2, 25, 10, 98,
1, 2, 26, 37, 52, 9, 7, 1, 21, 6,
3, 1, 2, 1, 5, 1, 21, 6, 1, 1,
1, 3, 2, 6, 2, 21, 6, 1, 4, 7,
1, 21, 6, 1, 3, 1, 2, 5, 2, 3,
1, 1, 1, 2, 7, 2, 7, 2, 22, 9,
4, 1, 7, 2, 22, 9, 4, 1, 7, 2,
22, 15, 1, 17, 23, 8, 6, 47, 1, 6,
1, 1, 3, 6, 2, 1, 3, 1, 4, 1,
7, 33, 3, 33, 4, 1, 5, 37, 38, 89,
67, 81, 6, 62, 3, 6, 3, 38, 3, 30,
3, 6, 3, 6, 6, 22, 30, 3, 6, 38,
18, 84, 255, 255, 107, 7, 25, 74, 51, 87,
40, 155, 89, 21, 5, 37, 5, 7, 30, 52,
6, 2, 6, 3, 5, 12, 2, 6, 9, 4,
3, 2, 6, 35, 2, 8, 4, 2, 83, 1,
89, 2, 39, 93, 23, 255, 255, 255, 255, 140,
255, 45, 6, 4, 9, 12, 4, 1, 1, 107,
255, 106, 63, 53, 11, 2, 134, 25, 25, 88,
5, 5, 5, 2
0x0016, 0x001e, 0x00ff, 0x0027, 0x0011, 0x005d, 0x0008, 0x0006, 0x0001, 0x0004,
0x0002, 0x0013, 0x002b, 0x0007, 0x0019, 0x0081, 0x0038, 0x0001, 0x0001, 0x0025,
0x0001, 0x0025, 0x0026, 0x001a, 0x0002, 0x0019, 0x000a, 0x0062, 0x0001, 0x0002,
0x001a, 0x0025, 0x0034, 0x0009, 0x0007, 0x0001, 0x0015, 0x0006, 0x0003, 0x0001,
0x0002, 0x0001, 0x0005, 0x0001, 0x0015, 0x0006, 0x0001, 0x0001, 0x0001, 0x0003,
0x0002, 0x0006, 0x0002, 0x0015, 0x0006, 0x0001, 0x0004, 0x0007, 0x0001, 0x0015,
0x0006, 0x0001, 0x0003, 0x0001, 0x0002, 0x0005, 0x0002, 0x0003, 0x0001, 0x0001,
0x0001, 0x0002, 0x0007, 0x0002, 0x0007, 0x0002, 0x0016, 0x0009, 0x0004, 0x0001,
0x0007, 0x0002, 0x0016, 0x0009, 0x0004, 0x0001, 0x0007, 0x0002, 0x0016, 0x000f,
0x0001, 0x0011, 0x0017, 0x0008, 0x0006, 0x002f, 0x0001, 0x0006, 0x0001, 0x0001,
0x0003, 0x0006, 0x0002, 0x0001, 0x0003, 0x0001, 0x0004, 0x0001, 0x0007, 0x0021,
0x0003, 0x0021, 0x0004, 0x0001, 0x0005, 0x0025, 0x0026, 0x0059, 0x0043, 0x0051,
0x0006, 0x003e, 0x0003, 0x0006, 0x0003, 0x0026, 0x0003, 0x001e, 0x0003, 0x0006,
0x0003, 0x0006, 0x0006, 0x0016, 0x001e, 0x0003, 0x0006, 0x0026, 0x0012, 0x0054,
0x00ff, 0x00ff, 0x006b, 0x0007, 0x0019, 0x004a, 0x0033, 0x0057, 0x0028, 0x009b,
0x0059, 0x0015, 0x0005, 0x0025, 0x0005, 0x0007, 0x001e, 0x0034, 0x0006, 0x0002,
0x0006, 0x0003, 0x0005, 0x000c, 0x0002, 0x0006, 0x0009, 0x0004, 0x0003, 0x0002,
0x0006, 0x0023, 0x0002, 0x0008, 0x0004, 0x0002, 0x0053, 0x0001, 0x0059, 0x0002,
0x0027, 0x005d, 0x0017, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x008c, 0x00ff, 0x002d,
0x0006, 0x0004, 0x0009, 0x000c, 0x0004, 0x0001, 0x0001, 0x006b, 0x00ff, 0x006a,
0x003f, 0x0035, 0x000b, 0x0002, 0x0086, 0x0019, 0x0019, 0x0058, 0x0005, 0x0005,
0x0005, 0x0002
};
/**
* Those unicode letter characters that are not inside any of
* the intervals specified in unicode_letter_intervals array.
* the intervals specified in jerry_unicode_letter_interval_sps array.
*
* The characters are from the following Unicode categories:
* Lu, Ll, Lt, Lm, Lo, Nl
*/
static const uint16_t unicode_letter_chars[] JERRY_CONST_DATA =
static const uint16_t jerry_unicode_letter_chars[] JERRY_CONST_DATA =
{
0x00AA, 0x00B5, 0x00BA, 0x02EE, 0x037A, 0x0386, 0x038C, 0x0559, 0x06D5, 0x0710,
0x093D, 0x0950, 0x09B2, 0x0A5E, 0x0A8D, 0x0ABD, 0x0AD0, 0x0AE0, 0x0B3D, 0x0B9C,
0x0CDE, 0x0DBD, 0x0E84, 0x0E8A, 0x0E8D, 0x0EA5, 0x0EA7, 0x0EBD, 0x0EC6, 0x0F00,
0x1248, 0x1258, 0x1288, 0x12B0, 0x12C0, 0x1310, 0x1F59, 0x1F5B, 0x1F5D, 0x1FBE,
0x207F, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x3400, 0x4DB5, 0x4E00,
0x9FA5, 0xAC00, 0xD7A3, 0xFB1D, 0xFB3E, 0xFE74
0x00aa, 0x00b5, 0x00ba, 0x02ee, 0x037a, 0x0386, 0x038c, 0x0559, 0x06d5, 0x0710,
0x093d, 0x0950, 0x09b2, 0x0a5e, 0x0a8d, 0x0abd, 0x0ad0, 0x0ae0, 0x0b3d, 0x0b9c,
0x0cde, 0x0dbd, 0x0e84, 0x0e8a, 0x0e8d, 0x0ea5, 0x0ea7, 0x0ebd, 0x0ec6, 0x0f00,
0x1248, 0x1258, 0x1288, 0x12b0, 0x12c0, 0x1310, 0x1f59, 0x1f5b, 0x1f5d, 0x1fbe,
0x207f, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x3400, 0x4db5, 0x4e00,
0x9fa5, 0xac00, 0xd7a3, 0xfb1d, 0xfb3e, 0xfe74
};
/**
* Character interval starting points for non-letter character
* that can be used as a non-first character of an identifier.
*
* The characters covered by these intervalse are from
* The characters covered by these intervals are from
* the following Unicode categories: Nd, Mn, Mc, Pc
*/
static const uint16_t unicode_non_letter_ident_part_interv_sps[] JERRY_CONST_DATA =
static const uint16_t jerry_unicode_non_letter_ident_part_interval_sps[] JERRY_CONST_DATA =
{
/*
* decimal digits: handled separately
* 0x0030, len: 9
*/
0x0300, 0x0360, 0x0483, 0x0591, 0x05A3, 0x05BB, 0x05C1, 0x064B, 0x0660,
0x06D6, 0x06DF, 0x06E7, 0x06EA, 0x06F0, 0x0730, 0x07A6, 0x0901, 0x093E, 0x0951,
0x0962, 0x0966, 0x0981, 0x09BE, 0x09C7, 0x09CB, 0x09E2, 0x09E6, 0x0A3E, 0x0A47,
0x0A4B, 0x0A66, 0x0A81, 0x0ABE, 0x0AC7, 0x0ACB, 0x0AE6, 0x0B01, 0x0B3E, 0x0B47,
0x0B4B, 0x0B56, 0x0B66, 0x0B82, 0x0BBE, 0x0BC6, 0x0BCA, 0x0BE7, 0x0C01, 0x0C3E,
0x0C46, 0x0C4A, 0x0C55, 0x0C66, 0x0C82, 0x0CBE, 0x0CC6, 0x0CCA, 0x0CD5, 0x0CE6,
0x0D02, 0x0D3E, 0x0D46, 0x0D4A, 0x0D66, 0x0D82, 0x0DCF, 0x0DD8, 0x0DF2, 0x0E34,
0x0E47, 0x0E50, 0x0EB4, 0x0EBB, 0x0EC8, 0x0ED0, 0x0F18, 0x0F20, 0x0F3E, 0x0F71,
0x0F86, 0x0F90, 0x0F99, 0x102C, 0x1036, 0x1040, 0x1056, 0x1369, 0x17B4, 0x17E0,
0x1810, 0x203F, 0x20D0, 0x302A, 0x3099, 0xFE20, 0xFE33, 0xFE4D, 0xFF10
0x0300, 0x0360, 0x0483, 0x0591, 0x05a3, 0x05bb, 0x05c1, 0x064b, 0x0660, 0x06d6,
0x06df, 0x06e7, 0x06ea, 0x06f0, 0x0730, 0x07a6, 0x0901, 0x093e, 0x0951, 0x0962,
0x0966, 0x0981, 0x09be, 0x09c7, 0x09cb, 0x09e2, 0x09e6, 0x0a3e, 0x0a47, 0x0a4b,
0x0a66, 0x0a81, 0x0abe, 0x0ac7, 0x0acb, 0x0ae6, 0x0b01, 0x0b3e, 0x0b47, 0x0b4b,
0x0b56, 0x0b66, 0x0b82, 0x0bbe, 0x0bc6, 0x0bca, 0x0be7, 0x0c01, 0x0c3e, 0x0c46,
0x0c4a, 0x0c55, 0x0c66, 0x0c82, 0x0cbe, 0x0cc6, 0x0cca, 0x0cd5, 0x0ce6, 0x0d02,
0x0d3e, 0x0d46, 0x0d4a, 0x0d66, 0x0d82, 0x0dcf, 0x0dd8, 0x0df2, 0x0e34, 0x0e47,
0x0e50, 0x0eb4, 0x0ebb, 0x0ec8, 0x0ed0, 0x0f18, 0x0f20, 0x0f3e, 0x0f71, 0x0f86,
0x0f90, 0x0f99, 0x102c, 0x1036, 0x1040, 0x1056, 0x1369, 0x17b4, 0x17e0, 0x1810,
0x203f, 0x20d0, 0x302a, 0x3099, 0xfe20, 0xfe33, 0xfe4d, 0xff10
};
/**
* Character interval lengths for non-letter character
* that can be used as a non-first character of an identifier.
*
* The characters covered by these intervalse are from
* The characters covered by these intervals are from
* the following Unicode categories: Nd, Mn, Mc, Pc
*/
static const uint8_t unicode_non_letter_ident_part_interv_lens[] =
static const uint8_t jerry_unicode_non_letter_ident_part_interval_lengths[] JERRY_CONST_DATA =
{
78, 2, 3, 16, 22, 2, 1, 10, 9,
6, 5, 1, 3, 9, 26, 10, 2, 15, 3,
1, 9, 2, 6, 1, 2, 1, 9, 4, 1,
2, 11, 2, 7, 2, 2, 9, 2, 5, 1,
2, 1, 9, 1, 4, 2, 3, 8, 2, 6,
2, 3, 1, 9, 1, 6, 2, 3, 1, 9,
1, 5, 2, 3, 9, 1, 5, 7, 1, 6,
7, 9, 5, 1, 5, 9, 1, 9, 1, 19,
1, 7, 35, 6, 3, 9, 3, 8, 31, 9,
9, 1, 12, 5, 1, 3, 1, 2, 9
0x004e, 0x0002, 0x0003, 0x0010, 0x0016, 0x0002, 0x0001, 0x000a, 0x0009, 0x0006,
0x0005, 0x0001, 0x0003, 0x0009, 0x001a, 0x000a, 0x0002, 0x000f, 0x0003, 0x0001,
0x0009, 0x0002, 0x0006, 0x0001, 0x0002, 0x0001, 0x0009, 0x0004, 0x0001, 0x0002,
0x000b, 0x0002, 0x0007, 0x0002, 0x0002, 0x0009, 0x0002, 0x0005, 0x0001, 0x0002,
0x0001, 0x0009, 0x0001, 0x0004, 0x0002, 0x0003, 0x0008, 0x0002, 0x0006, 0x0002,
0x0003, 0x0001, 0x0009, 0x0001, 0x0006, 0x0002, 0x0003, 0x0001, 0x0009, 0x0001,
0x0005, 0x0002, 0x0003, 0x0009, 0x0001, 0x0005, 0x0007, 0x0001, 0x0006, 0x0007,
0x0009, 0x0005, 0x0001, 0x0005, 0x0009, 0x0001, 0x0009, 0x0001, 0x0013, 0x0001,
0x0007, 0x0023, 0x0006, 0x0003, 0x0009, 0x0003, 0x0008, 0x001f, 0x0009, 0x0009,
0x0001, 0x000c, 0x0005, 0x0001, 0x0003, 0x0001, 0x0002, 0x0009
};
/**
* Those non-letter characters that can be used as a non-first
* character of an identifier and not included in any of the intervals
* specified in unicode_non_letter_ident_part_intervals array.
* specified in jerry_unicode_non_letter_ident_part_interval_sps array.
*
* The characters are from the following Unicode categories:
* Nd, Mn, Mc, Pc
*/
static const uint16_t unicode_non_letter_ident_part_chars[] =
static const uint16_t jerry_unicode_non_letter_ident_part_chars[] JERRY_CONST_DATA =
{
0x005F, 0x05BF, 0x05C4, 0x0670, 0x0711, 0x093C, 0x09BC, 0x09D7, 0x0A02, 0x0A3C,
0x0ABC, 0x0B3C, 0x0BD7, 0x0D57, 0x0DCA, 0x0DD6, 0x0E31, 0x0EB1, 0x0F35, 0x0F37,
0x0F39, 0x0FC6, 0x18A9, 0x20E1, 0x30FB, 0xFB1E, 0xFF3F, 0xFF65
0x05bf, 0x05c4, 0x0670, 0x0711, 0x093c, 0x09bc, 0x09d7, 0x0a02, 0x0a3c, 0x0abc,
0x0b3c, 0x0bd7, 0x0d57, 0x0dca, 0x0dd6, 0x0e31, 0x0eb1, 0x0f35, 0x0f37, 0x0f39,
0x0fc6, 0x18a9, 0x20e1, 0x30fb, 0xfb1e, 0xff3f, 0xff65
};
/**
* Unicode separator character interval strting points from Unicode category: Zs
* Unicode separator character interval starting points from Unicode category: Zs
*/
static const uint16_t unicode_separator_char_interv_sps[] =
static const uint16_t jerry_unicode_separator_char_interval_sps[] JERRY_CONST_DATA =
{
0x2000
};
@@ -182,29 +165,19 @@ static const uint16_t unicode_separator_char_interv_sps[] =
/**
* Unicode separator character interval lengths from Unicode category: Zs
*/
static const uint8_t unicode_separator_char_interv_lens[] =
static const uint8_t jerry_unicode_separator_char_interval_lengths[] JERRY_CONST_DATA =
{
11
0x000b
};
/**
* Unicode separator characters that are not in the
* unicode_separator_char_intervals array.
* jerry_unicode_separator_char_intervals array.
*
* Unicode category: Zs
*/
static const uint16_t unicode_separator_chars[] =
static const uint16_t jerry_unicode_separator_chars[] JERRY_CONST_DATA =
{
/*
* these two chars are handled separatly @see lit_char_is_space_separator
* 0x0020, space
* 0x00A0, non-braking space
*/
0x1680, \
0x180E, /* manually added */ \
0x202F, /* manually added */ \
0x205F, \
0x3000
0x1680, 0x180e, 0x202f, 0x205f, 0x3000
};
#endif
-186
View File
@@ -1,186 +0,0 @@
#!/bin/bash
# Copyright JS Foundation and other contributors, http://js.foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt
#
# unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
# letter: Lu Ll Lt Lm Lo Nl
# non-letter-indent-part:
# digit: Nd
# punctuation mark: Mn Mc
# connector punctuation: Pc
# separators: Zs
# Print the usage text and exit with status 1 when four or fewer
# arguments were given.
if [ $# -le 4 ]; then
echo "useage: print-unicode-ranges.sh <unicode-data-path> <-i y sp|y len|n> <-cat letters|non-let-indent-parts|separators>"
echo " -i: y sp - print interval starting points"
echo " y len - print interval lengths"
echo " n - print individual characters"
echo " -cat: whether print letters|non-let-indent-parts|separators category"
exit 1
fi
# Default for the interval output mode; overwritten below when "-i y <mode>" is given.
STARTING_POINT="len"
# The first positional argument is the path of the unicode data file.
UNICODE_DATA_PATH="$1"
shift
# Walk over the remaining arguments:
#   -i <value>   stores <value> in PRINT_INTERVALS; when it is "y" the next
#                argument is stored in STARTING_POINT and echoed
#   -cat <value> stores <value> in CATEGORY and echoes it
while [ $# -gt 0 ]; do
if [ $1 == "-i" ]; then
shift
PRINT_INTERVALS="$1"
if [ $PRINT_INTERVALS == "y" ]; then
shift
STARTING_POINT="$1"
echo $STARTING_POINT
fi
elif [ $1 == "-cat" ]; then
shift
CATEGORY="$1"
echo $CATEGORY
fi
shift
done
# Two-stage pipeline over the ';'-separated unicode data file.
#
# Stage 1 (awk): keeps the rows whose third field (the category) belongs to
# the set selected by CATEGORY and prints "0x"<field 1>, <field 2>, <field 3>
# joined with ';'.
#
# Stage 2 (gawk --non-decimal-data): treats rows whose first field is the
# previous one + 1 as members of the same range. output_next_range() is
# called whenever a range ends (and once more in END):
#   - when PRINT_INTERVALS is "y" and the range has more than one element it
#     prints either the starting point (STARTING_POINT == "sp") or the
#     length (last - first) otherwise; ranges longer than 255 are split
#     into several pieces
#   - when PRINT_INTERVALS is not "y" it prints only single-element ranges
# print_Nl() emits a newline after every 10 printed values.
awk -v desired_category="$CATEGORY" \
'BEGIN \
{ \
FS=";"; OFS=";" \
} \
{ \
cat=$3; \
if (desired_category == "letters" && (cat == "Lu" || cat == "Ll" || cat == "Lt" || cat == "Lm" || cat == "Lo" || cat == "Nl")) \
{ \
print "0x"$1, $2, $3; \
} \
else if (desired_category == "non-let-indent-parts" && (cat == "Nd" || cat == "Mn" || cat == "Mc" || cat == "Pc")) \
{ \
print "0x"$1, $2, $3; \
} \
else if (desired_category == "separators" && cat == "Zs") \
{ \
print "0x"$1, $2, $3; \
} \
}' $UNICODE_DATA_PATH \
| gawk --non-decimal-data -v print_intervals="$PRINT_INTERVALS" -v sp="$STARTING_POINT" \
'BEGIN \
{ \
FS = ";"; \
OFS = ";"; \
is_in_range = 0; \
print_count = 0; \
} \
\
function print_Nl() \
{ \
++print_count; \
if (print_count == 10) \
{ \
printf "\n"; \
print_count = 0; \
} \
} \
\
function output_next_range () \
{ \
if (range_begin != range_prev && print_intervals=="y") \
{ \
i1 = strtonum(range_begin); \
i2 = strtonum(range_prev); \
len = i2 - i1; \
# if the length of an interval is > 255 have to spilt it into 255-lenth ones
if (len > 255) \
{ \
numOfSubintervals = (len / 255); # more precisely number of subintervals - 1 \
for (i = 1; i <= numOfSubintervals; ++i) \
{ \
if (sp == "sp") \
{ \
printf "0X%X, ", i1; \
print_Nl(); \
}
else \
{ \
printf "%d, ", 255; \
print_Nl(); \
} \
i1 = i1 + 256; # next interval begins on the ending of the previous + 1 \
} \
if (sp == "sp") \
{ \
printf "0X%X, ", i1; \
print_Nl(); \
} \
else \
{ \
printf "%d, ", len % 255 - (i-1); \
print_Nl(); \
} \
} \
else \
{ \
if (sp == "sp") \
{ \
printf "%s, ", range_begin; \
print_Nl(); \
} \
else \
{ \
printf "%d, ", len; \
print_Nl(); \
} \
} \
} \
else if (range_begin == range_prev && print_intervals != "y")\
{ \
printf "%s, ", range_begin; \
print_Nl(); \
} \
} \
\
{ \
if (is_in_range == 0) \
{ \
is_in_range = 1; \
range_begin = $1; \
range_prev = $1; \
range_begin_name = $2; \
range_prev_name = $2; \
} \
else \
{ \
if (range_prev + 1 == $1) \
{ \
range_prev = $1; \
range_prev_name = $2
} \
else \
{ \
output_next_range(); \
range_begin = $1; \
range_prev=$1; \
range_begin_name = $2; \
range_prev_name = $2; \
} \
} \
} \
\
END \
{ \
output_next_range(); \
}'
+328
View File
@@ -0,0 +1,328 @@
#!/usr/bin/env python
# Copyright JS Foundation and other contributors, http://js.foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt
#
# unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
# letter: Lu Ll Lt Lm Lo Nl
# non-letter-ident-part:
# digit: Nd
# punctuation mark: Mn Mc
# connector punctuation: Pc
# separators: Zs
import argparse
import bisect
import csv
import itertools
import os
# Directory that contains this script.
TOOLS_DIR = os.path.dirname(os.path.abspath(__file__))
# Parent directory of TOOLS_DIR, used as the project root.
PROJECT_DIR = os.path.normpath(os.path.join(TOOLS_DIR, '..'))
# Default location of the generated C source.
C_SOURCE_FILE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h')

# Command line interface: one mandatory input file and an optional output path.
parser = argparse.ArgumentParser()
parser.add_argument('unicode_data',
                    metavar='FILE',
                    action='store',
                    help='specify the unicode data file')
parser.add_argument('--c-source',
                    metavar='FILE',
                    action='store',
                    default=C_SOURCE_FILE,
                    help='specify the output c source (default: %(default)s)')
# NOTE: the arguments are parsed at module level; read_categories() and
# GenSource read this `script_args` global.
script_args = parser.parse_args()
def main():
    """
    Generate the C source file that holds the unicode range tables.

    Reads the categories from the unicode data file given on the command
    line, converts each category into interval starting points, interval
    lengths and single characters, registers the nine resulting tables
    (with their description comments) in a GenSource and writes it out.

    Exits with status 1 when the unicode data file is missing or not
    readable.
    """
    # `sys` is not part of the file's import block; import it here so that
    # the error path below exits cleanly instead of raising NameError.
    import sys

    if not os.path.isfile(script_args.unicode_data) or not os.access(script_args.unicode_data, os.R_OK):
        print('The %s file is missing or not readable!' % script_args.unicode_data)
        sys.exit(1)

    letters, non_letters, separators = read_categories()

    letters_list = list(ranges(letters))
    letter_interval_sps, letter_interval_lengths, letter_chars = split_list(letters_list)

    non_letters_list = list(ranges(non_letters))
    non_letter_interval_sps, non_letter_interval_lengths, non_letter_chars = split_list(non_letters_list)

    separator_list = list(ranges(separators))
    separator_interval_sps, separator_interval_lengths, separator_chars = split_list(separator_list)

    source = GenSource()

    # --- letters ---------------------------------------------------------
    letter_interval_sps_desc = """/**
* Character interval starting points for the unicode letters.
*
* The characters covered by these intervals are from
* the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl
*/"""
    source.add_table("uint16_t",
                     "unicode_letter_interval_sps",
                     letter_interval_sps,
                     letter_interval_sps_desc)

    letter_interval_lengths_desc = """/**
* Character lengths for the unicode letters.
*
* The characters covered by these intervals are from
* the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl
*/"""
    source.add_table("uint8_t",
                     "unicode_letter_interval_lengths",
                     letter_interval_lengths,
                     letter_interval_lengths_desc)

    letter_chars_desc = """/**
* Those unicode letter characters that are not inside any of
* the intervals specified in jerry_unicode_letter_interval_sps array.
*
* The characters are from the following Unicode categories:
* Lu, Ll, Lt, Lm, Lo, Nl
*/"""
    source.add_table("uint16_t",
                     "unicode_letter_chars",
                     letter_chars,
                     letter_chars_desc)

    # --- non-letter identifier parts --------------------------------------
    non_letter_interval_sps_desc = """/**
* Character interval starting points for non-letter character
* that can be used as a non-first character of an identifier.
*
* The characters covered by these intervals are from
* the following Unicode categories: Nd, Mn, Mc, Pc
*/"""
    source.add_table("uint16_t",
                     "unicode_non_letter_ident_part_interval_sps",
                     non_letter_interval_sps,
                     non_letter_interval_sps_desc)

    non_letter_interval_lengths_desc = """/**
* Character interval lengths for non-letter character
* that can be used as a non-first character of an identifier.
*
* The characters covered by these intervals are from
* the following Unicode categories: Nd, Mn, Mc, Pc
*/"""
    source.add_table("uint8_t",
                     "unicode_non_letter_ident_part_interval_lengths",
                     non_letter_interval_lengths,
                     non_letter_interval_lengths_desc)

    non_letter_chars_desc = """/**
* Those non-letter characters that can be used as a non-first
* character of an identifier and not included in any of the intervals
* specified in jerry_unicode_non_letter_ident_part_interval_sps array.
*
* The characters are from the following Unicode categories:
* Nd, Mn, Mc, Pc
*/"""
    source.add_table("uint16_t",
                     "unicode_non_letter_ident_part_chars",
                     non_letter_chars,
                     non_letter_chars_desc)

    # --- separators -------------------------------------------------------
    separator_interval_sps_desc = """/**
* Unicode separator character interval starting points from Unicode category: Zs
*/"""
    source.add_table("uint16_t",
                     "unicode_separator_char_interval_sps",
                     separator_interval_sps,
                     separator_interval_sps_desc)

    separator_interval_lengths_desc = """/**
* Unicode separator character interval lengths from Unicode category: Zs
*/"""
    source.add_table("uint8_t",
                     "unicode_separator_char_interval_lengths",
                     separator_interval_lengths,
                     separator_interval_lengths_desc)

    separator_chars_desc = """/**
* Unicode separator characters that are not in the
* jerry_unicode_separator_char_intervals array.
*
* Unicode category: Zs
*/"""
    source.add_table("uint16_t",
                     "unicode_separator_chars",
                     separator_chars,
                     separator_chars_desc)

    source.write_source()
def read_categories():
    """
    Read the corresponding unicode values and store them in category lists.

    Only code points in the range 0x0080..0xFFFF are considered; ASCII
    characters and the supplementary planes are skipped. The non-breaking
    space (0x00A0) is removed from the separators, while 0x180E and 0x205F
    are added to them when they are not already present.

    :return: List of letters, non_letter and separators.
    """
    letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
    non_letter_category = ["Nd", "Mn", "Mc", "Pc"]
    separator_category = ["Zs"]

    letters = []
    non_letters = []
    separators = []

    # NOTE(review): every row contributes exactly one code point, so rows
    # that describe a range via a "First"/"Last" pair of entries end up as
    # two single characters - confirm this is intended.
    with open(script_args.unicode_data) as unicode_data:
        unicode_data_reader = csv.reader(unicode_data, delimiter=';')

        for line in unicode_data_reader:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing with an IndexError on line[0].
            if not line:
                continue

            unicode_id = int(line[0], 16)

            # Skip supplementary planes and ascii chars
            if unicode_id >= 0x10000 or unicode_id < 128:
                continue

            category = line[2]

            if category in letter_category:
                letters.append(unicode_id)
            elif category in non_letter_category:
                non_letters.append(unicode_id)
            elif category in separator_category:
                separators.append(unicode_id)

    # This separator char is handled separately
    non_breaking_space = 0x00A0
    if non_breaking_space in separators:
        separators.remove(non_breaking_space)

    # These separator chars are not in UnicodeData-3.0.0.txt or not in Zs category
    mongolian_vowel_separator = 0x180E
    medium_mathematical_space = 0x205F

    # insort keeps the list ordered, which ranges() relies on
    if mongolian_vowel_separator not in separators:
        bisect.insort(separators, mongolian_vowel_separator)
    if medium_mathematical_space not in separators:
        bisect.insort(separators, medium_mathematical_space)

    return letters, non_letters, separators
def ranges(i):
    """
    Convert an increasing list of integers into a range list

    Consecutive integers are merged into one (first, last) pair; an
    isolated integer is reported as a pair whose two members are equal.

    :param i: Increasing sequence of integers.
    :return: List of ranges.
    """
    # For consecutive integers the difference between the value and its
    # position is constant, so it serves as the grouping key. The pair is
    # indexed explicitly because tuple parameters (`lambda (x, y): ...`)
    # are a syntax error on Python 3.
    for _, group in itertools.groupby(enumerate(i), lambda pair: pair[1] - pair[0]):
        group = list(group)
        yield group[0][1], group[-1][1]
def split_list(category_list):
    """
    Split list of ranges into intervals and single char lists.

    A range whose first and last elements are equal becomes a single char.
    Every other range becomes one or more (starting point, length) pairs,
    where length is `last - first` and never exceeds 255 so that it fits
    into an uint8_t table; longer ranges are cut into 256-element pieces.

    :param category_list: List of (first, last) ranges in increasing order.
    :return: List of interval starting points, interval lengths and single chars
    """
    unicode_category_interval_sps = []
    unicode_category_interval_lengths = []
    unicode_category_chars = []

    for element in category_list:
        first, last = element[0], element[1]
        interval_length = last - first

        if interval_length == 0:
            unicode_category_chars.append(first)
        elif interval_length > 255:
            # `last + 1` keeps the final element when the length is an exact
            # multiple of 256; without it that element would be dropped.
            for start in range(first, last + 1, 256):
                length = min(255, last - start)
                if length == 0:
                    # A one-element remainder is stored as a single char,
                    # the same way as any other one-element range.
                    unicode_category_chars.append(start)
                else:
                    unicode_category_interval_sps.append(start)
                    unicode_category_interval_lengths.append(length)
        else:
            unicode_category_interval_sps.append(first)
            unicode_category_interval_lengths.append(interval_length)

    return unicode_category_interval_sps, unicode_category_interval_lengths, unicode_category_chars
class GenSource(object):
    """Class defines a default generated c source."""

    def __init__(self):
        """Start the source with the license header naming this script and the input file."""
        self._data = []

        header = """/* Copyright JS Foundation and other contributors, http://js.foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This file is automatically generated by the {SCRIPT} script
* from {UNICODES}. Do not edit!
*/
""".format(SCRIPT=os.path.basename(__file__), UNICODES=os.path.basename(script_args.unicode_data))

        self._data.append(header)

    def _regroup(self, l, n):
        """Cut the sequence `l` into consecutive slices of at most `n` items."""
        return [l[i:i+n] for i in range(0, len(l), n)]

    def _hex_format(self, ch):
        """Format an integer (or a one-character string) as 0x-prefixed, 4-digit lowercase hex."""
        if isinstance(ch, str):
            ch = ord(ch)

        return "0x{:04x}".format(ch)

    def _format_code(self, code, indent):
        """
        Format the values of `code` as the body of a C array initializer.

        :param code: Sequence of values accepted by _hex_format.
        :param indent: Number of indentation units put in front of each line.
        :return: Lines of 10 comma separated hex numbers joined by newlines.
        """
        # convert all characters to hex format
        converted_code = [self._hex_format(ch) for ch in code]

        # 10 hex number per line; grouping the items (instead of slicing the
        # joined text by a character count) keeps the lines intact even if a
        # formatted value is wider than usual
        lines = [(' ' * indent) + ", ".join(group)
                 for group in self._regroup(converted_code, 10)]

        # every line but the last one ends with the separating comma
        return ",\n".join(lines)

    def add_table(self, type_name, array_name, table, description=""):
        """
        Append a `static const` array definition to the source.

        :param type_name: C element type of the array.
        :param array_name: Name of the array; it is emitted with a `jerry_` prefix.
        :param table: Values of the array.
        :param description: Comment block placed in front of the definition.
        """
        table_str = """{DESC}
static const {TYPE} jerry_{NAME}[] JERRY_CONST_DATA =
{{
{TABLE}
}};
""".format(DESC=description, TYPE=type_name, NAME=array_name, TABLE=self._format_code(table, 1))

        self._data.append(table_str)

    def write_source(self):
        """Write everything collected so far to the output file given on the command line."""
        with open(script_args.c_source, 'w') as generated_source:
            generated_source.write(''.join(self._data))
# Generate the tables only when the file is executed as a script.
if __name__ == "__main__":
    main()