Add functions for iterating utf-8 strings.

JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
This commit is contained in:
Andrey Shitov
2015-07-01 23:33:39 +03:00
parent 0787d76b62
commit ae3eea8ae8
5 changed files with 529 additions and 114 deletions
+273 -93
View File
@@ -17,54 +17,6 @@
#include "jrt-libc-includes.h"
/**
* For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The
* Unicode Standard (http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404, tables 3-6, 3-7).
*/
/* Unicode code point range */
#define LIT_UNICODE_CODE_POINT_NULL (0x0)
#define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF)

/* UTF-16: largest value of a single code unit and the first code point that needs a surrogate pair */
#define LIT_UTF16_CODE_UNIT_MAX (0xFFFF)
#define LIT_UTF16_FIRST_SURROGATE_CODE_POINT (0x10000)

/* UTF-16: surrogate markers, surrogate value ranges and the number of payload bits per surrogate */
#define LIT_UTF16_LOW_SURROGATE_MARKER (0xDC00)
#define LIT_UTF16_HIGH_SURROGATE_MARKER (0xD800)
#define LIT_UTF16_HIGH_SURROGATE_MIN (0xD800)
#define LIT_UTF16_HIGH_SURROGATE_MAX (0xDBFF)
#define LIT_UTF16_LOW_SURROGATE_MIN (0xDC00)
#define LIT_UTF16_LOW_SURROGATE_MAX (0xDFFF)
#define LIT_UTF16_BITS_IN_SURROGATE (10)
#define LIT_UTF16_LAST_10_BITS_MASK (0x3FF)

/* UTF-8: a byte belongs to a class when (byte & <CLASS>_MASK) == <CLASS>_MARKER */
#define LIT_UTF8_1_BYTE_MARKER (0x00)
#define LIT_UTF8_2_BYTE_MARKER (0xC0)
#define LIT_UTF8_3_BYTE_MARKER (0xE0)
#define LIT_UTF8_4_BYTE_MARKER (0xF0)
#define LIT_UTF8_EXTRA_BYTE_MARKER (0x80)
#define LIT_UTF8_1_BYTE_MASK (0x80)
#define LIT_UTF8_2_BYTE_MASK (0xE0)
#define LIT_UTF8_3_BYTE_MASK (0xF0)
#define LIT_UTF8_4_BYTE_MASK (0xF8)
#define LIT_UTF8_EXTRA_BYTE_MASK (0xC0)

/* UTF-8: masks selecting the low N bits of a byte, and the payload width of a continuation byte */
#define LIT_UTF8_LAST_7_BITS_MASK (0x7F)
#define LIT_UTF8_LAST_6_BITS_MASK (0x3F)
#define LIT_UTF8_LAST_5_BITS_MASK (0x1F)
#define LIT_UTF8_LAST_4_BITS_MASK (0x0F)
#define LIT_UTF8_LAST_3_BITS_MASK (0x07)
#define LIT_UTF8_LAST_2_BITS_MASK (0x03)
#define LIT_UTF8_LAST_1_BIT_MASK (0x01)
#define LIT_UTF8_BITS_IN_EXTRA_BYTES (6)

/* UTF-8: code point range covered by each sequence length; the ranges are contiguous */
#define LIT_UTF8_1_BYTE_CODE_POINT_MAX (0x7F)
#define LIT_UTF8_2_BYTE_CODE_POINT_MIN (0x80)
#define LIT_UTF8_2_BYTE_CODE_POINT_MAX (0x7FF)
#define LIT_UTF8_3_BYTE_CODE_POINT_MIN (0x800)
#define LIT_UTF8_3_BYTE_CODE_POINT_MAX (LIT_UTF16_CODE_UNIT_MAX)
/* 4-byte sequences start at U+10000, directly after the 3-byte maximum (0xFFFF) */
#define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x10000)
#define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX)
/**
* Validate utf-8 string
*
@@ -175,18 +127,80 @@ lit_utf8_iterator_create (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
lit_utf8_size_t buf_size) /**< string size */
{
JERRY_ASSERT (utf8_buf_p || !buf_size);
JERRY_ASSERT (lit_is_utf8_string_valid (utf8_buf_p, buf_size));
lit_utf8_iterator_t buf_iter =
{
0,
buf_size,
utf8_buf_p,
0,
buf_size,
{
0,
false
}
};
return buf_iter;
} /* lit_utf8_iterator_create */
/**
 * Rewind the iterator so that it points to the beginning of the string
 */
void
lit_utf8_iterator_seek_bos (lit_utf8_iterator_t *iter_p) /**< iterator to reset */
{
  /* The beginning of the string is byte offset zero, outside of any surrogate pair. */
  iter_p->buf_pos.is_non_bmp_middle = false;
  iter_p->buf_pos.offset = 0;
} /* lit_utf8_iterator_seek_bos */
/**
 * Move the iterator so that it points to the end of the string
 */
void
lit_utf8_iterator_seek_eos (lit_utf8_iterator_t *iter_p) /**< iterator to reset */
{
  /* The end of the string is byte offset buf_size (masked to the offset field), outside of any surrogate pair. */
  iter_p->buf_pos.is_non_bmp_middle = false;
  iter_p->buf_pos.offset = iter_p->buf_size & LIT_ITERATOR_OFFSET_MASK;
} /* lit_utf8_iterator_seek_eos */
/**
 * Take a snapshot of the iterator's position, so that it can be restored later
 * with lit_utf8_iterator_seek
 *
 * @return copy of the iterator's current position
 */
lit_utf8_iterator_pos_t
lit_utf8_iterator_get_pos (const lit_utf8_iterator_t *iter_p) /**< utf-8 string iterator */
{
  const lit_utf8_iterator_pos_t saved_pos = iter_p->buf_pos;

  return saved_pos;
} /* lit_utf8_iterator_get_pos */
/**
 * Restore previously saved position of the iterator
 *
 * The position must not exceed the size of the iterated buffer. In debug builds it is
 * additionally checked that the position does not point into the middle of a multi-byte
 * sequence, and that a position flagged as "middle of a surrogate pair" points to a 4-byte
 * sequence.
 */
void
lit_utf8_iterator_seek (lit_utf8_iterator_t *iter_p, /**< utf-8 string iterator */
                        lit_utf8_iterator_pos_t iter_pos) /**< position to restore */
{
  JERRY_ASSERT (iter_pos.offset <= iter_p->buf_size);

#ifndef JERRY_NDEBUG
  if (iter_pos.offset < iter_p->buf_size)
  {
    lit_utf8_byte_t byte = *(iter_p->buf_p + iter_pos.offset);

    /* a valid position never points to a continuation byte */
    JERRY_ASSERT ((byte & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER);
    /* only code points encoded with 4 bytes are split into a surrogate pair */
    JERRY_ASSERT (!iter_pos.is_non_bmp_middle || ((byte & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER));
  }
  else
  {
    /*
     * End-of-string position: there is no byte to inspect (reading buf_p[buf_size] would be
     * an out-of-bounds access), and no surrogate pair can be pending here.
     */
    JERRY_ASSERT (!iter_pos.is_non_bmp_middle);
  }
#endif

  iter_p->buf_pos = iter_pos;
} /* lit_utf8_iterator_seek */
/**
 * Get the iterator's position expressed in code units
 *
 * @return number of code units located before the iterator's current position
 */
ecma_length_t
lit_utf8_iterator_get_index (const lit_utf8_iterator_t *iter_p) /**< utf-8 string iterator */
{
  /* code units of everything that precedes the current byte offset */
  ecma_length_t units_before_offset = lit_utf8_string_length (iter_p->buf_p, iter_p->buf_pos.offset);

  /* plus one if the high surrogate of the current code point has already been passed */
  return units_before_offset + iter_p->buf_pos.is_non_bmp_middle;
} /* lit_utf8_iterator_get_index */
/**
* Represents code point (>0xFFFF) as surrogate pair and returns its lower part
*
@@ -221,26 +235,71 @@ convert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code poi
} /* convert_code_point_to_low_surrogate */
/**
* Get next code unit from the iterated string and increment iterator to point to next code unit
* Get next code unit from the iterated string
*
* lit_utf8_iterator_peek_next takes a const iterator and does not modify its position.
* A code point above LIT_UTF16_CODE_UNIT_MAX is reported as one half of a surrogate pair:
* the low surrogate if the iterator is in the middle of the pair, the high surrogate otherwise.
*
* NOTE(review): this span interleaves lines of lit_utf8_iterator_read_code_unit_and_increment
* (buf_iter_p / code_point / buf_offset) with lit_utf8_iterator_peek_next (iter_p / buf_pos);
* it looks like a rendered diff without +/- markers -- confirm against the real file.
*
* @return next code unit
*/
ecma_char_t
lit_utf8_iterator_read_code_unit_and_increment (lit_utf8_iterator_t *buf_iter_p) /**< @in-out: utf-8 string iterator */
lit_utf8_iterator_peek_next (const lit_utf8_iterator_t *iter_p) /**< @in: utf-8 string iterator */
{
JERRY_ASSERT (!lit_utf8_iterator_reached_buffer_end (buf_iter_p));
if (buf_iter_p->code_point)
{
ecma_char_t code_unit = convert_code_point_to_low_surrogate (buf_iter_p->code_point);
buf_iter_p->code_point = 0;
return code_unit;
}
JERRY_ASSERT (!lit_utf8_iterator_is_eos (iter_p));
lit_code_point_t code_point;
buf_iter_p->buf_offset += lit_read_code_point_from_utf8 (buf_iter_p->buf_p + buf_iter_p->buf_offset,
buf_iter_p->buf_size - buf_iter_p->buf_offset,
&code_point);
/* decode the code point at the current offset; the returned size is not used, since nothing is consumed */
lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
iter_p->buf_size - iter_p->buf_pos.offset,
&code_point);
/* the code point fits into a single code unit */
if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
{
JERRY_ASSERT (!iter_p->buf_pos.is_non_bmp_middle);
return (ecma_char_t) code_point;
}
else
{
/* the code point is represented by a surrogate pair: choose the half that is next */
if (iter_p->buf_pos.is_non_bmp_middle)
{
return convert_code_point_to_low_surrogate (code_point);
}
else
{
return convert_code_point_to_high_surrogate (code_point);
}
}
} /* lit_utf8_iterator_peek_next */
/**
* Get previous code unit from the iterated string
*
* The iterator is taken as const and its position is not modified.
*
* NOTE(review): a hunk header cuts through the body below and at least one line
* (buf_iter_p->code_point = ...) uses the naming of the replaced implementation;
* this looks like a rendered diff -- confirm against the real file.
*
* @return previous code unit
*/
ecma_char_t
lit_utf8_iterator_peek_prev (const lit_utf8_iterator_t *iter_p) /**< @in: utf-8 string iterator */
{
JERRY_ASSERT (!lit_utf8_iterator_is_bos (iter_p));
lit_code_point_t code_point;
lit_utf8_size_t offset = iter_p->buf_pos.offset;
/* in the middle of a surrogate pair the previous code unit is the high surrogate of the current code point */
if (iter_p->buf_pos.is_non_bmp_middle)
{
lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
iter_p->buf_size - iter_p->buf_pos.offset,
&code_point);
return convert_code_point_to_high_surrogate (code_point);
}
/* step back over continuation bytes until the first byte of the previous sequence is found */
do
{
JERRY_ASSERT (offset != 0);
offset--;
}
while ((iter_p->buf_p[offset] & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
JERRY_ASSERT (iter_p->buf_pos.offset - offset <= LIT_UTF8_MAX_BYTES_IN_CODE_POINT);
/* decode the previous code point from the local offset; the iterator itself is left untouched */
lit_read_code_point_from_utf8 (iter_p->buf_p + offset,
iter_p->buf_size - offset,
&code_point);
if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
{
@@ -248,32 +307,153 @@ lit_utf8_iterator_read_code_unit_and_increment (lit_utf8_iterator_t *buf_iter_p)
}
else
{
buf_iter_p->code_point = code_point;
/* going backwards, the low surrogate of a surrogate pair is met first */
return convert_code_point_to_low_surrogate (code_point);
}
} /* lit_utf8_iterator_peek_prev */
/**
 * Move the iterator forward by one code unit
 */
void
lit_utf8_iterator_incr (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */
{
  /* only the position change is needed here, the code unit that was read is dropped */
  (void) lit_utf8_iterator_read_next (iter_p);
} /* lit_utf8_iterator_incr */
/**
 * Move the iterator backward by one code unit
 */
void
lit_utf8_iterator_decr (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */
{
  /* only the position change is needed here, the code unit that was read is dropped */
  (void) lit_utf8_iterator_read_prev (iter_p);
} /* lit_utf8_iterator_decr */
/**
 * Move the iterator forward by the specified number of code units
 */
void
lit_utf8_iterator_advance (lit_utf8_iterator_t *iter_p, /**< in-out: iterator */
                           ecma_length_t chars_count) /**< number of code units to skip */
{
  /* chars_count is a by-value parameter, so it can serve as the countdown itself */
  for (; chars_count != 0; chars_count--)
  {
    lit_utf8_iterator_incr (iter_p);
  }
} /* lit_utf8_iterator_advance */
/**
 * Read the code unit the iterator points to and move the iterator to the following code unit
 *
 * A code point above LIT_UTF16_CODE_UNIT_MAX is delivered as two code units (high surrogate,
 * then low surrogate) by two consecutive calls; the byte offset is advanced only after the
 * low surrogate has been returned.
 *
 * @return next code unit
 */
ecma_char_t
lit_utf8_iterator_read_next (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */
{
  JERRY_ASSERT (!lit_utf8_iterator_is_eos (iter_p));

  lit_code_point_t code_point;
  lit_utf8_size_t utf8_char_size = lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
                                                                  iter_p->buf_size - iter_p->buf_pos.offset,
                                                                  &code_point);

  /* Code point that fits into one code unit: consume its bytes and return it as is. */
  if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
  {
    JERRY_ASSERT (!iter_p->buf_pos.is_non_bmp_middle);
    iter_p->buf_pos.offset = (iter_p->buf_pos.offset + utf8_char_size) & LIT_ITERATOR_OFFSET_MASK;
    return (ecma_char_t) code_point;
  }

  /* First half of a surrogate pair: stay on the same bytes, only remember that the pair was entered. */
  if (!iter_p->buf_pos.is_non_bmp_middle)
  {
    iter_p->buf_pos.is_non_bmp_middle = true;
    return convert_code_point_to_high_surrogate (code_point);
  }

  /* Second half of a surrogate pair: now the bytes of the code point are consumed. */
  iter_p->buf_pos.offset = (iter_p->buf_pos.offset + utf8_char_size) & LIT_ITERATOR_OFFSET_MASK;
  iter_p->buf_pos.is_non_bmp_middle = false;
  return convert_code_point_to_low_surrogate (code_point);
} /* lit_utf8_iterator_read_next */
/**
* Get previous code unit from the iterated string and decrement iterator to point to previous code unit
*
* A code point above LIT_UTF16_CODE_UNIT_MAX is delivered as two code units by two consecutive
* calls: the low surrogate first (the byte offset is moved to the start of the code point and
* is_non_bmp_middle is set), then the high surrogate (is_non_bmp_middle is cleared).
*
* NOTE(review): the three lines following the first closing brace of the surrogate branch
* (JERRY_ASSERT (false) ... lit_utf8_iterator_read_code_unit_and_increment) look like the tail
* of the replaced function left over from a rendered diff -- confirm against the real file.
*
* @return previous code unit
*/
ecma_char_t
lit_utf8_iterator_read_prev (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */
{
JERRY_ASSERT (!lit_utf8_iterator_is_bos (iter_p));
lit_code_point_t code_point;
lit_utf8_size_t offset = iter_p->buf_pos.offset;
/* in the middle of a surrogate pair: return the high surrogate of the current code point, offset stays as is */
if (iter_p->buf_pos.is_non_bmp_middle)
{
lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
iter_p->buf_size - iter_p->buf_pos.offset,
&code_point);
iter_p->buf_pos.is_non_bmp_middle = false;
return convert_code_point_to_high_surrogate (code_point);
}
JERRY_ASSERT (false);
return LIT_CHAR_NULL;
} /* lit_utf8_iterator_read_code_unit_and_increment */
/* step back over continuation bytes until the first byte of the previous sequence is found */
do
{
JERRY_ASSERT (offset != 0);
offset--;
}
while ((iter_p->buf_p[offset] & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
JERRY_ASSERT (iter_p->buf_pos.offset - offset <= LIT_UTF8_MAX_BYTES_IN_CODE_POINT);
/* move the iterator to the start of the previous code point, then decode it from there */
iter_p->buf_pos.offset = (offset) & LIT_ITERATOR_OFFSET_MASK;
lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
iter_p->buf_size - iter_p->buf_pos.offset,
&code_point);
if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
{
return (ecma_char_t) code_point;
}
else
{
/* going backwards, the low surrogate is returned first; the high surrogate is left for the next call */
iter_p->buf_pos.is_non_bmp_middle = true;
return convert_code_point_to_low_surrogate (code_point);
}
} /* lit_utf8_iterator_read_prev */
/**
* Checks iterator reached end of the string
*
* NOTE(review): this span interleaves the body of lit_utf8_iterator_reached_buffer_end
* (buf_iter_p / buf_offset / code_point) with lit_utf8_iterator_is_eos (iter_p / buf_pos);
* it looks like a rendered diff without +/- markers -- confirm against the real file.
*
* @return true - the whole string was iterated
* @return true - iterator is at the end of string
* false - otherwise
*/
bool
lit_utf8_iterator_reached_buffer_end (const lit_utf8_iterator_t *buf_iter_p) /**< utf-8 string iterator */
lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *iter_p) /**< utf-8 string iterator */
{
JERRY_ASSERT (buf_iter_p->buf_offset <= buf_iter_p->buf_size);
JERRY_ASSERT (iter_p->buf_pos.offset <= iter_p->buf_size);
if (buf_iter_p->code_point == LIT_UNICODE_CODE_POINT_NULL && buf_iter_p->buf_offset == buf_iter_p->buf_size)
{
return true;
}
/* lit_utf8_iterator_is_eos: the end is reached when the byte offset equals the buffer size */
return (iter_p->buf_pos.offset == iter_p->buf_size);
} /* lit_utf8_iterator_is_eos */
return false;
} /* lit_utf8_iterator_reached_buffer_end */
/**
 * Check whether the iterator points to the beginning of the string
 *
 * @return true - iterator is at the beginning of a string
 *         false - otherwise
 */
bool
lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *iter_p) /**< utf-8 string iterator */
{
  /* a position between the two halves of a surrogate pair is never the beginning of the string */
  if (iter_p->buf_pos.is_non_bmp_middle)
  {
    return false;
  }

  return (iter_p->buf_pos.offset == 0);
} /* lit_utf8_iterator_is_bos */
/**
* Calculate size of a zero-terminated utf-8 string
@@ -300,12 +480,12 @@ lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
{
ecma_length_t length = 0;
lit_utf8_iterator_t buf_iter = lit_utf8_iterator_create (utf8_buf_p, utf8_buf_size);
while (!lit_utf8_iterator_reached_buffer_end (&buf_iter))
while (!lit_utf8_iterator_is_eos (&buf_iter))
{
lit_utf8_iterator_read_code_unit_and_increment (&buf_iter);
lit_utf8_iterator_read_next (&buf_iter);
length++;
}
JERRY_ASSERT (lit_utf8_iterator_reached_buffer_end (&buf_iter));
JERRY_ASSERT (lit_utf8_iterator_is_eos (&buf_iter));
return length;
} /* lit_utf8_string_length */
@@ -375,13 +555,13 @@ lit_utf8_string_calc_hash_last_bytes (const lit_utf8_byte_t *utf8_buf_p, /**< ch
{
JERRY_ASSERT (utf8_buf_p != NULL);
lit_utf8_size_t byte1 = utf8_buf_size > 0 ? utf8_buf_p[utf8_buf_size - 1] : (lit_utf8_size_t) 0;
lit_utf8_size_t byte2 = utf8_buf_size > 1 ? utf8_buf_p[utf8_buf_size - 2] : (lit_utf8_size_t) 0;
lit_utf8_byte_t byte1 = (utf8_buf_size > 0) ? utf8_buf_p[utf8_buf_size - 1] : 0;
lit_utf8_byte_t byte2 = (utf8_buf_size > 1) ? utf8_buf_p[utf8_buf_size - 2] : 0;
lit_utf8_size_t t1 = byte1 + byte2;
lit_utf8_size_t t2 = t1 * 0x24418b66;
lit_utf8_size_t t3 = (t2 >> 16) ^ (t2 & 0xffffu);
lit_utf8_size_t t4 = (t3 >> 8) ^ (t3 & 0xffu);
uint32_t t1 = (uint32_t) byte1 + (uint32_t) byte2;
uint32_t t2 = t1 * 0x24418b66;
uint32_t t3 = (t2 >> 16) ^ (t2 & 0xffffu);
uint32_t t4 = (t3 >> 8) ^ (t3 & 0xffu);
return (lit_string_hash_t) t4;
} /* lit_utf8_string_calc_hash_last_bytes */
@@ -404,8 +584,8 @@ lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 stri
do
{
JERRY_ASSERT (!lit_utf8_iterator_reached_buffer_end (&iter));
code_unit = lit_utf8_iterator_read_code_unit_and_increment (&iter);
JERRY_ASSERT (!lit_utf8_iterator_is_eos (&iter));
code_unit = lit_utf8_iterator_read_next (&iter);
}
while (code_unit_offset--);
@@ -560,11 +740,11 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**<
lit_utf8_iterator_t iter1 = lit_utf8_iterator_create (string1_p, string1_size);
lit_utf8_iterator_t iter2 = lit_utf8_iterator_create (string2_p, string2_size);
while (!lit_utf8_iterator_reached_buffer_end (&iter1)
&& !lit_utf8_iterator_reached_buffer_end (&iter2))
while (!lit_utf8_iterator_is_eos (&iter1)
&& !lit_utf8_iterator_is_eos (&iter2))
{
ecma_char_t code_point1 = lit_utf8_iterator_read_code_unit_and_increment (&iter1);
ecma_char_t code_point2 = lit_utf8_iterator_read_code_unit_and_increment (&iter2);
ecma_char_t code_point1 = lit_utf8_iterator_read_next (&iter1);
ecma_char_t code_point2 = lit_utf8_iterator_read_next (&iter2);
if (code_point1 < code_point2)
{
return true;
@@ -575,5 +755,5 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**<
}
}
return (lit_utf8_iterator_reached_buffer_end (&iter1) && !lit_utf8_iterator_reached_buffer_end (&iter2));
return (lit_utf8_iterator_is_eos (&iter1) && !lit_utf8_iterator_is_eos (&iter2));
} /* lit_compare_utf8_strings_relational */