Refactor builtins to handle CESU-8 encoded strings.

JerryScript-DCO-1.0-Signed-off-by: Zsolt Borbély <zsborbely.u-szeged@partner.samsung.com>
JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai <dbatyai.u-szeged@partner.samsung.com>
This commit is contained in:
Dániel Bátyai
2015-09-09 14:27:17 +02:00
parent dcd610b305
commit 579b1edaa5
17 changed files with 517 additions and 696 deletions
+17 -154
View File
@@ -194,7 +194,7 @@ lit_is_cesu8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string
if (idx + extra_bytes_count > buf_size)
{
/* utf-8 string breaks in the middle */
/* cesu-8 string breaks in the middle */
return false;
}
@@ -212,7 +212,7 @@ lit_is_cesu8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string
if (code_point < min_code_point)
{
/* utf-8 string doesn't encode valid unicode code point */
/* cesu-8 string doesn't encode valid unicode code point */
return false;
}
@@ -254,8 +254,7 @@ lit_utf8_iterator_create (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
lit_utf8_size_t buf_size) /**< string size */
{
JERRY_ASSERT (utf8_buf_p || !buf_size);
/* TODO: Add back when builtins no longer use iterators */
/* JERRY_ASSERT (lit_is_utf8_string_valid (utf8_buf_p, buf_size)); */
JERRY_ASSERT (lit_is_utf8_string_valid (utf8_buf_p, buf_size));
lit_utf8_iterator_t buf_iter =
{
@@ -277,16 +276,6 @@ lit_utf8_iterator_seek_bos (lit_utf8_iterator_t *iter_p) /**< iterator to reset
iter_p->buf_pos.is_non_bmp_middle = false;
} /* lit_utf8_iterator_seek_bos */
/**
 * Move the iterator to the end of the iterated string.
 *
 * The byte offset is set to the buffer size and the
 * "middle of a non-BMP character" flag is cleared.
 */
void
lit_utf8_iterator_seek_eos (lit_utf8_iterator_t *iter_p) /**< iterator to reset */
{
/* The size is masked before being stored; presumably the offset field is narrower
 * than buf_size - TODO confirm against the iterator position type. */
iter_p->buf_pos.offset = iter_p->buf_size & LIT_ITERATOR_OFFSET_MASK;
iter_p->buf_pos.is_non_bmp_middle = false;
} /* lit_utf8_iterator_seek_eos */
/**
* Save iterator's position to restore it later
*
@@ -315,17 +304,6 @@ lit_utf8_iterator_seek (lit_utf8_iterator_t *iter_p, /**< utf-8 string iterator
iter_p->buf_pos = iter_pos;
} /* lit_utf8_iterator_seek */
/**
 * Get offset (in code units) of the iterator
 *
 * The result is the length of the buffer prefix that ends at the iterator's byte
 * offset, as computed by lit_utf8_string_length, plus the value of the
 * is_non_bmp_middle flag.
 * NOTE(review): the flag presumably accounts for the surrogate that has already
 * been passed inside the current code point - confirm against the read routines.
 *
 * @return current offset of the iterator in code units
 */
ecma_length_t
lit_utf8_iterator_get_index (const lit_utf8_iterator_t *iter_p) /**< utf-8 string iterator */
{
return lit_utf8_string_length (iter_p->buf_p, iter_p->buf_pos.offset) + iter_p->buf_pos.is_non_bmp_middle;
} /* lit_utf8_iterator_get_index */
/**
* Represents code point (>0xFFFF) as surrogate pair and returns its lower part
*
@@ -357,7 +335,7 @@ convert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code poi
code_unit_bits = (ecma_char_t) ((code_point - LIT_UTF16_FIRST_SURROGATE_CODE_POINT) >> LIT_UTF16_BITS_IN_SURROGATE);
return (LIT_UTF16_HIGH_SURROGATE_MARKER | code_unit_bits);
} /* convert_code_point_to_low_surrogate */
} /* convert_code_point_to_high_surrogate */
/**
* Get next code unit from the iterated string
@@ -392,50 +370,6 @@ lit_utf8_iterator_peek_next (const lit_utf8_iterator_t *iter_p) /**< @in: utf-8
}
} /* lit_utf8_iterator_peek_next */
/**
 * Get previous code unit from the iterated string
 *
 * The iterator itself is not modified (it is taken as a pointer to const).
 * Must not be called when the iterator is at the beginning of the string (asserted).
 *
 * @return previous code unit
 */
ecma_char_t
lit_utf8_iterator_peek_prev (const lit_utf8_iterator_t *iter_p) /**< @in: utf-8 string iterator */
{
JERRY_ASSERT (!lit_utf8_iterator_is_bos (iter_p));
lit_code_point_t code_point;
lit_utf8_size_t offset = iter_p->buf_pos.offset;
/* Iterator is flagged as standing in the middle of a non-BMP character:
 * decode the code point at the current offset and return its high surrogate. */
if (iter_p->buf_pos.is_non_bmp_middle)
{
lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
iter_p->buf_size - iter_p->buf_pos.offset,
&code_point);
return convert_code_point_to_high_surrogate (code_point);
}
/* Step back at least one byte, and keep stepping while the byte under the
 * local offset carries the extra-byte marker. */
do
{
JERRY_ASSERT (offset != 0);
offset--;
}
while ((iter_p->buf_p[offset] & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
/* The distance walked back must not exceed the maximum size of one encoded code point. */
JERRY_ASSERT (iter_p->buf_pos.offset - offset <= LIT_UTF8_MAX_BYTES_IN_CODE_POINT);
lit_read_code_point_from_utf8 (iter_p->buf_p + offset,
iter_p->buf_size - offset,
&code_point);
/* A code point that fits into a single code unit is returned as is;
 * otherwise the low surrogate of its pair is the previous code unit. */
if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
{
return (ecma_char_t) code_point;
}
else
{
return convert_code_point_to_low_surrogate (code_point);
}
} /* lit_utf8_iterator_peek_prev */
/**
* Increment iterator to point to next code unit
*/
@@ -443,16 +377,7 @@ void
lit_utf8_iterator_incr (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */
{
lit_utf8_iterator_read_next (iter_p);
} /* lit_utf8_iterator_read_next */
/**
 * Decrement iterator to point to previous code unit
 *
 * Delegates to lit_utf8_iterator_read_prev and discards the code unit it returns.
 */
void
lit_utf8_iterator_decr (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */
{
lit_utf8_iterator_read_prev (iter_p);
} /* lit_utf8_iterator_decr */
} /* lit_utf8_iterator_incr */
/**
* Skip specified number of code units
@@ -504,56 +429,6 @@ lit_utf8_iterator_read_next (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 st
}
} /* lit_utf8_iterator_read_next */
/**
 * Get previous code unit from the iterated string and decrement iterator to point to previous code unit
 *
 * Must not be called when the iterator is at the beginning of the string (asserted).
 *
 * @return previous code unit
 */
ecma_char_t
lit_utf8_iterator_read_prev (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */
{
JERRY_ASSERT (!lit_utf8_iterator_is_bos (iter_p));
lit_code_point_t code_point;
lit_utf8_size_t offset = iter_p->buf_pos.offset;
/* Iterator is flagged as standing in the middle of a non-BMP character:
 * return the high surrogate of the code point at the current offset and
 * clear the flag; the byte offset is left unchanged. */
if (iter_p->buf_pos.is_non_bmp_middle)
{
lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
iter_p->buf_size - iter_p->buf_pos.offset,
&code_point);
iter_p->buf_pos.is_non_bmp_middle = false;
return convert_code_point_to_high_surrogate (code_point);
}
/* Step back at least one byte, and keep stepping while the byte under the
 * local offset carries the extra-byte marker. */
do
{
JERRY_ASSERT (offset != 0);
offset--;
}
while ((iter_p->buf_p[offset] & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
/* The distance walked back must not exceed the maximum size of one encoded code point. */
JERRY_ASSERT (iter_p->buf_pos.offset - offset <= LIT_UTF8_MAX_BYTES_IN_CODE_POINT);
/* Commit the new position, then decode the code point that starts there. */
iter_p->buf_pos.offset = (offset) & LIT_ITERATOR_OFFSET_MASK;
lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
iter_p->buf_size - iter_p->buf_pos.offset,
&code_point);
/* A code point that fits into a single code unit is returned as is. */
if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
{
return (ecma_char_t) code_point;
}
else
{
/* Otherwise return the low surrogate and set the flag, so that the next
 * backward read takes the branch that returns the high surrogate. */
iter_p->buf_pos.is_non_bmp_middle = true;
return convert_code_point_to_low_surrogate (code_point);
}
} /* lit_utf8_iterator_read_prev */
/**
* Checks iterator reached end of the string
*
@@ -568,18 +443,6 @@ lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *iter_p) /**< utf-8 string i
return (iter_p->buf_pos.offset == iter_p->buf_size);
} /* lit_utf8_iterator_is_eos */
/**
 * Checks iterator reached beginning of the string
 *
 * The iterator is at the beginning only when its byte offset is zero and it is
 * not flagged as being in the middle of a non-BMP character.
 *
 * @return true - iterator is at the beginning of a string
 *         false - otherwise
 */
bool
lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *iter_p) /**< utf-8 string iterator */
{
return (iter_p->buf_pos.offset == 0 && iter_p->buf_pos.is_non_bmp_middle == false);
} /* lit_utf8_iterator_is_bos */
/**
* Calculate size of a zero-terminated utf-8 string
*
@@ -595,7 +458,7 @@ lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p) /**< zero-terminated
} /* lit_zt_utf8_string_size */
/**
* Calculate length of a cesu-8 string
* Calculate length of a cesu-8 encoded string
*
* @return UTF-16 code units count
*/
@@ -733,7 +596,7 @@ lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer wit
/**
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
*
* @return read character
* @return next code unit
*/
ecma_char_t
lit_utf8_read_next (lit_utf8_byte_t **buf_p) /**< in-out:buffer with characters */
@@ -749,7 +612,7 @@ lit_utf8_read_next (lit_utf8_byte_t **buf_p) /**< in-out:buffer with characters
/**
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
*
* @return read character
* @return previous code unit
*/
ecma_char_t
lit_utf8_read_prev (lit_utf8_byte_t **buf_p) /**< in-out:buffer with characters */
@@ -766,7 +629,7 @@ lit_utf8_read_prev (lit_utf8_byte_t **buf_p) /**< in-out:buffer with characters
/**
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
*
* @return read character
* @return next code unit
*/
ecma_char_t
lit_utf8_peek_next (lit_utf8_byte_t *buf_p) /**< in-out:buffer with characters */
@@ -782,7 +645,7 @@ lit_utf8_peek_next (lit_utf8_byte_t *buf_p) /**< in-out:buffer with characters *
/**
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
*
* @return read character
* @return previous code unit
*/
ecma_char_t
lit_utf8_peek_prev (lit_utf8_byte_t *buf_p) /**< in-out:buffer with characters */
@@ -796,7 +659,7 @@ lit_utf8_peek_prev (lit_utf8_byte_t *buf_p) /**< in-out:buffer with characters *
} /* lit_utf8_peek_prev */
/**
* Increase character pointer by one code unit.
* Increase cesu-8 encoded string pointer by one code unit.
*/
void
lit_utf8_incr (lit_utf8_byte_t **buf_p) /**< in-out:buffer with characters */
@@ -807,7 +670,7 @@ lit_utf8_incr (lit_utf8_byte_t **buf_p) /**< in-out:buffer with characters */
} /* lit_utf8_incr */
/**
* Decrease character pointer by one code unit.
* Decrease cesu-8 encoded string pointer by one code unit.
*/
void
lit_utf8_decr (lit_utf8_byte_t **buf_p) /**< in-out:buffer with characters */
@@ -915,9 +778,9 @@ lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte)
} /* lit_get_unicode_char_size_by_utf8_first_byte */
/**
* Convert code_unit to cesu-8 representation
* Convert code unit to cesu-8 representation
*
* @return bytes count, stored required to represent specified code unit
* @return byte count required to represent the code unit
*/
lit_utf8_size_t
lit_code_unit_to_utf8 (ecma_char_t code_unit, /**< code unit */
@@ -964,7 +827,7 @@ lit_code_unit_to_utf8 (ecma_char_t code_unit, /**< code unit */
/**
* Convert code point to cesu-8 representation
*
* @return bytes count, stored required to represent specified code unit
* @return byte count required to represent the code point
*/
lit_utf8_size_t
lit_code_point_to_cesu8 (lit_code_point_t code_point, /**< code point */
@@ -986,7 +849,7 @@ lit_code_point_to_cesu8 (lit_code_point_t code_point, /**< code point */
/**
* Convert code point to utf-8 representation
*
* @return bytes count, stored required to represent specified code unit
* @return byte count required to represent the code point
*/
lit_utf8_size_t
lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
@@ -1073,7 +936,7 @@ lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high
code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN);
return code_point;
} /* lit_surrogate_pair_to_code_point */
} /* lit_convert_surrogate_pair_to_code_point */
/**
* Compare cesu-8 string to cesu-8 string