Add functions for iterating utf-8 strings.
JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
This commit is contained in:
@@ -25,16 +25,82 @@
|
||||
*/
|
||||
/**
 * Value of the null byte
 */
#define LIT_BYTE_NULL (0)

/**
 * For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The
 * Unicode Standard (http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404, tables 3-6, 3-7).
 */

/*
 * Bounds of the Unicode code point range
 */
#define LIT_UNICODE_CODE_POINT_NULL (0x0)
#define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF)

/*
 * UTF-16: largest code unit value, first code point that does not fit into a single code unit,
 * surrogate markers / ranges and the number of payload bits carried by one surrogate
 */
#define LIT_UTF16_CODE_UNIT_MAX (0xFFFF)
#define LIT_UTF16_FIRST_SURROGATE_CODE_POINT (0x10000)
#define LIT_UTF16_LOW_SURROGATE_MARKER (0xDC00)
#define LIT_UTF16_HIGH_SURROGATE_MARKER (0xD800)
#define LIT_UTF16_HIGH_SURROGATE_MIN (0xD800)
#define LIT_UTF16_HIGH_SURROGATE_MAX (0xDBFF)
#define LIT_UTF16_LOW_SURROGATE_MIN (0xDC00)
#define LIT_UTF16_LOW_SURROGATE_MAX (0xDFFF)
#define LIT_UTF16_BITS_IN_SURROGATE (10)
#define LIT_UTF16_LAST_10_BITS_MASK (0x3FF)

/*
 * UTF-8: fixed high-order bit patterns of the first byte of 1..4 byte sequences
 * and of every extra (continuation) byte
 */
#define LIT_UTF8_1_BYTE_MARKER (0x00)
#define LIT_UTF8_2_BYTE_MARKER (0xC0)
#define LIT_UTF8_3_BYTE_MARKER (0xE0)
#define LIT_UTF8_4_BYTE_MARKER (0xF0)
#define LIT_UTF8_EXTRA_BYTE_MARKER (0x80)

/*
 * UTF-8: masks selecting the marker bits, i.e. a byte b belongs to class X
 * when (b & LIT_UTF8_X_MASK) == LIT_UTF8_X_MARKER
 */
#define LIT_UTF8_1_BYTE_MASK (0x80)
#define LIT_UTF8_2_BYTE_MASK (0xE0)
#define LIT_UTF8_3_BYTE_MASK (0xF0)
#define LIT_UTF8_4_BYTE_MASK (0xF8)
#define LIT_UTF8_EXTRA_BYTE_MASK (0xC0)

/*
 * Masks with the N lowest bits set
 */
#define LIT_UTF8_LAST_7_BITS_MASK (0x7F)
#define LIT_UTF8_LAST_6_BITS_MASK (0x3F)
#define LIT_UTF8_LAST_5_BITS_MASK (0x1F)
#define LIT_UTF8_LAST_4_BITS_MASK (0x0F)
#define LIT_UTF8_LAST_3_BITS_MASK (0x07)
#define LIT_UTF8_LAST_2_BITS_MASK (0x03)
#define LIT_UTF8_LAST_1_BIT_MASK (0x01)

/*
 * Number of payload bits stored in each extra (continuation) byte
 */
#define LIT_UTF8_BITS_IN_EXTRA_BYTES (6)

/*
 * Code point ranges covered by utf-8 sequences of each length
 */
#define LIT_UTF8_1_BYTE_CODE_POINT_MAX (0x7F)
#define LIT_UTF8_2_BYTE_CODE_POINT_MIN (0x80)
#define LIT_UTF8_2_BYTE_CODE_POINT_MAX (0x7FF)
#define LIT_UTF8_3_BYTE_CODE_POINT_MIN (0x800)
#define LIT_UTF8_3_BYTE_CODE_POINT_MAX (LIT_UTF16_CODE_UNIT_MAX)
#define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x10000)
#define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX)
|
||||
|
||||
/**
 * Width (in bits) of the offset field in lit_utf8_iterator_pos_t structure
 */
#define LIT_ITERATOR_OFFSET_WIDTH (31)

/**
 * Iterator's offset field mask: the LIT_ITERATOR_OFFSET_WIDTH lowest bits are set (0x7FFFFFFF).
 *
 * Note: the expression has type unsigned long long because of the 1ull operand.
 */
#define LIT_ITERATOR_OFFSET_MASK ((1ull << LIT_ITERATOR_OFFSET_WIDTH) - 1)
|
||||
|
||||
/**
|
||||
* Represents position of the iterator
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
lit_utf8_size_t offset : LIT_ITERATOR_OFFSET_WIDTH; /** offset to utf-8 char */
|
||||
bool is_non_bmp_middle: 1; /** flag indicating that current position of the iterator is the middle of
|
||||
* 4-byte char */
|
||||
} lit_utf8_iterator_pos_t;
|
||||
|
||||
/**
|
||||
* Represents an iterator over utf-8 buffer
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
lit_utf8_size_t buf_offset; /* current offset in the buffer */
|
||||
lit_utf8_size_t buf_size; /* buffer length */
|
||||
const lit_utf8_byte_t *buf_p; /* buffer */
|
||||
lit_code_point_t code_point; /* code point is saved here when processed Unicode character is higher than
|
||||
* 0xFFFF */
|
||||
lit_utf8_size_t buf_size; /* buffer length */
|
||||
lit_utf8_iterator_pos_t buf_pos; /* position in the buffer */
|
||||
} lit_utf8_iterator_t;
|
||||
|
||||
/* validation */
|
||||
@@ -42,8 +108,27 @@ bool lit_is_utf8_string_valid (const lit_utf8_byte_t *, lit_utf8_size_t);
|
||||
|
||||
/* iteration */
|
||||
lit_utf8_iterator_t lit_utf8_iterator_create (const lit_utf8_byte_t *, lit_utf8_size_t);
|
||||
ecma_char_t lit_utf8_iterator_read_code_unit_and_increment (lit_utf8_iterator_t *);
|
||||
bool lit_utf8_iterator_reached_buffer_end (const lit_utf8_iterator_t *);
|
||||
|
||||
void lit_utf8_iterator_seek_bos (lit_utf8_iterator_t *);
|
||||
void lit_utf8_iterator_seek_eos (lit_utf8_iterator_t *);
|
||||
|
||||
lit_utf8_iterator_pos_t lit_utf8_iterator_get_pos (const lit_utf8_iterator_t *);
|
||||
void lit_utf8_iterator_seek (lit_utf8_iterator_t *, lit_utf8_iterator_pos_t);
|
||||
|
||||
ecma_length_t lit_utf8_iterator_get_index (const lit_utf8_iterator_t *);
|
||||
|
||||
ecma_char_t lit_utf8_iterator_peek_next (const lit_utf8_iterator_t *);
|
||||
ecma_char_t lit_utf8_iterator_peek_prev (const lit_utf8_iterator_t *);
|
||||
|
||||
void lit_utf8_iterator_incr (lit_utf8_iterator_t *);
|
||||
void lit_utf8_iterator_decr (lit_utf8_iterator_t *);
|
||||
void lit_utf8_iterator_advance (lit_utf8_iterator_t *, ecma_length_t);
|
||||
|
||||
ecma_char_t lit_utf8_iterator_read_next (lit_utf8_iterator_t *);
|
||||
ecma_char_t lit_utf8_iterator_read_prev (lit_utf8_iterator_t *);
|
||||
|
||||
bool lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *);
|
||||
bool lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *);
|
||||
|
||||
/* size */
|
||||
lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *);
|
||||
|
||||
Reference in New Issue
Block a user