Add helper functions for implementing unicode support in lexer.
JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
This commit is contained in:
committed by
Ruben Ayrapetyan
parent
c21399cd58
commit
d248d0944c
@@ -24,6 +24,7 @@
|
|||||||
#include "ecma-alloc.h"
|
#include "ecma-alloc.h"
|
||||||
#include "ecma-helpers.h"
|
#include "ecma-helpers.h"
|
||||||
#include "ecma-builtin-helpers.h"
|
#include "ecma-builtin-helpers.h"
|
||||||
|
#include "lit-char-helpers.h"
|
||||||
|
|
||||||
#define LIST_BLOCK_SIZE 256UL
|
#define LIST_BLOCK_SIZE 256UL
|
||||||
|
|
||||||
|
|||||||
@@ -28,6 +28,7 @@
|
|||||||
#include "ecma-try-catch-macro.h"
|
#include "ecma-try-catch-macro.h"
|
||||||
#include "jrt.h"
|
#include "jrt.h"
|
||||||
#include "jrt-libc-includes.h"
|
#include "jrt-libc-includes.h"
|
||||||
|
#include "lit-char-helpers.h"
|
||||||
|
|
||||||
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_JSON_BUILTIN
|
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_JSON_BUILTIN
|
||||||
|
|
||||||
|
|||||||
@@ -594,7 +594,7 @@ lit_charset_record_get_length (literal_t lit) /**< literal */
|
|||||||
lit_iter.skip (bytes_to_skip);
|
lit_iter.skip (bytes_to_skip);
|
||||||
i += bytes_to_skip;
|
i += bytes_to_skip;
|
||||||
|
|
||||||
length++;
|
length += (bytes_to_skip > LIT_UTF8_MAX_BYTES_IN_CODE_UNIT) ? 2 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef JERRY_NDEBUG
|
#ifndef JERRY_NDEBUG
|
||||||
|
|||||||
@@ -17,6 +17,8 @@
|
|||||||
|
|
||||||
#include "jrt-libc-includes.h"
|
#include "jrt-libc-includes.h"
|
||||||
|
|
||||||
|
/* Compile-time check: lit_utf8_iterator_pos_t must occupy exactly as many bytes as lit_utf8_size_t. */
JERRY_STATIC_ASSERT (sizeof (lit_utf8_iterator_pos_t) == sizeof (lit_utf8_size_t));
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validate utf-8 string
|
* Validate utf-8 string
|
||||||
*
|
*
|
||||||
@@ -117,6 +119,28 @@ lit_is_utf8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
|
|||||||
return true;
|
return true;
|
||||||
} /* lit_is_utf8_string_valid */
|
} /* lit_is_utf8_string_valid */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if the code unit type is low surrogate
|
||||||
|
*
|
||||||
|
* @return true / false
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
lit_is_code_unit_low_surrogate (ecma_char_t code_unit) /**< code unit */
|
||||||
|
{
|
||||||
|
return LIT_UTF16_LOW_SURROGATE_MIN <= code_unit && code_unit <= LIT_UTF16_LOW_SURROGATE_MAX;
|
||||||
|
} /* lit_is_code_unit_low_surrogate */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if the code unit type is high surrogate
|
||||||
|
*
|
||||||
|
* @return true / false
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
lit_is_code_unit_high_surrogate (ecma_char_t code_unit) /**< code unit */
|
||||||
|
{
|
||||||
|
return LIT_UTF16_HIGH_SURROGATE_MIN <= code_unit && code_unit <= LIT_UTF16_HIGH_SURROGATE_MAX;
|
||||||
|
} /* lit_is_code_unit_high_surrogate */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize iterator for traversing utf-8 string as a string of code units
|
* Initialize iterator for traversing utf-8 string as a string of code units
|
||||||
*
|
*
|
||||||
@@ -455,6 +479,48 @@ lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *iter_p)
|
|||||||
return (iter_p->buf_pos.offset == 0 && iter_p->buf_pos.is_non_bmp_middle == false);
|
return (iter_p->buf_pos.offset == 0 && iter_p->buf_pos.is_non_bmp_middle == false);
|
||||||
} /* lit_utf8_iterator_is_bos */
|
} /* lit_utf8_iterator_is_bos */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get offset of the iterator
|
||||||
|
*
|
||||||
|
* @return: current offset in bytes of the iterator from the beginning of buffer
|
||||||
|
*/
|
||||||
|
lit_utf8_size_t
|
||||||
|
lit_utf8_iterator_get_offset (const lit_utf8_iterator_t *iter_p) /**< iterator */
|
||||||
|
{
|
||||||
|
return iter_p->buf_pos.offset;
|
||||||
|
} /* lit_utf8_iterator_get_offset */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set iterator to point to specified offset
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
lit_utf8_iterator_set_offset (lit_utf8_iterator_t *iter_p, /**< pointer to iterator */
|
||||||
|
lit_utf8_size_t offset) /**< offset from the begging of the iterated buffer */
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (offset <= iter_p->buf_size);
|
||||||
|
|
||||||
|
#ifndef JERRY_NDEBUG
|
||||||
|
if (offset < iter_p->buf_size)
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (((*(iter_p->buf_p + offset)) & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
iter_p->buf_pos.offset = (offset) & LIT_ITERATOR_OFFSET_MASK;
|
||||||
|
iter_p->buf_pos.is_non_bmp_middle = false;
|
||||||
|
} /* lit_utf8_iterator_set_offset */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get pointer to the current utf-8 char which iterator points to
|
||||||
|
*
|
||||||
|
* @return: pointer to utf-8 char
|
||||||
|
*/
|
||||||
|
lit_utf8_byte_t *
|
||||||
|
lit_utf8_iterator_get_ptr (const lit_utf8_iterator_t *iter_p) /**< iterator */
|
||||||
|
{
|
||||||
|
return (lit_utf8_byte_t *) iter_p->buf_p + iter_p->buf_pos.offset;
|
||||||
|
} /* lit_utf8_iterator_get_ptr */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculate size of a zero-terminated utf-8 string
|
* Calculate size of a zero-terminated utf-8 string
|
||||||
*
|
*
|
||||||
@@ -702,6 +768,28 @@ lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
|
|||||||
}
|
}
|
||||||
} /* lit_code_unit_to_utf8 */
|
} /* lit_code_unit_to_utf8 */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert surrogate pair to code point
|
||||||
|
*
|
||||||
|
* @return code point
|
||||||
|
*/
|
||||||
|
lit_code_point_t
|
||||||
|
lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code point */
|
||||||
|
ecma_char_t low_surrogate) /**< low surrogate code point */
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (lit_is_code_unit_high_surrogate (high_surrogate));
|
||||||
|
JERRY_ASSERT (lit_is_code_unit_low_surrogate (low_surrogate));
|
||||||
|
|
||||||
|
lit_code_point_t code_point;
|
||||||
|
code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN);
|
||||||
|
code_point <<= LIT_UTF16_BITS_IN_SURROGATE;
|
||||||
|
|
||||||
|
code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT;
|
||||||
|
|
||||||
|
code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN);
|
||||||
|
return code_point;
|
||||||
|
} /* lit_surrogate_pair_to_code_point */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compare utf-8 string to utf-8 string
|
* Compare utf-8 string to utf-8 string
|
||||||
*
|
*
|
||||||
@@ -757,3 +845,20 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**<
|
|||||||
|
|
||||||
return (lit_utf8_iterator_is_eos (&iter1) && !lit_utf8_iterator_is_eos (&iter2));
|
return (lit_utf8_iterator_is_eos (&iter1) && !lit_utf8_iterator_is_eos (&iter2));
|
||||||
} /* lit_compare_utf8_strings_relational */
|
} /* lit_compare_utf8_strings_relational */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Print code unit to standard output
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
lit_put_ecma_char (ecma_char_t ecma_char) /**< code unit */
|
||||||
|
{
|
||||||
|
if (ecma_char <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
||||||
|
{
|
||||||
|
putchar (ecma_char);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
FIXME ("Support unicode characters printing.");
|
||||||
|
putchar ('_');
|
||||||
|
}
|
||||||
|
} /* lit_put_ecma_char */
|
||||||
|
|||||||
@@ -17,7 +17,6 @@
|
|||||||
#define LIT_UNICODE_HELPERS_H
|
#define LIT_UNICODE_HELPERS_H
|
||||||
|
|
||||||
#include "jrt.h"
|
#include "jrt.h"
|
||||||
#include "lit-char-helpers.h"
|
|
||||||
#include "lit-globals.h"
|
#include "lit-globals.h"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -27,7 +26,7 @@
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The
|
* For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The
|
||||||
* Unicode Standard (http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404, tables 3-6, 3-7).
|
* Unicode Standard (http://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#G7404).
|
||||||
*/
|
*/
|
||||||
#define LIT_UNICODE_CODE_POINT_NULL (0x0)
|
#define LIT_UNICODE_CODE_POINT_NULL (0x0)
|
||||||
#define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF)
|
#define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF)
|
||||||
@@ -112,6 +111,10 @@ typedef struct
|
|||||||
/* validation */
|
/* validation */
|
||||||
bool lit_is_utf8_string_valid (const lit_utf8_byte_t *, lit_utf8_size_t);
|
bool lit_is_utf8_string_valid (const lit_utf8_byte_t *, lit_utf8_size_t);
|
||||||
|
|
||||||
|
/* checks */
|
||||||
|
bool lit_is_code_unit_low_surrogate (ecma_char_t);
|
||||||
|
bool lit_is_code_unit_high_surrogate (ecma_char_t);
|
||||||
|
|
||||||
/* iteration */
|
/* iteration */
|
||||||
lit_utf8_iterator_t lit_utf8_iterator_create (const lit_utf8_byte_t *, lit_utf8_size_t);
|
lit_utf8_iterator_t lit_utf8_iterator_create (const lit_utf8_byte_t *, lit_utf8_size_t);
|
||||||
|
|
||||||
@@ -136,6 +139,11 @@ ecma_char_t lit_utf8_iterator_read_prev (lit_utf8_iterator_t *);
|
|||||||
bool lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *);
|
bool lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *);
|
||||||
bool lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *);
|
bool lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *);
|
||||||
|
|
||||||
|
lit_utf8_size_t lit_utf8_iterator_get_offset (const lit_utf8_iterator_t *);
|
||||||
|
void lit_utf8_iterator_set_offset (lit_utf8_iterator_t *, lit_utf8_size_t);
|
||||||
|
|
||||||
|
lit_utf8_byte_t *lit_utf8_iterator_get_ptr (const lit_utf8_iterator_t *);
|
||||||
|
|
||||||
/* size */
|
/* size */
|
||||||
lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *);
|
lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *);
|
||||||
|
|
||||||
@@ -152,6 +160,7 @@ lit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (lit_utf8_byte_t);
|
|||||||
/* conversion */
|
/* conversion */
|
||||||
lit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t, lit_utf8_byte_t *);
|
lit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t, lit_utf8_byte_t *);
|
||||||
lit_utf8_size_t lit_code_point_to_utf8 (lit_code_point_t, lit_utf8_byte_t *);
|
lit_utf8_size_t lit_code_point_to_utf8 (lit_code_point_t, lit_utf8_byte_t *);
|
||||||
|
lit_code_point_t lit_convert_surrogate_pair_to_code_point (ecma_char_t, ecma_char_t);
|
||||||
|
|
||||||
/* comparison */
|
/* comparison */
|
||||||
bool lit_compare_utf8_strings (const lit_utf8_byte_t *,
|
bool lit_compare_utf8_strings (const lit_utf8_byte_t *,
|
||||||
@@ -169,4 +178,7 @@ lit_utf8_size_t lit_read_code_point_from_utf8 (const lit_utf8_byte_t *,
|
|||||||
lit_utf8_size_t,
|
lit_utf8_size_t,
|
||||||
lit_code_point_t *);
|
lit_code_point_t *);
|
||||||
|
|
||||||
|
/* print */
|
||||||
|
void lit_put_ecma_char (ecma_char_t);
|
||||||
|
|
||||||
#endif /* LIT_UNICODE_HELPERS_H */
|
#endif /* LIT_UNICODE_HELPERS_H */
|
||||||
|
|||||||
Reference in New Issue
Block a user