Add core unicode functionality.

Add utf-8 processing routines.
Change ecma_char_t from char/uint16_t to uint16_t.
Apply all utf-8 processing routines.
Change char to jerry_api_char in API functions' declarations.

JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
This commit is contained in:
Andrey Shitov
2015-06-29 19:17:17 +03:00
parent c4b0cd2196
commit fd9ff8e3bd
56 changed files with 2468 additions and 1480 deletions
+76 -13
View File
@@ -18,25 +18,63 @@
#include "jrt.h"
#if CONFIG_ECMA_CHAR_ENCODING == CONFIG_ECMA_CHAR_ASCII
/**
* Description of an ecma-character
* ECMAScript standard defines terms "code unit" and "character" as 16-bit unsigned value
* used to represent 16-bit unit of text, this is the same as code unit in UTF-16 (See ECMA-262 5.1 Chapter 6).
*
* The term "code point" or "Unicode character" is used to refer to a single Unicode scalar value (may be longer
* than 16 bits: 0x0 - 0x10FFFF). One code point could be represented with one or two 16-bit code units.
*
* According to the standard all strings and source text are assumed to be a sequence of code units.
* The length of a string equals the number of code units in the string, which is not the same as the number
* of Unicode characters in the string.
*
* Internally JerryScript engine uses UTF-8 representation of strings to reduce memory overhead. Unicode character
* occupies from one to four bytes in UTF-8 representation.
*
* Unicode scalar value | Bytes in UTF-8 | Bytes in UTF-16
* | (internal representation) |
* ----------------------------------------------------------------------
* 0x0 - 0x7F | 1 byte | 2 bytes
* 0x80 - 0x7FF | 2 bytes | 2 bytes
* 0x800 - 0xFFFF | 3 bytes | 2 bytes
* 0x10000 - 0x10FFFF | 4 bytes | 4 bytes
*
* Scalar values from 0xD800 to 0xDFFF are permanently reserved by Unicode standard to encode high and low
* surrogates in UTF-16 (Code points 0x10000 - 0x10FFFF are encoded via pair of surrogates in UTF-16).
* Although the official Unicode standard says that no UTF form can encode these code points, we allow
* them to be encoded inside strings. The reason for that is compatibility with the ECMA standard.
*
* For example, assume a string which consists of one Unicode character: 0x1D700 (Mathematical Italic Small Epsilon).
* It has the following representation in UTF-16: 0xD835 0xDF00.
*
* ECMA standard allows extracting a substring from this string:
* > var str = String.fromCharCode (0xD835, 0xDF00); // Create a string containing one character: 0x1D700
* > str.length; // 2
* > var str1 = str.substring (0, 1);
* > str1.length; // 1
* > str1.charCodeAt (0); // 55349 (this equals to 0xD835)
*
* Internally original string would be represented in UTF-8 as the following byte sequence: 0xF0 0x9D 0x9C 0x80.
* After substring extraction high surrogate 0xD835 should be encoded via UTF-8: 0xED 0xA0 0xB5.
*
* A pair of high and low surrogates encoded separately should never occur in the internal string representation;
* it should be encoded like any other code point above 0xFFFF and occupy 4 bytes. So, when constructing a string
* from two surrogates, it should be processed gracefully:
* > var str1 = String.fromCharCode (0xD835); // 0xED 0xA0 0xB5 - internal representation
* > var str2 = String.fromCharCode (0xDF00); // 0xED 0xBC 0x80 - internal representation
* > var str = str1 + str2; // 0xF0 0x9D 0x9C 0x80 - internal representation,
* // !!! not 0xED 0xA0 0xB5 0xED 0xBC 0x80
*/
typedef uint8_t ecma_char_t;
#elif CONFIG_ECMA_CHAR_ENCODING == CONFIG_ECMA_CHAR_UTF16
/**
* Description of an ecma-character
* Description of an ecma-character, which represents 16-bit code unit,
* which is equal to UTF-16 character (see Chapter 6 from ECMA-262 5.1)
*/
typedef uint16_t ecma_char_t;
#endif /* CONFIG_ECMA_CHAR_ENCODING == CONFIG_ECMA_CHAR_UTF16 */
/**
* Description of an ecma-character pointer
*/
typedef ecma_char_t *ecma_char_ptr_t;
/**
* Null character (zt-string end marker)
* Null character
*/
#define ECMA_CHAR_NULL ((ecma_char_t) '\0')
@@ -45,13 +83,38 @@ typedef ecma_char_t *ecma_char_ptr_t;
*/
typedef uint32_t ecma_length_t;
/**
* Description of an ecma-character pointer
*/
typedef ecma_char_t *ecma_char_ptr_t;
/**
* Max bytes needed to represent a code unit (utf-16 char) via utf-8 encoding
*/
#define LIT_UTF8_MAX_BYTES_IN_CODE_UNIT (3)
/**
* A byte of utf-8 string
*/
typedef uint8_t lit_utf8_byte_t;
/**
* Size of a utf-8 string in bytes
*/
typedef uint32_t lit_utf8_size_t;
/**
* Unicode code point
*/
typedef uint32_t lit_code_point_t;
/**
* ECMA string hash
*/
typedef uint8_t lit_string_hash_t;
/**
* Length of string hash, in bits
* ECMA string hash value length, in bits
*/
#define LIT_STRING_HASH_BITS (sizeof (lit_string_hash_t) * JERRY_BITSINBYTE)
+51 -60
View File
@@ -16,6 +16,7 @@
#include "lit-literal-storage.h"
#include "ecma-helpers.h"
#include "lit-literal.h"
#include "lit-magic-strings.h"
/**
* Literal storage
@@ -57,18 +58,18 @@ lit_charset_record_t::set_prev (rcs_record_t *prev_rec_p) /**< pointer to the re
* Set the charset of the record
*/
void
lit_charset_record_t::set_charset (const ecma_char_t *str, /**< buffer containing characters to set */
size_t size) /**< size of the buffer in bytes */
lit_charset_record_t::set_charset (const lit_utf8_byte_t *str, /**< buffer containing characters to set */
lit_utf8_size_t size) /**< size of the buffer in bytes */
{
JERRY_ASSERT (header_size () + size == get_size () - get_alignment_bytes_count ());
rcs_record_iterator_t it ((rcs_recordset_t *)&lit_storage, (rcs_record_t *)this);
it.skip (header_size ());
for (size_t i = 0; i < get_length (); ++i)
for (lit_utf8_size_t i = 0; i < get_length (); ++i)
{
it.write<ecma_char_t> (str[i]);
it.skip<ecma_char_t> ();
it.write<lit_utf8_byte_t> (str[i]);
it.skip<lit_utf8_byte_t> ();
}
} /* lit_charset_record_t::set_charset */
@@ -77,38 +78,39 @@ lit_charset_record_t::set_charset (const ecma_char_t *str, /**< buffer containin
*
* @return number of bytes written to the buffer
*/
ecma_length_t
lit_charset_record_t::get_charset (ecma_char_t *buff, /**< output buffer */
lit_utf8_size_t
lit_charset_record_t::get_charset (lit_utf8_byte_t *buff, /**< output buffer */
size_t size) /**< size of the output buffer in bytes */
{
JERRY_ASSERT (buff && size >= sizeof (ecma_char_t));
JERRY_ASSERT (buff && size >= sizeof (lit_utf8_byte_t));
rcs_record_iterator_t it ((rcs_recordset_t *)&lit_storage, (rcs_record_t *)this);
it.skip (header_size ());
ecma_length_t len = get_length ();
size_t i;
lit_utf8_size_t len = get_length ();
lit_utf8_size_t i;
for (i = 0; i < len && size > sizeof (ecma_char_t); ++i)
for (i = 0; i < len && size > 0; ++i)
{
buff[i] = it.read<ecma_char_t> ();
it.skip<ecma_char_t> ();
size -= sizeof (ecma_char_t);
buff[i] = it.read<lit_utf8_byte_t> ();
it.skip<lit_utf8_byte_t> ();
size -= sizeof (lit_utf8_byte_t);
}
return (ecma_length_t) i;
return i;
} /* lit_charset_record_t::get_charset */
/**
* Compares characters from the record to the string
*
* @return 0 if strings are equal
* -1 if str2 is greater
* 1 if str2 is less
* -1 if str_to_compare_with is greater
* 1 if str_to_compare_with is less
*/
int
lit_charset_record_t::compare_zt (const ecma_char_t *str_to_compare_with, /**< buffer with string to compare */
size_t length) /**< length of the string in buffer str2 */
lit_charset_record_t::compare_utf8 (const lit_utf8_byte_t *str_to_compare_with, /**< buffer with string to compare */
lit_utf8_size_t str_size) /**< size of the string */
{
TODO ("Support utf-8 in comparison.");
size_t i;
if (get_length () == 0)
@@ -132,9 +134,9 @@ lit_charset_record_t::compare_zt (const ecma_char_t *str_to_compare_with, /**< b
it_this.skip (header_size ());
for (i = 0; i < get_length () && i < length; i++)
for (i = 0; i < get_length () && i < str_size; i++)
{
ecma_char_t chr = it_this.read<ecma_char_t> ();
lit_utf8_byte_t chr = it_this.read<lit_utf8_byte_t> ();
if (chr > str_to_compare_with[i])
{
@@ -145,10 +147,10 @@ lit_charset_record_t::compare_zt (const ecma_char_t *str_to_compare_with, /**< b
return -1;
}
it_this.skip<ecma_char_t> ();
it_this.skip<lit_utf8_byte_t> ();
}
if (i < length)
if (i < str_size)
{
return -1;
}
@@ -163,7 +165,7 @@ lit_charset_record_t::compare_zt (const ecma_char_t *str_to_compare_with, /**< b
* false otherwise
*/
bool
lit_charset_record_t::equal (lit_charset_record_t *rec) /**< charset record to compare with */
lit_charset_record_t::is_equal (lit_charset_record_t *rec) /**< charset record to compare with */
{
if (get_length () != rec->get_length ())
{
@@ -176,31 +178,19 @@ lit_charset_record_t::equal (lit_charset_record_t *rec) /**< charset record to c
it_this.skip (header_size ());
it_record.skip (rec->header_size ());
for (ecma_length_t i = 0; i < get_length (); i++)
for (lit_utf8_size_t i = 0; i < get_length (); i++)
{
if (it_this.read<ecma_char_t> () != it_record.read<ecma_char_t> ())
if (it_this.read<lit_utf8_byte_t> () != it_record.read<lit_utf8_byte_t> ())
{
return false;
}
it_this.skip<ecma_char_t> ();
it_record.skip<ecma_char_t> ();
it_this.skip<lit_utf8_byte_t> ();
it_record.skip<lit_utf8_byte_t> ();
}
return true;
} /* lit_charset_record_t::equal */
/**
* Compares this lit_charset_record_t records with zero-terminated string for equality
*
* @return true if compared instances are equal
* false otherwise
*/
bool
lit_charset_record_t::equal_zt (const ecma_char_t *str) /**< zero-terminated string */
{
return equal_non_zt (str, ecma_zt_string_length (str));
} /* lit_charset_record_t::equal_zt */
} /* lit_charset_record_t::is_equal */
/**
* Compare this lit_charset_record_t record with string (which could contain '\0' characters) for equality
@@ -209,24 +199,24 @@ lit_charset_record_t::equal_zt (const ecma_char_t *str) /**< zero-terminated str
* false otherwise
*/
bool
lit_charset_record_t::equal_non_zt (const ecma_char_t *str, /**< string to compare with */
ecma_length_t len) /**< length of the string */
lit_charset_record_t::is_equal_utf8_string (const lit_utf8_byte_t *str, /**< string to compare with */
lit_utf8_size_t str_size) /**< length of the string */
{
rcs_record_iterator_t it_this (&lit_storage, this);
it_this.skip (header_size ());
for (ecma_length_t i = 0; i < get_length () && i < len; i++)
for (lit_utf8_size_t i = 0; i < get_length () && i < str_size; i++)
{
if (it_this.read<ecma_char_t> () != str[i])
if (it_this.read<lit_utf8_byte_t> () != str[i])
{
return false;
}
it_this.skip<ecma_char_t> ();
it_this.skip<lit_utf8_byte_t> ();
}
return get_length () == len;
return get_length () == str_size;
} /* lit_charset_record_t::is_equal_utf8_string */
/**
@@ -235,9 +225,9 @@ lit_charset_record_t::equal_non_zt (const ecma_char_t *str, /**< string to compa
* @return pointer to the created record
*/
lit_charset_record_t *
lit_literal_storage_t::create_charset_record (const ecma_char_t *str, /**< string to be placed in the record */
size_t buf_size) /**< size in bytes of the buffer which holds the
* string */
lit_literal_storage_t::create_charset_record (const lit_utf8_byte_t *str, /**< string to be placed in the record */
lit_utf8_size_t buf_size) /**< size in bytes of the buffer which holds the
* string */
{
const size_t alignment = lit_charset_record_t::size (buf_size) - (lit_charset_record_t::header_size () + buf_size);
@@ -245,7 +235,7 @@ lit_literal_storage_t::create_charset_record (const ecma_char_t *str, /**< strin
ret->set_alignment_bytes_count (alignment);
ret->set_charset (str, buf_size);
ret->set_hash (ecma_chars_buffer_calc_hash_last_chars (str, ret->get_length ()));
ret->set_hash (lit_utf8_string_calc_hash_last_bytes (str, ret->get_length ()));
return ret;
} /* lit_literal_storage_t::create_charset_record */
@@ -319,8 +309,9 @@ lit_literal_storage_t::dump ()
for (size_t i = 0; i < lit_p->get_length (); ++i)
{
printf ("%c", it_this.read<ecma_char_t> ());
it_this.skip<ecma_char_t> ();
FIXME ("Support proper printing of characters which occupy more than one byte.")
printf ("%c", it_this.read<lit_utf8_byte_t> ());
it_this.skip<lit_utf8_byte_t> ();
}
printf (" : STRING");
@@ -330,7 +321,7 @@ lit_literal_storage_t::dump ()
case LIT_MAGIC_STR:
{
lit_magic_string_id_t id = lit_magic_record_get_magic_str_id (rec_p);
printf ("%s : MAGIC STRING", lit_get_magic_string_zt (id));
printf ("%s : MAGIC STRING", lit_get_magic_string_utf8 (id));
printf (" [id=%d] ", id);
break;
@@ -338,7 +329,7 @@ lit_literal_storage_t::dump ()
case LIT_MAGIC_STR_EX:
{
lit_magic_string_ex_id_t id = lit_magic_record_ex_get_magic_str_id (rec_p);
printf ("%s : EXT MAGIC STRING", lit_get_magic_string_ex_zt (id));
printf ("%s : EXT MAGIC STRING", lit_get_magic_string_ex_utf8 (id));
printf (" [id=%d] ", id);
break;
@@ -353,8 +344,8 @@ lit_literal_storage_t::dump ()
}
else
{
ecma_char_t buff[ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER];
ecma_number_to_zt_string (lit_p->get_number (), buff, ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER);
lit_utf8_byte_t buff[ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER];
ecma_number_to_utf8_string (lit_p->get_number (), buff, sizeof (buff));
printf ("%s : NUMBER", buff);
}
@@ -465,12 +456,12 @@ lit_literal_storage_t::get_record_size (rcs_record_t* rec_p) /**< pointer to a r
}
} /* lit_literal_storage_t::get_record_size */
template void rcs_record_iterator_t::skip<ecma_char_t> ();
template void rcs_record_iterator_t::skip<uint8_t> ();
template void rcs_record_iterator_t::skip<uint16_t> ();
template void rcs_record_iterator_t::skip<uint32_t> ();
template void rcs_record_iterator_t::write<ecma_char_t> (ecma_char_t);
template ecma_char_t rcs_record_iterator_t::read<ecma_char_t> ();
template void rcs_record_iterator_t::write<uint8_t> (uint8_t);
template uint8_t rcs_record_iterator_t::read<uint8_t> ();
template void rcs_record_iterator_t::write<ecma_number_t> (ecma_number_t);
template ecma_number_t rcs_record_iterator_t::read<ecma_number_t> ();
+11 -12
View File
@@ -106,12 +106,12 @@ public:
/**
* Get the length of the string, which is contained inside the record
*
* @return length of the string (count of the ecma_char_t characters inside the charset)
* @return length of the string (bytes count)
*/
ecma_length_t
lit_utf8_size_t
get_length () const
{
return (ecma_length_t) ((get_size () - header_size () - get_alignment_bytes_count ()) / sizeof (ecma_char_t));
return (lit_utf8_size_t) (get_size () - header_size () - get_alignment_bytes_count ());
} /* get_length */
/**
@@ -127,12 +127,11 @@ public:
rcs_record_t *get_prev () const;
ecma_length_t get_charset (ecma_char_t *buff, size_t size);
lit_utf8_size_t get_charset (lit_utf8_byte_t *, size_t);
int compare_zt (const ecma_char_t *, size_t);
bool equal (lit_charset_record_t *);
bool equal_zt (const ecma_char_t *);
bool equal_non_zt (const ecma_char_t *, ecma_length_t);
int compare_utf8 (const lit_utf8_byte_t *, lit_utf8_size_t);
bool is_equal (lit_charset_record_t *);
bool is_equal_utf8_string (const lit_utf8_byte_t *, lit_utf8_size_t);
private:
/**
@@ -157,7 +156,7 @@ private:
void set_prev (rcs_record_t *);
void set_charset (const ecma_char_t *, size_t);
void set_charset (const lit_utf8_byte_t *, lit_utf8_size_t);
/**
* Offset and length of 'alignment' field, in bits
@@ -242,7 +241,6 @@ public:
magic_string_id_t get_magic_str_id () const
{
uint32_t id = get_field (magic_field_pos, magic_field_width);
// JERRY_ASSERT (id < LIT_MAGIC_STRING__COUNT);
return (magic_string_id_t) id;
} /* get_magic_str_id */
@@ -303,9 +301,10 @@ private:
* Layout:
* ------- header -----------------------
* type (4 bits)
* magic string id (12 bits)
* padding (12 bits)
* pointer to prev (16 bits)
* --------------------------------------
* ecma_number_t
*/
class lit_number_record_t : public rcs_record_t
{
@@ -417,7 +416,7 @@ public:
LIT_NUMBER
};
lit_charset_record_t *create_charset_record (const ecma_char_t *, size_t);
lit_charset_record_t *create_charset_record (const lit_utf8_byte_t *, lit_utf8_size_t);
lit_magic_record_t *create_magic_record (lit_magic_string_id_t);
lit_magic_record_t *create_magic_record_ex (lit_magic_string_ex_id_t);
lit_number_record_t *create_number_record (ecma_number_t);
+137 -82
View File
@@ -14,7 +14,9 @@
*/
#include "lit-literal.h"
#include "ecma-helpers.h"
#include "lit-magic-strings.h"
/**
* Initialize literal storage
@@ -54,43 +56,43 @@ lit_dump_literals ()
* @return pointer to created record
*/
literal_t
lit_create_literal_from_charset (const ecma_char_t *str, /**< string to initialize the record,
* could be non-zero-terminated */
ecma_length_t len) /**< length of the string */
lit_create_literal_from_utf8_string (const lit_utf8_byte_t *str_p, /**< string to initialize the record,
* could be non-zero-terminated */
lit_utf8_size_t str_size) /**< length of the string */
{
JERRY_ASSERT (str || !len);
JERRY_ASSERT (str_p || !str_size);
for (lit_magic_string_id_t msi = (lit_magic_string_id_t) 0;
msi < LIT_MAGIC_STRING__COUNT;
msi = (lit_magic_string_id_t) (msi + 1))
{
if (ecma_zt_string_length (lit_get_magic_string_zt (msi)) != len)
if (lit_get_magic_string_size (msi) != str_size)
{
continue;
}
if (!strncmp ((const char *) str, (const char *) lit_get_magic_string_zt (msi), len))
if (!strncmp ((const char *) str_p, (const char *) lit_get_magic_string_utf8 (msi), str_size))
{
return lit_storage.create_magic_record (msi);
}
}
for (lit_magic_string_ex_id_t msi = (lit_magic_string_ex_id_t) 0;
msi < ecma_get_magic_string_ex_count ();
msi < lit_get_magic_string_ex_count ();
msi = (lit_magic_string_ex_id_t) (msi + 1))
{
if (ecma_zt_string_length (lit_get_magic_string_ex_zt (msi)) != len)
if (lit_get_magic_string_ex_size (msi) != str_size)
{
continue;
}
if (!strncmp ((const char *) str, (const char *) lit_get_magic_string_ex_zt (msi), len))
if (!strncmp ((const char *) str_p, (const char *) lit_get_magic_string_ex_utf8 (msi), str_size))
{
return lit_storage.create_magic_record_ex (msi);
}
}
return lit_storage.create_charset_record (str, len * sizeof (ecma_char_t));
} /* lit_create_literal_from_charset */
return lit_storage.create_charset_record (str_p, str_size);
} /* lit_create_literal_from_utf8_string */
/**
* Find a literal in literal storage.
@@ -99,22 +101,22 @@ lit_create_literal_from_charset (const ecma_char_t *str, /**< string to initiali
* @return pointer to a literal or NULL if no corresponding literal exists
*/
literal_t
lit_find_literal_by_charset (const ecma_char_t *str, /**< a string to search for */
ecma_length_t len) /**< length of the string */
lit_find_literal_by_utf8_string (const lit_utf8_byte_t *str_p, /**< a string to search for */
lit_utf8_size_t str_size) /**< length of the string */
{
JERRY_ASSERT (str || !len);
JERRY_ASSERT (str_p || !str_size);
for (literal_t lit = lit_storage.get_first (); lit != NULL; lit = lit_storage.get_next (lit))
{
rcs_record_t::type_t type = lit->get_type ();
if (type == LIT_STR_T)
{
if (static_cast<lit_charset_record_t *>(lit)->get_length () != len)
if (static_cast<lit_charset_record_t *>(lit)->get_length () != str_size)
{
continue;
}
if (!static_cast<lit_charset_record_t *>(lit)->compare_zt (str, len))
if (!static_cast<lit_charset_record_t *>(lit)->compare_utf8 (str_p, str_size))
{
return lit;
}
@@ -122,14 +124,14 @@ lit_find_literal_by_charset (const ecma_char_t *str, /**< a string to search for
else if (type == LIT_MAGIC_STR_T)
{
lit_magic_string_id_t magic_id = lit_magic_record_get_magic_str_id (lit);
const char *magic_str = (const char *) lit_get_magic_string_zt (magic_id);
const lit_utf8_byte_t *magic_str_p = lit_get_magic_string_utf8 (magic_id);
if (strlen (magic_str) != len)
if (lit_zt_utf8_string_size (magic_str_p) != str_size)
{
continue;
}
if (!strncmp (magic_str, (const char *) str, strlen (magic_str)))
if (!strncmp ((const char *) magic_str_p, (const char *) str_p, str_size))
{
return lit;
}
@@ -137,14 +139,14 @@ lit_find_literal_by_charset (const ecma_char_t *str, /**< a string to search for
else if (type == LIT_MAGIC_STR_EX_T)
{
lit_magic_string_ex_id_t magic_id = lit_magic_record_ex_get_magic_str_id (lit);
const char *magic_str = (const char *) lit_get_magic_string_ex_zt (magic_id);
const lit_utf8_byte_t *magic_str_p = lit_get_magic_string_ex_utf8 (magic_id);
if (strlen (magic_str) != len)
if (lit_zt_utf8_string_size (magic_str_p) != str_size)
{
continue;
}
if (!strncmp (magic_str, (const char *) str, strlen (magic_str)))
if (!strncmp ((const char *) magic_str_p, (const char *) str_p, str_size))
{
return lit;
}
@@ -152,7 +154,7 @@ lit_find_literal_by_charset (const ecma_char_t *str, /**< a string to search for
}
return NULL;
} /* lit_find_literal_by_charset */
} /* lit_find_literal_by_utf8_string */
/**
* Check if a literal which holds the passed string exists.
@@ -161,18 +163,18 @@ lit_find_literal_by_charset (const ecma_char_t *str, /**< a string to search for
* @return pointer to existing or newly created record
*/
literal_t
lit_find_or_create_literal_from_charset (const ecma_char_t *str, /**< string, could be non-zero-terminated */
ecma_length_t len) /**< length of the string */
lit_find_or_create_literal_from_utf8_string (const lit_utf8_byte_t *str_p, /**< string, could be non-zero-terminated */
lit_utf8_size_t str_size) /**< length of the string */
{
literal_t lit = lit_find_literal_by_charset (str, len);
literal_t lit = lit_find_literal_by_utf8_string (str_p, str_size);
if (lit == NULL)
{
lit = lit_create_literal_from_charset (str, len);
lit = lit_create_literal_from_utf8_string (str_p, str_size);
}
return lit;
} /* lit_find_or_create_literal_from_s */
} /* lit_find_or_create_literal_from_utf8_string */
/**
@@ -235,7 +237,7 @@ lit_find_literal_by_num (ecma_number_t num) /**< a number to search for */
/**
* Check if literal equals to charset record
*
* @return true if equal
* @return true if equal
* false otherwise
*/
static bool
@@ -246,24 +248,28 @@ lit_literal_equal_charset_rec (literal_t lit, /**< literal to com
{
case LIT_STR_T:
{
return static_cast<lit_charset_record_t *>(lit)->equal (record);
return static_cast<lit_charset_record_t *>(lit)->is_equal (record);
}
case LIT_MAGIC_STR_T:
{
return record->equal_zt (lit_get_magic_string_zt (lit_magic_record_get_magic_str_id (lit)));
lit_magic_string_id_t magic_string_id = lit_magic_record_get_magic_str_id (lit);
return record->is_equal_utf8_string (lit_get_magic_string_utf8 (magic_string_id),
lit_get_magic_string_size (magic_string_id));
}
case LIT_MAGIC_STR_EX_T:
{
return record->equal_zt (lit_get_magic_string_ex_zt (lit_magic_record_ex_get_magic_str_id (lit)));
lit_magic_string_ex_id_t magic_string_id = lit_magic_record_ex_get_magic_str_id (lit);
return record->is_equal_utf8_string (lit_get_magic_string_ex_utf8 (magic_string_id),
lit_get_magic_string_ex_size (magic_string_id));
}
case LIT_NUMBER_T:
{
ecma_char_t buff[ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER];
ecma_number_to_zt_string (static_cast<lit_number_record_t *>(lit)->get_number (),
buff,
ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER);
lit_utf8_byte_t buff[ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER];
lit_utf8_size_t copied = ecma_number_to_utf8_string (static_cast<lit_number_record_t *>(lit)->get_number (),
buff,
sizeof (buff));
return record->equal_zt (buff);
return record->is_equal_utf8_string (buff, copied);
}
default:
{
@@ -273,46 +279,47 @@ lit_literal_equal_charset_rec (literal_t lit, /**< literal to com
} /* lit_literal_equal_charset_rec */
/**
* Check if literal equals to zero-terminated string
* Check if literal equals to utf-8 string
*
* @return true if equal
* false otherwise
*/
bool
lit_literal_equal_zt (literal_t lit, /**< literal to compare */
const ecma_char_t *str) /**< zero-terminated string to compare */
lit_literal_equal_utf8 (literal_t lit, /**< literal to compare */
const lit_utf8_byte_t *str_p, /**< utf-8 string to compare */
lit_utf8_size_t str_size) /**< string size in bytes */
{
switch (lit->get_type ())
{
case LIT_STR_T:
{
return static_cast<lit_charset_record_t *>(lit)->equal_zt (str);
return static_cast<lit_charset_record_t *>(lit)->is_equal_utf8_string (str_p, str_size);
}
case LIT_MAGIC_STR_T:
{
lit_magic_string_id_t magic_id = lit_magic_record_get_magic_str_id (lit);
return ecma_compare_zt_strings (str, lit_get_magic_string_zt (magic_id));
return lit_compare_utf8_string_and_magic_string (str_p, str_size, magic_id);
}
case LIT_MAGIC_STR_EX_T:
{
lit_magic_string_ex_id_t magic_id = lit_magic_record_ex_get_magic_str_id (lit);
return ecma_compare_zt_strings (str, lit_get_magic_string_ex_zt (magic_id));
return lit_compare_utf8_string_and_magic_string_ex (str_p, str_size, magic_id);
}
case LIT_NUMBER_T:
{
ecma_char_t buff[ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER];
ecma_number_to_zt_string (static_cast<lit_number_record_t *>(lit)->get_number (),
buff,
ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER);
lit_utf8_byte_t num_buf[ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER];
lit_utf8_size_t num_size = ecma_number_to_utf8_string (static_cast<lit_number_record_t *>(lit)->get_number (),
num_buf,
sizeof (num_buf));
return ecma_compare_zt_strings (str, buff);
return lit_compare_utf8_strings (str_p, str_size, num_buf, num_size);
}
default:
{
JERRY_UNREACHABLE ();
}
}
} /* lit_literal_equal_zt */
} /* lit_literal_equal_utf8 */
/**
* Check if literal contains the string equal to the passed number
@@ -324,10 +331,10 @@ bool
lit_literal_equal_num (literal_t lit, /**< literal to check */
ecma_number_t num) /**< number to compare with */
{
ecma_char_t buff[ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER];
ecma_number_to_zt_string (num, buff, ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER);
lit_utf8_byte_t buff[ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER];
lit_utf8_size_t copied = ecma_number_to_utf8_string (num, buff, sizeof (buff));
return lit_literal_equal_zt (lit, buff);
return lit_literal_equal_utf8 (lit, buff, copied);
} /* lit_literal_equal_num */
/**
@@ -348,11 +355,17 @@ lit_literal_equal (literal_t lit1, /**< first literal */
}
case lit_literal_storage_t::LIT_MAGIC_STR:
{
return lit_literal_equal_zt (lit1, lit_get_magic_string_zt (lit_magic_record_get_magic_str_id (lit2)));
lit_magic_string_id_t magic_str_id = lit_magic_record_get_magic_str_id (lit2);
return lit_literal_equal_utf8 (lit1,
lit_get_magic_string_utf8 (magic_str_id),
lit_get_magic_string_size (magic_str_id));
}
case lit_literal_storage_t::LIT_MAGIC_STR_EX:
{
return lit_literal_equal_zt (lit1, lit_get_magic_string_ex_zt (lit_magic_record_ex_get_magic_str_id (lit2)));
lit_magic_string_ex_id_t magic_str_ex_id = lit_magic_record_ex_get_magic_str_id (lit2);
return lit_literal_equal_utf8 (lit1,
lit_get_magic_string_ex_utf8 (magic_str_ex_id),
lit_get_magic_string_ex_size (magic_str_ex_id));
}
case lit_literal_storage_t::LIT_NUMBER:
{
@@ -366,15 +379,16 @@ lit_literal_equal (literal_t lit1, /**< first literal */
} /* lit_literal_equal */
/**
* Check if literal equals to zero-terminated string.
* Check if literal equals to utf-8 string.
* Check that literal is a string literal before performing detailed comparison.
*
* @return true if equal
* false otherwise
*/
bool
lit_literal_equal_type_zt (literal_t lit, /**< literal to compare */
const ecma_char_t *str) /**< zero-terminated string */
lit_literal_equal_type_utf8 (literal_t lit, /**< literal to compare */
const lit_utf8_byte_t *str_p, /**< utf-8 string */
lit_utf8_size_t str_size) /**< string size */
{
if (lit->get_type () != LIT_STR_T
&& lit->get_type () != LIT_MAGIC_STR_T
@@ -383,8 +397,22 @@ lit_literal_equal_type_zt (literal_t lit, /**< literal to compare */
return false;
}
return lit_literal_equal_zt (lit, str);
} /* lit_literal_equal_type_zt */
return lit_literal_equal_utf8 (lit, str_p, str_size);
} /* lit_literal_equal_type_utf8 */
/**
* Check if literal equals to C string.
* Check that literal is a string literal before performing detailed comparison.
*
* @return true if equal
* false otherwise
*/
bool
lit_literal_equal_type_cstr (literal_t lit, /**< literal to compare */
const char *c_str_p) /**< zero-terminated C-string */
{
return lit_literal_equal_type_utf8 (lit, (const lit_utf8_byte_t *) c_str_p, (lit_utf8_size_t) strlen (c_str_p));
} /* lit_literal_equal_type_cstr */
/**
* Check if literal contains the string equal to the passed number.
@@ -432,12 +460,12 @@ lit_literal_equal_type (literal_t lit1, /**< first literal */
*
* @return pointer to the utf-8 string (note: for charset records this function does not append a terminating zero)
*/
const ecma_char_t *
lit_literal_to_charset (literal_t lit, /**< literal to be processed */
ecma_char_t *buff, /**< buffer to use as a string storage */
size_t size) /**< size of the buffer */
const lit_utf8_byte_t *
lit_literal_to_utf8_string (literal_t lit, /**< literal to be processed */
lit_utf8_byte_t *buff_p, /**< buffer to use as a string storage */
size_t size) /**< size of the buffer */
{
JERRY_ASSERT (buff != NULL && size > sizeof (ecma_char_t));
JERRY_ASSERT (buff_p != NULL && size > 0);
rcs_record_t::type_t type = lit->get_type ();
switch (type)
@@ -445,35 +473,28 @@ lit_literal_to_charset (literal_t lit, /**< literal to be processed */
case LIT_STR_T:
{
lit_charset_record_t *ch_rec_p = static_cast<lit_charset_record_t *> (lit);
ecma_length_t index = ch_rec_p->get_charset (buff, size);
if (index != 0 && ((size_t)index + 1) * sizeof (ecma_char_t) > size)
{
index--;
}
buff[index] = '\0';
return buff;
ch_rec_p->get_charset (buff_p, size);
return buff_p;
}
case LIT_MAGIC_STR_T:
{
return lit_get_magic_string_zt (lit_magic_record_get_magic_str_id (lit));
return lit_get_magic_string_utf8 (lit_magic_record_get_magic_str_id (lit));
}
case LIT_MAGIC_STR_EX_T:
{
return lit_get_magic_string_ex_zt (lit_magic_record_ex_get_magic_str_id (lit));
return lit_get_magic_string_ex_utf8 (lit_magic_record_ex_get_magic_str_id (lit));
}
case LIT_NUMBER_T:
{
ecma_number_to_zt_string (static_cast<lit_number_record_t *> (lit)->get_number (), buff, (ssize_t)size);
ecma_number_to_utf8_string (static_cast<lit_number_record_t *> (lit)->get_number (), buff_p, (ssize_t)size);
return buff;
return buff_p;
}
default: JERRY_UNREACHABLE ();
}
JERRY_UNREACHABLE ();
} /* lit_literal_to_charset */
} /* lit_literal_to_utf8_string */
/**
* Get the contents of the literal as a C string.
@@ -484,10 +505,10 @@ lit_literal_to_charset (literal_t lit, /**< literal to be processed */
const char *
lit_literal_to_str_internal_buf (literal_t lit) /**< literal */
{
const ecma_length_t buff_size = ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER;
static ecma_char_t buff[buff_size];
static lit_utf8_byte_t buff[ECMA_MAX_CHARS_IN_STRINGIFIED_NUMBER + 1];
memset (buff, 0, sizeof (buff));
return (const char *)lit_literal_to_charset (lit, buff, buff_size);
return (const char *) lit_literal_to_utf8_string (lit, buff, sizeof (buff) - 1);
} /* lit_literal_to_str_internal_buf */
@@ -544,10 +565,44 @@ lit_magic_record_ex_get_magic_str_id (literal_t lit) /**< literal */
return static_cast<lit_magic_record_t *> (lit)->get_magic_str_id<lit_magic_string_ex_id_t> ();
} /* lit_magic_record_ex_get_magic_str_id */
lit_utf8_size_t
lit_charset_record_get_size (literal_t lit) /**< literal */
{
return static_cast<lit_charset_record_t *> (lit)->get_length ();
} /* lit_charset_record_get_size */
/**
* Get length of the literal
*
* @return code units count
*/
ecma_length_t
lit_charset_record_get_length (literal_t lit) /**< literal */
{
return static_cast<lit_charset_record_t *> (lit)->get_length ();;
TODO ("Add special case for literals which doesn't contain long characters");
lit_charset_record_t *charset_record_p = static_cast<lit_charset_record_t *> (lit);
rcs_record_iterator_t lit_iter (&lit_storage, lit);
lit_iter.skip (lit_charset_record_t::header_size ());
lit_utf8_size_t lit_utf8_str_size = charset_record_p->get_length ();
ecma_length_t length = 0;
for (lit_utf8_size_t i = 0; i < lit_utf8_str_size;)
{
lit_utf8_byte_t byte = lit_iter.read <lit_utf8_byte_t> ();
lit_utf8_size_t bytes_to_skip = lit_get_unicode_char_size_by_utf8_first_byte (byte);
lit_iter.skip (bytes_to_skip);
i += bytes_to_skip;
length++;
}
#ifndef JERRY_NDEBUG
lit_iter.skip (charset_record_p->get_alignment_bytes_count ());
JERRY_ASSERT (lit_iter.finished ());
#endif
return length;
} /* lit_charset_record_get_length */
ecma_number_t
+10 -8
View File
@@ -16,8 +16,9 @@
#ifndef LIT_LITERAL_H
#define LIT_LITERAL_H
#include "ecma-globals.h"
#include "lit-globals.h"
#include "lit-literal-storage.h"
#include "lit-magic-strings.h"
#define LITERAL_TO_REWRITE (INVALID_VALUE - 1)
@@ -25,30 +26,31 @@ void lit_init ();
void lit_finalize ();
void lit_dump_literals ();
literal_t lit_create_literal_from_charset (const ecma_char_t *, ecma_length_t);
literal_t lit_find_literal_by_charset (const ecma_char_t *, ecma_length_t);
literal_t lit_find_or_create_literal_from_charset (const ecma_char_t *, ecma_length_t);
literal_t lit_create_literal_from_utf8_string (const lit_utf8_byte_t *, lit_utf8_size_t);
literal_t lit_find_literal_by_utf8_string (const lit_utf8_byte_t *, lit_utf8_size_t);
literal_t lit_find_or_create_literal_from_utf8_string (const lit_utf8_byte_t *, lit_utf8_size_t);
literal_t lit_create_literal_from_num (ecma_number_t);
literal_t lit_find_literal_by_num (ecma_number_t);
literal_t lit_find_or_create_literal_from_num (ecma_number_t);
bool lit_literal_equal_zt (literal_t, const ecma_char_t *);
bool lit_literal_equal_utf8 (literal_t, const lit_utf8_byte_t *, lit_utf8_size_t);
bool lit_literal_equal_num (literal_t, ecma_number_t);
bool lit_literal_equal (literal_t, literal_t);
bool lit_literal_equal_type_zt (literal_t, const ecma_char_t *);
bool lit_literal_equal_type_utf8 (literal_t, const lit_utf8_byte_t *, lit_utf8_size_t);
bool lit_literal_equal_type_cstr (literal_t, const char *);
bool lit_literal_equal_type_num (literal_t, ecma_number_t);
bool lit_literal_equal_type (literal_t, literal_t);
const ecma_char_t *lit_literal_to_charset (literal_t, ecma_char_t *, size_t);
const lit_utf8_byte_t *lit_literal_to_utf8_string (literal_t, lit_utf8_byte_t *, size_t);
const char *lit_literal_to_str_internal_buf (literal_t);
literal_t lit_get_literal_by_cp (lit_cpointer_t);
lit_string_hash_t lit_charset_literal_get_hash (literal_t);
ecma_number_t lit_charset_literal_get_number (literal_t);
lit_utf8_size_t lit_charset_record_get_size (literal_t);
ecma_length_t lit_charset_record_get_length (literal_t);
lit_magic_string_id_t lit_magic_record_get_magic_str_id (literal_t);
+150 -86
View File
@@ -15,26 +15,26 @@
#include "lit-magic-strings.h"
#include "ecma-helpers.h"
#include "lit-strings.h"
/**
* Lengths of magic strings
*/
static ecma_length_t lit_magic_string_lengths[LIT_MAGIC_STRING__COUNT];
static lit_utf8_size_t lit_magic_string_sizes[LIT_MAGIC_STRING__COUNT];
/**
* External magic strings data array, count and lengths
*/
static const ecma_char_ptr_t *lit_magic_string_ex_array = NULL;
static const lit_utf8_byte_t **lit_magic_string_ex_array = NULL;
static uint32_t lit_magic_string_ex_count = 0;
static const ecma_length_t *lit_magic_string_ex_lengths = NULL;
static const lit_utf8_size_t *lit_magic_string_ex_sizes = NULL;
#ifndef JERRY_NDEBUG
/**
* Maximum length among lengths of magic strings
*/
static ecma_length_t lit_magic_string_max_length;
#endif /* !JERRY_NDEBUG */
static ecma_length_t ecma_magic_string_max_length;
#endif /* JERRY_NDEBUG */
/**
* Initialize data for string helpers
@@ -45,22 +45,22 @@ lit_magic_strings_init (void)
/* Initializing magic strings information */
#ifndef JERRY_NDEBUG
lit_magic_string_max_length = 0;
ecma_magic_string_max_length = 0;
#endif /* !JERRY_NDEBUG */
for (lit_magic_string_id_t id = (lit_magic_string_id_t) 0;
id < LIT_MAGIC_STRING__COUNT;
id = (lit_magic_string_id_t) (id + 1))
{
lit_magic_string_lengths[id] = ecma_zt_string_length (lit_get_magic_string_zt (id));
lit_magic_string_sizes[id] = lit_zt_utf8_string_size (lit_get_magic_string_utf8 (id));
#ifndef JERRY_NDEBUG
lit_magic_string_max_length = JERRY_MAX (lit_magic_string_max_length, lit_magic_string_lengths[id]);
ecma_magic_string_max_length = JERRY_MAX (ecma_magic_string_max_length, lit_magic_string_sizes[id]);
JERRY_ASSERT (lit_magic_string_max_length <= LIT_MAGIC_STRING_LENGTH_LIMIT);
JERRY_ASSERT (ecma_magic_string_max_length <= LIT_MAGIC_STRING_LENGTH_LIMIT);
#endif /* !JERRY_NDEBUG */
}
} /* ecma_strings_init */
} /* lit_magic_strings_init */
/**
* Initialize external magic strings
@@ -70,44 +70,8 @@ lit_magic_strings_ex_init (void)
{
lit_magic_string_ex_array = NULL;
lit_magic_string_ex_count = 0;
lit_magic_string_ex_lengths = NULL;
} /* ecma_strings_ex_init */
/**
* Register external magic strings
*/
void
lit_magic_strings_ex_set (const ecma_char_ptr_t* ex_str_items, /**< character arrays, representing
* external magic strings' contents */
uint32_t count, /**< number of the strings */
const ecma_length_t* ex_str_lengths) /**< lengths of the strings */
{
JERRY_ASSERT (ex_str_items != NULL);
JERRY_ASSERT (count > 0);
JERRY_ASSERT (ex_str_lengths != NULL);
JERRY_ASSERT (lit_magic_string_ex_array == NULL);
JERRY_ASSERT (lit_magic_string_ex_count == 0);
JERRY_ASSERT (lit_magic_string_ex_lengths == NULL);
/* Set external magic strings information */
lit_magic_string_ex_array = ex_str_items;
lit_magic_string_ex_count = count;
lit_magic_string_ex_lengths = ex_str_lengths;
#ifndef JERRY_NDEBUG
for (lit_magic_string_ex_id_t id = (lit_magic_string_ex_id_t) 0;
id < lit_magic_string_ex_count;
id = (lit_magic_string_ex_id_t) (id + 1))
{
JERRY_ASSERT (lit_magic_string_ex_lengths[id] == ecma_zt_string_length (lit_get_magic_string_ex_zt (id)));
lit_magic_string_max_length = JERRY_MAX (lit_magic_string_max_length, lit_magic_string_ex_lengths[id]);
JERRY_ASSERT (lit_magic_string_max_length <= LIT_MAGIC_STRING_LENGTH_LIMIT);
}
#endif /* !JERRY_NDEBUG */
} /* ecma_strings_ex_init */
lit_magic_string_ex_sizes = NULL;
} /* lit_magic_strings_ex_init */
/**
* Get number of external magic strings
@@ -116,25 +80,23 @@ lit_magic_strings_ex_set (const ecma_char_ptr_t* ex_str_items, /**< character ar
* zero - otherwise.
*/
uint32_t
ecma_get_magic_string_ex_count (void)
lit_get_magic_string_ex_count (void)
{
return lit_magic_string_ex_count;
} /* ecma_get_magic_string_ex_count */
} /* lit_get_magic_string_ex_count */
/**
* Get specified magic string as zero-terminated string
*
* @return pointer to zero-terminated magic string
*/
const ecma_char_t *
lit_get_magic_string_zt (lit_magic_string_id_t id) /**< magic string id */
const lit_utf8_byte_t *
lit_get_magic_string_utf8 (lit_magic_string_id_t id) /**< magic string id */
{
TODO (Support UTF-16);
switch (id)
{
#define LIT_MAGIC_STRING_DEF(id, ascii_zt_string) \
case id: return (ecma_char_t*) ascii_zt_string;
#define LIT_MAGIC_STRING_DEF(id, utf8_string) \
case id: return (lit_utf8_byte_t*) utf8_string;
#include "lit-magic-strings.inc.h"
#undef LIT_MAGIC_STRING_DEF
@@ -142,58 +104,94 @@ lit_get_magic_string_zt (lit_magic_string_id_t id) /**< magic string id */
}
JERRY_UNREACHABLE ();
} /* lit_get_magic_string_zt */
} /* lit_get_magic_string_utf8 */
/**
* Get length of specified magic string
* Get size of specified magic string
*
* @return length
* @return size in bytes
*/
ecma_length_t
lit_get_magic_string_length (lit_magic_string_id_t id) /**< magic string id */
lit_utf8_size_t
lit_get_magic_string_size (lit_magic_string_id_t id) /**< magic string id */
{
return lit_magic_string_lengths[id];
} /* ecma_get_magic_string_size */
return lit_magic_string_sizes[id];
} /* lit_get_magic_string_size */
/**
* Get specified magic string as zero-terminated string from external table
*
* @return pointer to zero-terminated magic string
*/
const ecma_char_t*
lit_get_magic_string_ex_zt (lit_magic_string_ex_id_t id) /**< extern magic string id */
const lit_utf8_byte_t *
lit_get_magic_string_ex_utf8 (lit_magic_string_ex_id_t id) /**< extern magic string id */
{
TODO (Support UTF-16);
if (lit_magic_string_ex_array && id < lit_magic_string_ex_count)
{
return lit_magic_string_ex_array[id];
}
JERRY_UNREACHABLE ();
} /* lit_get_magic_string_ex_zt */
} /* lit_get_magic_string_ex_utf8 */
/**
* Get length of specified external magic string
* Get size of specified external magic string
*
* @return length
* @return size in bytes
*/
ecma_length_t
lit_get_magic_string_ex_length (lit_magic_string_ex_id_t id) /**< external magic string id */
lit_utf8_size_t
lit_get_magic_string_ex_size (lit_magic_string_ex_id_t id) /**< external magic string id */
{
return lit_magic_string_ex_lengths[id];
} /* lit_get_magic_string_ex_length */
return lit_magic_string_ex_sizes[id];
} /* lit_get_magic_string_ex_size */
/**
* Check if passed zt-string equals to one of magic strings
* Register external magic strings
*/
void
lit_magic_strings_ex_set (const lit_utf8_byte_t **ex_str_items, /**< character arrays, representing
* external magic strings' contents */
uint32_t count, /**< number of the strings */
const lit_utf8_size_t *ex_str_sizes) /**< sizes of the strings */
{
JERRY_ASSERT (ex_str_items != NULL);
JERRY_ASSERT (count > 0);
JERRY_ASSERT (ex_str_sizes != NULL);
JERRY_ASSERT (lit_magic_string_ex_array == NULL);
JERRY_ASSERT (lit_magic_string_ex_count == 0);
JERRY_ASSERT (lit_magic_string_ex_sizes == NULL);
/* Set external magic strings information */
lit_magic_string_ex_array = ex_str_items;
lit_magic_string_ex_count = count;
lit_magic_string_ex_sizes = ex_str_sizes;
#ifndef JERRY_NDEBUG
for (lit_magic_string_ex_id_t id = (lit_magic_string_ex_id_t) 0;
id < lit_magic_string_ex_count;
id = (lit_magic_string_ex_id_t) (id + 1))
{
JERRY_ASSERT (lit_magic_string_ex_sizes[id] == lit_zt_utf8_string_size (lit_get_magic_string_ex_utf8 (id)));
ecma_magic_string_max_length = JERRY_MAX (ecma_magic_string_max_length, lit_magic_string_ex_sizes[id]);
JERRY_ASSERT (ecma_magic_string_max_length <= LIT_MAGIC_STRING_LENGTH_LIMIT);
}
#endif /* !JERRY_NDEBUG */
} /* lit_magic_strings_ex_set */
/**
* Check if passed utf-8 string equals to one of magic strings
* and if equal magic string was found, return its id in 'out_id_p' argument.
*
* @return true - if magic string equal to passed string was found,
* false - otherwise.
*/
bool
lit_is_zt_string_magic (const ecma_char_t *zt_string_p, /**< zero-terminated string */
lit_magic_string_id_t *out_id_p) /**< out: magic string's id */
lit_is_utf8_string_magic (const lit_utf8_byte_t *string_p, /**< utf-8 string */
lit_utf8_size_t string_size, /**< string size in bytes */
lit_magic_string_id_t *out_id_p) /**< out: magic string's id */
{
TODO (Improve performance of search);
@@ -201,7 +199,7 @@ lit_is_zt_string_magic (const ecma_char_t *zt_string_p, /**< zero-terminated str
id < LIT_MAGIC_STRING__COUNT;
id = (lit_magic_string_id_t) (id + 1))
{
if (ecma_compare_zt_strings (zt_string_p, lit_get_magic_string_zt (id)))
if (lit_compare_utf8_string_and_magic_string (string_p, string_size, id))
{
*out_id_p = id;
@@ -212,18 +210,18 @@ lit_is_zt_string_magic (const ecma_char_t *zt_string_p, /**< zero-terminated str
*out_id_p = LIT_MAGIC_STRING__COUNT;
return false;
} /* lit_is_zt_string_magic */
} /* lit_is_utf8_string_magic */
/**
* Check if passed zt-string equals to one of external magic strings
* Check if passed utf-8 string equals to one of external magic strings
* and if equal magic string was found, return its id in 'out_id_p' argument.
*
* @return true - if external magic string equal to passed string was found,
* false - otherwise.
*/
bool
lit_is_zt_ex_string_magic (const ecma_char_t *zt_string_p, /**< zero-terminated string */
lit_magic_string_ex_id_t *out_id_p) /**< out: external magic string's id */
bool lit_is_ex_utf8_string_magic (const lit_utf8_byte_t *string_p, /**< utf-8 string */
lit_utf8_size_t string_size, /**< string size in bytes */
lit_magic_string_ex_id_t *out_id_p) /**< out: magic string's id */
{
TODO (Improve performance of search);
@@ -231,7 +229,7 @@ lit_is_zt_ex_string_magic (const ecma_char_t *zt_string_p, /**< zero-terminated
id < lit_magic_string_ex_count;
id = (lit_magic_string_ex_id_t) (id + 1))
{
if (ecma_compare_zt_strings (zt_string_p, lit_get_magic_string_ex_zt (id)))
if (lit_compare_utf8_string_and_magic_string_ex (string_p, string_size, id))
{
*out_id_p = id;
@@ -242,4 +240,70 @@ lit_is_zt_ex_string_magic (const ecma_char_t *zt_string_p, /**< zero-terminated
*out_id_p = lit_magic_string_ex_count;
return false;
} /* lit_is_zt_ex_string_magic */
} /* lit_is_ex_utf8_string_magic */
/**
* Compare utf-8 string and magic string for equality
*
* @return true if strings are equal
* false otherwise
*/
bool
lit_compare_utf8_string_and_magic_string (const lit_utf8_byte_t *string_p, /**< utf-8 string */
lit_utf8_size_t string_size, /**< string size in bytes */
lit_magic_string_id_t magic_string_id) /**< magic string's id */
{
return lit_compare_utf8_strings (string_p,
string_size,
lit_get_magic_string_utf8 (magic_string_id),
lit_get_magic_string_size (magic_string_id));
} /* lit_compare_utf8_string_and_magic_string */
/**
* Compare utf-8 string and external magic string for equality
*
* @return true if strings are equal
* false otherwise
*/
bool
lit_compare_utf8_string_and_magic_string_ex (const lit_utf8_byte_t *string_p, /**< utf-8 string */
lit_utf8_size_t string_size, /**< string size in bytes */
lit_magic_string_ex_id_t magic_string_ex_id) /**< external magic string's
* id */
{
return lit_compare_utf8_strings (string_p,
string_size,
lit_get_magic_string_ex_utf8 (magic_string_ex_id),
lit_get_magic_string_ex_size (magic_string_ex_id));
} /* lit_compare_utf8_string_and_magic_string_ex */
/**
 * Copy the bytes of a magic string into a buffer (no terminating zero is written)
 *
 * Warning:
 *         the caller must provide a buffer large enough for the whole magic string;
 *         the size is only verified through assertions
 *
 * @return pointer to the byte next to the last copied in the buffer
 */
extern lit_utf8_byte_t *
lit_copy_magic_string_to_buffer (lit_magic_string_id_t id, /**< magic string id */
                                 lit_utf8_byte_t *buffer_p, /**< destination buffer */
                                 ssize_t buffer_size) /**< size of buffer */
{
  const lit_utf8_byte_t *src_p = lit_get_magic_string_utf8 (id);
  lit_utf8_byte_t *dst_p = buffer_p;
  ssize_t copied_bytes = 0;

  for (lit_utf8_size_t bytes_left = lit_get_magic_string_size (id); bytes_left > 0; bytes_left--)
  {
    /* Account for the byte first, so that the assertion fires before the out-of-bounds write. */
    copied_bytes++;
    JERRY_ASSERT (copied_bytes <= buffer_size);

    *dst_p++ = *src_p++;
  }

  return dst_p;
} /* lit_copy_magic_string_to_buffer */
+27 -10
View File
@@ -44,18 +44,35 @@ typedef uint32_t lit_magic_string_ex_id_t;
extern void lit_magic_strings_init (void);
extern void lit_magic_strings_ex_init (void);
extern void lit_magic_strings_ex_set (const ecma_char_ptr_t *,
uint32_t,
const ecma_length_t *);
extern uint32_t ecma_get_magic_string_ex_count (void);
extern uint32_t lit_get_magic_string_ex_count (void);
extern const ecma_char_t *lit_get_magic_string_zt (lit_magic_string_id_t);
extern ecma_length_t lit_get_magic_string_length (lit_magic_string_id_t);
extern const lit_utf8_byte_t *lit_get_magic_string_utf8 (lit_magic_string_id_t);
extern lit_utf8_size_t lit_get_magic_string_size (lit_magic_string_id_t);
extern const ecma_char_t *lit_get_magic_string_ex_zt (lit_magic_string_ex_id_t);
extern ecma_length_t lit_get_magic_string_ex_length (lit_magic_string_ex_id_t);
extern const lit_utf8_byte_t *lit_get_magic_string_ex_utf8 (lit_magic_string_ex_id_t);
extern lit_utf8_size_t lit_get_magic_string_ex_size (lit_magic_string_ex_id_t);
extern bool lit_is_zt_string_magic (const ecma_char_t *, lit_magic_string_id_t *);
extern bool lit_is_zt_ex_string_magic (const ecma_char_t *, lit_magic_string_ex_id_t *);
extern void lit_magic_strings_ex_set (const lit_utf8_byte_t **,
uint32_t count,
const lit_utf8_size_t *);
extern bool lit_is_utf8_string_magic (const lit_utf8_byte_t *,
lit_utf8_size_t,
lit_magic_string_id_t *);
extern bool lit_is_ex_utf8_string_magic (const lit_utf8_byte_t *,
lit_utf8_size_t,
lit_magic_string_ex_id_t *);
extern bool lit_compare_utf8_string_and_magic_string (const lit_utf8_byte_t *,
lit_utf8_size_t,
lit_magic_string_id_t);
extern bool lit_compare_utf8_string_and_magic_string_ex (const lit_utf8_byte_t *,
lit_utf8_size_t,
lit_magic_string_ex_id_t);
extern lit_utf8_byte_t *lit_copy_magic_string_to_buffer (lit_magic_string_id_t,
lit_utf8_byte_t *buffer_p,
ssize_t buffer_size);
#endif /* LIT_MAGIC_STRINGS_H */
+579
View File
@@ -0,0 +1,579 @@
/* Copyright 2015 Samsung Electronics Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lit-strings.h"
#include "jrt-libc-includes.h"
/**
* For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The
* Unicode Standard (http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404, tables 3-6, 3-7).
*/
#define LIT_UNICODE_CODE_POINT_NULL (0x0)
#define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF)
#define LIT_UTF16_CODE_UNIT_MAX (0xFFFF)
#define LIT_UTF16_FIRST_SURROGATE_CODE_POINT (0x10000)
#define LIT_UTF16_LOW_SURROGATE_MARKER (0xDC00)
#define LIT_UTF16_HIGH_SURROGATE_MARKER (0xD800)
#define LIT_UTF16_HIGH_SURROGATE_MIN (0xD800)
#define LIT_UTF16_HIGH_SURROGATE_MAX (0xDBFF)
#define LIT_UTF16_LOW_SURROGATE_MIN (0xDC00)
#define LIT_UTF16_LOW_SURROGATE_MAX (0xDFFF)
#define LIT_UTF16_BITS_IN_SURROGATE (10)
#define LIT_UTF16_LAST_10_BITS_MASK (0x3FF)
#define LIT_UTF8_1_BYTE_MARKER (0x00)
#define LIT_UTF8_2_BYTE_MARKER (0xC0)
#define LIT_UTF8_3_BYTE_MARKER (0xE0)
#define LIT_UTF8_4_BYTE_MARKER (0xF0)
#define LIT_UTF8_EXTRA_BYTE_MARKER (0x80)
#define LIT_UTF8_1_BYTE_MASK (0x80)
#define LIT_UTF8_2_BYTE_MASK (0xE0)
#define LIT_UTF8_3_BYTE_MASK (0xF0)
#define LIT_UTF8_4_BYTE_MASK (0xF8)
#define LIT_UTF8_EXTRA_BYTE_MASK (0xC0)
#define LIT_UTF8_LAST_7_BITS_MASK (0x7F)
#define LIT_UTF8_LAST_6_BITS_MASK (0x3F)
#define LIT_UTF8_LAST_5_BITS_MASK (0x1F)
#define LIT_UTF8_LAST_4_BITS_MASK (0x0F)
#define LIT_UTF8_LAST_3_BITS_MASK (0x07)
#define LIT_UTF8_LAST_2_BITS_MASK (0x03)
#define LIT_UTF8_LAST_1_BIT_MASK (0x01)
#define LIT_UTF8_BITS_IN_EXTRA_BYTES (6)
/*
 * Code point ranges representable by utf-8 sequences of each size.
 * A sequence that encodes a value below the minimum for its size is an overlong encoding.
 */
#define LIT_UTF8_1_BYTE_CODE_POINT_MAX (0x7F)
#define LIT_UTF8_2_BYTE_CODE_POINT_MIN (0x80)
#define LIT_UTF8_2_BYTE_CODE_POINT_MAX (0x7FF)
#define LIT_UTF8_3_BYTE_CODE_POINT_MIN (0x800)
#define LIT_UTF8_3_BYTE_CODE_POINT_MAX (LIT_UTF16_CODE_UNIT_MAX)
/* 4-byte sequences start right after the last 3-byte code point (0xFFFF) */
#define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x10000)
#define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX)
/**
* Validate utf-8 string
*
* NOTE:
* Isolated surrogates are allowed.
* Correct pair of surrogates is not allowed, it should be represented as 4-byte utf-8 character.
*
* @return true if utf-8 string is well-formed
* false otherwise
*/
bool
lit_is_utf8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
lit_utf8_size_t buf_size) /**< string size */
{
lit_utf8_size_t idx = 0;
bool is_prev_code_point_high_surrogate = false;
while (idx < buf_size)
{
lit_utf8_byte_t c = utf8_buf_p[idx++];
if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
{
continue;
}
lit_code_point_t code_point = 0;
lit_code_point_t min_code_point = 0;
lit_utf8_size_t extra_bytes_count;
if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
{
extra_bytes_count = 1;
min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
}
else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
{
extra_bytes_count = 2;
min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
}
else if ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
{
extra_bytes_count = 3;
min_code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN;
code_point = ((uint32_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
}
else
{
/* utf-8 string could not contain 5- and 6-byte sequences. */
return false;
}
if (idx + extra_bytes_count > buf_size)
{
/* utf-8 string breaks in the middle */
return false;
}
for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset)
{
c = utf8_buf_p[idx + offset];
if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
{
/* invalid continuation byte */
return false;
}
code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK);
}
if (code_point < min_code_point
|| code_point > LIT_UNICODE_CODE_POINT_MAX)
{
/* utf-8 string doesn't encode valid unicode code point */
return false;
}
if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN
&& code_point <= LIT_UTF16_HIGH_SURROGATE_MAX)
{
is_prev_code_point_high_surrogate = true;
}
else if (code_point >= LIT_UTF16_LOW_SURROGATE_MIN
&& code_point <= LIT_UTF16_LOW_SURROGATE_MAX
&& is_prev_code_point_high_surrogate)
{
/* sequence of high and low surrogate is not allowed */
return false;
}
else
{
is_prev_code_point_high_surrogate = false;
}
idx += extra_bytes_count;
}
return true;
} /* lit_is_utf8_string_valid */
/**
 * Create an iterator that walks a utf-8 buffer as a sequence of code units
 *
 * The iterator starts at offset 0 of the buffer with no pending code point.
 *
 * @return iterator
 */
lit_utf8_iterator_t
lit_utf8_iterator_create (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
                          lit_utf8_size_t buf_size) /**< string size */
{
  /* A NULL buffer is acceptable only for an empty string. */
  JERRY_ASSERT (utf8_buf_p || !buf_size);

  /* Initializers follow the field order: offset, size, buffer, pending code point. */
  lit_utf8_iterator_t new_iter = { 0, buf_size, utf8_buf_p, LIT_UNICODE_CODE_POINT_NULL };

  return new_iter;
} /* lit_utf8_iterator_create */
/**
 * Get the low (second) surrogate of the surrogate pair that represents a code point above 0xFFFF
 *
 * @return low surrogate code unit: low surrogate marker combined with the 10 lowest bits of the code point
 */
static ecma_char_t
convert_code_point_to_low_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */
{
  JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);

  const ecma_char_t low_ten_bits = (ecma_char_t) (code_point & LIT_UTF16_LAST_10_BITS_MASK);

  return (ecma_char_t) (LIT_UTF16_LOW_SURROGATE_MARKER | low_ten_bits);
} /* convert_code_point_to_low_surrogate */
/**
 * Get the high (first) surrogate of the surrogate pair that represents a code point above 0xFFFF
 *
 * @return high surrogate code unit: high surrogate marker combined with the bits of
 *         (code point - 0x10000) above the 10 lowest ones
 */
static ecma_char_t
convert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */
{
  JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
  JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);

  const lit_code_point_t supplementary_offset = code_point - LIT_UTF16_FIRST_SURROGATE_CODE_POINT;
  const ecma_char_t high_bits = (ecma_char_t) (supplementary_offset >> LIT_UTF16_BITS_IN_SURROGATE);

  return (ecma_char_t) (LIT_UTF16_HIGH_SURROGATE_MARKER | high_bits);
} /* convert_code_point_to_high_surrogate */
/**
 * Get next code unit from the iterated string and increment iterator to point to next code unit
 *
 * A code point above 0xFFFF is returned as two code units on two consecutive calls:
 * the first call consumes the utf-8 sequence, returns the high surrogate and remembers the
 * code point in the iterator; the second call returns the low surrogate and clears it.
 *
 * @return next code unit
 */
ecma_char_t
lit_utf8_iterator_read_code_unit_and_increment (lit_utf8_iterator_t *buf_iter_p) /**< @in-out: utf-8 string iterator */
{
  JERRY_ASSERT (!lit_utf8_iterator_reached_buffer_end (buf_iter_p));

  if (buf_iter_p->code_point)
  {
    /* Second half of a surrogate pair is pending: the buffer offset was already advanced. */
    ecma_char_t code_unit = convert_code_point_to_low_surrogate (buf_iter_p->code_point);
    buf_iter_p->code_point = 0;
    return code_unit;
  }

  lit_code_point_t code_point;
  buf_iter_p->buf_offset += lit_read_code_point_from_utf8 (buf_iter_p->buf_p + buf_iter_p->buf_offset,
                                                           buf_iter_p->buf_size - buf_iter_p->buf_offset,
                                                           &code_point);

  if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
  {
    return (ecma_char_t) code_point;
  }

  /* Remember the code point, so that the next call can produce the low surrogate. */
  buf_iter_p->code_point = code_point;
  return convert_code_point_to_high_surrogate (code_point);
} /* lit_utf8_iterator_read_code_unit_and_increment */
/**
 * Check whether the iterator has nothing more to return
 *
 * The end is reached only when all bytes of the buffer are consumed and
 * there is no saved code point waiting to be returned as a low surrogate.
 *
 * @return true - the whole string was iterated
 *         false - otherwise
 */
bool
lit_utf8_iterator_reached_buffer_end (const lit_utf8_iterator_t *buf_iter_p) /**< utf-8 string iterator */
{
  JERRY_ASSERT (buf_iter_p->buf_offset <= buf_iter_p->buf_size);

  const bool has_no_pending_code_point = (buf_iter_p->code_point == LIT_UNICODE_CODE_POINT_NULL);
  const bool is_buffer_consumed = (buf_iter_p->buf_offset == buf_iter_p->buf_size);

  return (has_no_pending_code_point && is_buffer_consumed);
} /* lit_utf8_iterator_reached_buffer_end */
/**
 * Calculate size of a zero-terminated utf-8 string
 *
 * NOTE:
 *   string should not contain zero characters in the middle,
 *   because the size is measured up to the first zero byte
 *
 * @return size of a string in bytes, terminating zero is not counted
 */
lit_utf8_size_t
lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p) /**< zero-terminated utf-8 string */
{
  const char *zt_chars_p = (const char *) utf8_str_p;

  return (lit_utf8_size_t) strlen (zt_chars_p);
} /* lit_zt_utf8_string_size */
/**
 * Calculate length of a utf-8 string
 *
 * The string is iterated code unit by code unit, so a code point above 0xFFFF contributes two.
 *
 * @return UTF-16 code units count
 */
ecma_length_t
lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
                        lit_utf8_size_t utf8_buf_size) /**< string size */
{
  ecma_length_t code_units_count = 0;
  lit_utf8_iterator_t str_iter = lit_utf8_iterator_create (utf8_buf_p, utf8_buf_size);

  for (; !lit_utf8_iterator_reached_buffer_end (&str_iter); code_units_count++)
  {
    lit_utf8_iterator_read_code_unit_and_increment (&str_iter);
  }

  JERRY_ASSERT (lit_utf8_iterator_reached_buffer_end (&str_iter));

  return code_units_count;
} /* lit_utf8_string_length */
/**
 * Decode the unicode code point that starts at the beginning of a non-empty utf-8 buffer
 *
 * The first byte selects the sequence size (1 to 4 bytes) and provides the highest bits;
 * each following byte of the sequence contributes its 6 lowest bits.
 *
 * @return number of bytes occupied by code point in the string
 */
lit_utf8_size_t
lit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
                               lit_utf8_size_t buf_size, /**< size of the buffer in bytes */
                               lit_code_point_t *code_point) /**< @out: code point */
{
  JERRY_ASSERT (buf_p && buf_size);

  const lit_utf8_byte_t lead_byte = (uint8_t) buf_p[0];

  /* Fast path: single byte character */
  if ((lead_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
  {
    *code_point = (uint32_t) (lead_byte & LIT_UTF8_LAST_7_BITS_MASK);
    return 1;
  }

  ecma_length_t sequence_size = 0;
  lit_code_point_t decoded = LIT_UNICODE_CODE_POINT_NULL;

  if ((lead_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
  {
    sequence_size = 2;
    decoded = ((uint32_t) (lead_byte & LIT_UTF8_LAST_5_BITS_MASK));
  }
  else if ((lead_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
  {
    sequence_size = 3;
    decoded = ((uint32_t) (lead_byte & LIT_UTF8_LAST_4_BITS_MASK));
  }
  else if ((lead_byte & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
  {
    sequence_size = 4;
    decoded = ((uint32_t) (lead_byte & LIT_UTF8_LAST_3_BITS_MASK));
  }
  else
  {
    /* The byte can not start a utf-8 sequence */
    JERRY_ASSERT (false);
  }

  JERRY_ASSERT (buf_size >= sequence_size);

  /* Append payload bits of the continuation bytes */
  for (uint32_t pos = 1; pos < sequence_size; ++pos)
  {
    decoded = (decoded << LIT_UTF8_BITS_IN_EXTRA_BYTES) | (lit_code_point_t) (buf_p[pos] & LIT_UTF8_LAST_6_BITS_MASK);
  }

  *code_point = decoded;
  return sequence_size;
} /* lit_read_code_point_from_utf8 */
/**
 * Calculate hash from the last two bytes of the buffer
 *
 * A byte that is missing (buffer shorter than two bytes) is taken as zero.
 *
 * @return hash value
 */
lit_string_hash_t
lit_utf8_string_calc_hash_last_bytes (const lit_utf8_byte_t *utf8_buf_p, /**< bytes buffer */
                                      lit_utf8_size_t utf8_buf_size) /**< number of bytes in the buffer */
{
  JERRY_ASSERT (utf8_buf_p != NULL);

  lit_utf8_byte_t last_byte = 0;
  lit_utf8_byte_t prev_byte = 0;

  if (utf8_buf_size > 0)
  {
    last_byte = utf8_buf_p[utf8_buf_size - 1];

    if (utf8_buf_size > 1)
    {
      prev_byte = utf8_buf_p[utf8_buf_size - 2];
    }
  }

  /* Multiply the sum of the bytes, then fold the 32-bit product down to 8 bits with xor */
  const uint32_t bytes_sum = (uint32_t) last_byte + (uint32_t) prev_byte;
  const uint32_t product = bytes_sum * 0x24418b66;
  const uint32_t folded_16 = (product >> 16) ^ (product & 0xffffu);
  const uint32_t folded_8 = (folded_16 >> 8) ^ (folded_16 & 0xffu);

  return (lit_string_hash_t) folded_8;
} /* lit_utf8_string_calc_hash_last_bytes */
/**
 * Return code unit at the specified position in string
 *
 * The string is scanned from its beginning, one code unit at a time.
 *
 * NOTE:
 *   code_unit_offset should be less than string's length
 *
 * @return code unit value
 */
ecma_char_t
lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
                              lit_utf8_size_t utf8_buf_size, /**< string size in bytes */
                              ecma_length_t code_unit_offset) /**< offset of a code_unit */
{
  lit_utf8_iterator_t str_iter = lit_utf8_iterator_create (utf8_buf_p, utf8_buf_size);

  /* Code unit at offset 0 is always read ... */
  JERRY_ASSERT (!lit_utf8_iterator_reached_buffer_end (&str_iter));
  ecma_char_t current_unit = lit_utf8_iterator_read_code_unit_and_increment (&str_iter);

  /* ... followed by one more read for every position up to the requested one. */
  for (ecma_length_t passed = 0; passed < code_unit_offset; passed++)
  {
    JERRY_ASSERT (!lit_utf8_iterator_reached_buffer_end (&str_iter));
    current_unit = lit_utf8_iterator_read_code_unit_and_increment (&str_iter);
  }

  return current_unit;
} /* lit_utf8_string_code_unit_at */
/**
 * Get the size of a utf-8 byte sequence from the marker bits of its first byte
 *
 * @return size of a unicode character in utf-8 format (1 to 4 bytes)
 */
lit_utf8_size_t
lit_get_unicode_char_size_by_utf8_first_byte (lit_utf8_byte_t first_byte) /**< first byte of a utf-8 byte sequence */
{
  if ((first_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
  {
    return 1;
  }

  if ((first_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
  {
    return 2;
  }

  if ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
  {
    return 3;
  }

  /* Anything else is expected to be the first byte of a 4-byte sequence */
  JERRY_ASSERT ((first_byte & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER);
  return 4;
} /* lit_get_unicode_char_size_by_utf8_first_byte */
/**
 * Convert code unit to utf-8 representation
 *
 * The code unit is encoded in the same way as a code point with the same value.
 *
 * @return number of bytes stored to the buffer
 */
lit_utf8_size_t
lit_code_unit_to_utf8 (ecma_char_t code_unit, /**< code unit */
                       lit_utf8_byte_t *buf_p) /**< buffer where to store the result,
                                                *   its size should be at least MAX_BYTES_IN_CODE_UNIT */
{
  const lit_code_point_t code_point = code_unit;

  return lit_code_point_to_utf8 (code_point, buf_p);
} /* lit_code_unit_to_utf8 */
/**
 * Convert code point to utf-8 representation
 *
 * The sequence size is chosen by the value of the code point. Continuation bytes carry
 * 6 bits each and are produced from the last one to the second one; the remaining
 * highest bits go to the first byte together with the marker of the sequence size.
 *
 * @return number of bytes stored to the buffer
 */
lit_utf8_size_t
lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
                        lit_utf8_byte_t *buf) /**< buffer where to store the result,
                                               *   its size should be at least 4 bytes */
{
  if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
  {
    buf[0] = (lit_utf8_byte_t) code_point;
    return 1;
  }

  lit_utf8_size_t sequence_size;
  lit_utf8_byte_t first_byte_marker;
  lit_utf8_byte_t first_byte_mask;

  if (code_point <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
  {
    sequence_size = 2;
    first_byte_marker = LIT_UTF8_2_BYTE_MARKER;
    first_byte_mask = LIT_UTF8_LAST_5_BITS_MASK;
  }
  else if (code_point <= LIT_UTF8_3_BYTE_CODE_POINT_MAX)
  {
    sequence_size = 3;
    first_byte_marker = LIT_UTF8_3_BYTE_MARKER;
    first_byte_mask = LIT_UTF8_LAST_4_BITS_MASK;
  }
  else
  {
    JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MAX);

    sequence_size = 4;
    first_byte_marker = LIT_UTF8_4_BYTE_MARKER;
    first_byte_mask = LIT_UTF8_LAST_3_BITS_MASK;
  }

  uint32_t remaining_bits = code_point;

  /* Fill continuation bytes, starting from the last byte of the sequence */
  for (lit_utf8_size_t pos = sequence_size - 1; pos > 0; pos--)
  {
    const lit_utf8_byte_t payload_bits = (lit_utf8_byte_t) (remaining_bits & LIT_UTF8_LAST_6_BITS_MASK);

    buf[pos] = (lit_utf8_byte_t) (LIT_UTF8_EXTRA_BYTE_MARKER | payload_bits);
    remaining_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
  }

  /* Everything that is left has to fit into the payload of the first byte */
  const lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (remaining_bits & first_byte_mask);
  JERRY_ASSERT (first_byte_bits == remaining_bits);

  buf[0] = (lit_utf8_byte_t) (first_byte_marker | first_byte_bits);

  return sequence_size;
} /* lit_code_point_to_utf8 */
/**
 * Check two utf-8 strings for equality
 *
 * Strings are equal when their sizes are the same and their bytes are identical.
 *
 * @return true - if strings are equal;
 *         false - otherwise.
 */
bool
lit_compare_utf8_strings (const lit_utf8_byte_t *string1_p, /**< utf-8 string */
                          lit_utf8_size_t string1_size, /**< string size */
                          const lit_utf8_byte_t *string2_p, /**< utf-8 string */
                          lit_utf8_size_t string2_size) /**< string size */
{
  /* The bytes are compared only if the sizes match */
  return (string1_size == string2_size
          && memcmp (string1_p, string2_p, string1_size) == 0);
} /* lit_compare_utf8_strings */
/**
* Relational compare of utf-8 strings
*
* First string is less than second string if:
* - strings are not equal;
* - first string is prefix of second or is lexicographically less than second.
*
* @return true - if first string is less than second string,
* false - otherwise.
*/
bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**< utf-8 string */
lit_utf8_size_t string1_size, /**< string size */
const lit_utf8_byte_t *string2_p, /**< utf-8 string */
lit_utf8_size_t string2_size) /**< string size */
{
lit_utf8_iterator_t iter1 = lit_utf8_iterator_create (string1_p, string1_size);
lit_utf8_iterator_t iter2 = lit_utf8_iterator_create (string2_p, string2_size);
while (!lit_utf8_iterator_reached_buffer_end (&iter1)
&& !lit_utf8_iterator_reached_buffer_end (&iter2))
{
ecma_char_t code_point1 = lit_utf8_iterator_read_code_unit_and_increment (&iter1);
ecma_char_t code_point2 = lit_utf8_iterator_read_code_unit_and_increment (&iter2);
if (code_point1 < code_point2)
{
return true;
}
else if (code_point1 > code_point2)
{
return false;
}
}
return (lit_utf8_iterator_reached_buffer_end (&iter1) && !lit_utf8_iterator_reached_buffer_end (&iter2));
} /* lit_compare_utf8_strings_relational */
+80
View File
@@ -0,0 +1,80 @@
/* Copyright 2015 Samsung Electronics Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LIT_UNICODE_HELPERS_H
#define LIT_UNICODE_HELPERS_H
#include "jrt.h"
#include "lit-globals.h"
/**
 * Null character (used in a few cases as a utf-8 string end marker)
 */
#define LIT_BYTE_NULL (0)
/**
 * Represents an iterator over a utf-8 buffer
 *
 * Instances are obtained from lit_utf8_iterator_create and passed by pointer to the
 * other lit_utf8_iterator_* routines.
 */
typedef struct
{
lit_utf8_size_t buf_offset; /**< current offset in the buffer */
lit_utf8_size_t buf_size; /**< buffer length */
const lit_utf8_byte_t *buf_p; /**< buffer being iterated (not modified through the iterator) */
lit_code_point_t code_point; /**< code point is saved here when the processed Unicode character is higher than
                              *   0xFFFF.
                              *   NOTE(review): presumably kept so that the remaining code unit of the
                              *   surrogate pair can be returned by the next read - confirm against the
                              *   iterator implementation */
} lit_utf8_iterator_t;
/* validation of a utf-8 buffer given as (pointer, size) */
bool lit_is_utf8_string_valid (const lit_utf8_byte_t *, lit_utf8_size_t);
/* iteration: create an iterator over a (pointer, size) buffer, read one code unit while advancing the
 * iterator, and query whether the end of the buffer has been reached */
lit_utf8_iterator_t lit_utf8_iterator_create (const lit_utf8_byte_t *, lit_utf8_size_t);
ecma_char_t lit_utf8_iterator_read_code_unit_and_increment (lit_utf8_iterator_t *);
bool lit_utf8_iterator_reached_buffer_end (const lit_utf8_iterator_t *);
/* size (NOTE(review): "zt" presumably stands for zero-terminated - confirm against the implementation) */
lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *);
/* length (NOTE(review): presumably counted in code units, as ECMAScript string length is - confirm) */
ecma_length_t lit_utf8_string_length (const lit_utf8_byte_t *, lit_utf8_size_t);
/* hash */
lit_string_hash_t lit_utf8_string_calc_hash_last_bytes (const lit_utf8_byte_t *, lit_utf8_size_t);
/* code unit access */
ecma_char_t lit_utf8_string_code_unit_at (const lit_utf8_byte_t *, lit_utf8_size_t, ecma_length_t);
lit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (lit_utf8_byte_t);
/* conversion of a code unit / code point to utf-8, written into the supplied buffer
 * (NOTE(review): the return value is presumably the number of bytes written - confirm) */
lit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t, lit_utf8_byte_t *);
lit_utf8_size_t lit_code_point_to_utf8 (lit_code_point_t, lit_utf8_byte_t *);
/* comparison: each string is passed as a (pointer, size) pair */
/* equality: true when the sizes match and the buffers are byte-wise identical */
bool lit_compare_utf8_strings (const lit_utf8_byte_t *,
lit_utf8_size_t,
const lit_utf8_byte_t *,
lit_utf8_size_t);
/* relational: true when the first string is less than the second one (compared code unit by code unit;
 * a proper prefix is less than the longer string) */
bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p,
lit_utf8_size_t,
const lit_utf8_byte_t *string2_p,
lit_utf8_size_t);
/* read code point from buffer (NOTE(review): the lit_code_point_t pointer is presumably the output
 * parameter and the return value the number of bytes consumed - confirm against the implementation) */
lit_utf8_size_t lit_read_code_point_from_utf8 (const lit_utf8_byte_t *,
lit_utf8_size_t,
lit_code_point_t *);
#endif /* LIT_UNICODE_HELPERS_H */