Rework RegExp engine and add support for proper unicode matching (#3746)

This change includes several bugfixes, general improvements, and support
for additional features.
- Added full support for web compatibility syntax defined in Annex B
- Implemented parsing and matching patterns in unicode mode
- Fixed capture results when iterating with nested capturing groups
- Significantly reduced regexp bytecode size
- Reduced stack usage during regexp execution
- Improved matching performance

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu
This commit is contained in:
Dániel Bátyai
2020-05-26 15:28:54 +02:00
committed by GitHub
parent 908240ba62
commit 8f76a1f382
30 changed files with 3641 additions and 2647 deletions
+361
View File
@@ -0,0 +1,361 @@
// Copyright JS Foundation and other contributors, http://js.foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// DecimalEscape handling: \0, legacy octal escapes, and back references that
// are re-read as octal or identity escapes when the 'u' flag is absent.
var result;

// A group that refers to itself matches the empty string in both modes.
[/(\1)/, /(\1)/u].forEach(function (selfReference) {
  result = selfReference.exec("\u0001");
  assert (result !== null);
  assert (result[0].length === 0);
});

// Each entry is [pattern, subject, expected full match].
[
  [/\0/, "\u0000", "\u0000"],
  [/\0/u, "\u0000", "\u0000"],
  [/\000/, "\u0000", "\u0000"],
  [/\0000/, "\u0000\u0030", "\u0000\u0030"],
  [/\377/, "\u00ff", "\u00ff"],
  [/\3777/, "\u00ff\u0037", "\u00ff\u0037"],
  [/\400/, "\u0020\u0030", "\u0020\u0030"],
  [/(\2)/, "\u0002", '\u0002'],
  [/\8/, "\u0038", '8'],
  [/\99/, "\u0039\u0039", "99"]
].forEach(function (entry) {
  result = entry[0].exec(entry[1]);
  assert (result !== null);
  assert (result[0] === entry[2]);
});

// The same escapes must be rejected when the pattern is compiled with 'u'.
// Each entry is [pattern source, subject].
[
  ["\\000", "\u0000"],
  ["\\377", "\u00ff"],
  ["\\3777", "\u00ff\u0037"],
  ["\\400", "\u0020\u0030"],
  ["(\\2)", "\u0002"]
].forEach(function (entry) {
  try {
    new RegExp(entry[0], 'u').exec(entry[1]);
    assert (false);
  } catch (e) {
    assert (e instanceof SyntaxError);
  }
});
// CharClassEscape
// Every [pattern, subject] pair below is expected to match the whole subject.
[
  [/\d+/, "123"],
  [/\D+/, "abc"],
  [/\s+/, " "],
  [/\S+/, "abc"],
  [/\w+/, "abc"],
  [/\W+/, "|||"],
  [/\d+/u, "123"],
  [/\D+/u, "abc"],
  [/\s+/u, " "],
  [/\S+/u, "abc"],
  [/\w+/u, "abc"],
  [/\W+/u, "|||"],
  [/\D+/u, "\u{10CAF}"],
  [/\S+/u, "\u{10CAF}"],
  [/\W+/u, "\u{10CAF}"]
].forEach(function (pair) {
  assert (pair[0].exec(pair[1])[0] === pair[1]);
});

// The positive classes must not match a supplementary-plane letter.
[/\d+/u, /\s+/u, /\w+/u].forEach(function (positiveClass) {
  assert (positiveClass.exec("\u{10CAF}") === null);
});
// An incomplete \x escape is read as a literal "x" without the 'u' flag and is
// a SyntaxError with it.
result = /\xz/.exec("xz");
assert (result !== null);
assert (result[0] === "xz");
try {
  new RegExp("\\xz", "u").exec("xz");
  assert (false);
} catch (e) {
  assert (e instanceof SyntaxError);
}

// "\c" that is not followed by a control letter matches a literal backslash
// followed by "c" without the 'u' flag and is a SyntaxError with it.
result = /\c/.exec("\\c");
assert (result !== null);
assert (result[0] === "\\c");
try {
  new RegExp("\\c", 'u').exec("\\c");
  assert (false);
} catch (e) {
  assert (e instanceof SyntaxError);
}

// Outside of a character class a digit is not accepted as a control letter.
result = /\c1/.exec("\\c1");
assert (result !== null);
assert (result[0] === "\\c1");
try {
  new RegExp("\\c1", 'u').exec("\\c1");
  assert (false);
} catch (e) {
  assert (e instanceof SyntaxError);
}
// The assertions ^, $, \b and \B cannot take a quantifier, even without the
// 'u' flag.
["^+", "$+", "\\b+", "\\B+"].forEach(function (source) {
  try {
    new RegExp(source);
    assert (false);
  } catch (e) {
    assert (e instanceof SyntaxError);
  }
});
// Escapes inside a character class. Every [pattern, subject] pair is expected
// to match the whole (single character) subject.
[
  [/[\b]/, "\u0008"],
  [/[\b]/u, "\u0008"],
  [/[\B]/, "\u0042"],
  [/[\c1]/, "\u0011"],
  [/[\c_]/, "\u001f"],
  [/[\c]/, "\\"],
  [/[\c]/, "c"]
].forEach(function (pair) {
  assert (pair[0].exec(pair[1])[0] === pair[1]);
});

// \B is not a valid class escape once the 'u' flag is set.
try {
  new RegExp ("[\\B]", 'u').exec("\u0042");
  assert (false);
} catch (e) {
  assert (e instanceof SyntaxError);
}

// Neither are the legacy \c forms that are accepted above without the flag.
["[\\c1]", "[\\c]", "[\\c_]"].forEach(function (source) {
  try {
    new RegExp(source, 'u');
    assert (false);
  } catch (e) {
    assert (e instanceof SyntaxError);
  }
});
// A "{" that does not start a well-formed quantifier is an ordinary character
// without the 'u' flag and a SyntaxError with it.
// Each entry is [legacy pattern, the same pattern as source text, subject];
// the legacy pattern is expected to match the whole subject.
[
  [/{{1,2}/, "{{1,2}", "{{"],
  [/a{1,2/, "a{1,2", "a{1,2"]
].forEach(function (entry) {
  var legacyPattern = entry[0];
  var source = entry[1];
  var subject = entry[2];

  assert (legacyPattern.exec(subject)[0] === subject);
  try {
    new RegExp(source, 'u').exec(subject);
    assert (false);
  } catch (e) {
    assert (e instanceof SyntaxError);
  }
});
// Case folding, surrogate pairs and \u{...} with and without the 'u' flag.
// Each entry is [pattern, subject, expected full match]; an expected value of
// null means that exec must return null.
[
  [/\u017f/i, "s", null],
  [/\u017f/ui, "s", "s"],
  [/𐲯/, "𐲯", "𐲯"],
  [/𐲯/u, "𐲯", "𐲯"],
  [/𐲯*?/, "𐲯", "\ud803"],
  [/𐲯*?/u, "𐲯", ""],
  [/𐲯+/, "𐲯𐲯𐲯", "𐲯"],
  [/𐲯+/u, "𐲯𐲯𐲯", "𐲯𐲯𐲯"],
  [/\ud803\udc96*?/, "𐲖", '\ud803'],
  [/\ud803\udc96*?/u, "𐲖", ''],
  [/\ud803\udc96+/, "𐲖𐲖𐲖", '𐲖'],
  [/\ud803\udc96+/u, "𐲖𐲖𐲖", '𐲖𐲖𐲖'],
  [/.*𐲗𐲘/u, "𐲓𐲔𐲕𐲖𐲗𐲘", '𐲓𐲔𐲕𐲖𐲗𐲘'],
  [/[\u{10000}]/, "\u{10000}", null],
  [/[\u{10000}]/, "{", "{"],
  [/[^\u{10000}]/, "\u{10000}", "\ud800"],
  [/[^\u{10000}]/, "{", null],
  [/[\uffff]/, "\uffff", "\uffff"],
  [/[^\uffff]/, "\uffff", null],
  [/[\u{10000}]/u, "\u{10000}", "\u{10000}"],
  [/[\u{10000}]/u, "{", null],
  [/[^\u{10000}]/u, "\u{10000}", null],
  [/[^\u{10000}]/u, "{", "{"],
  [/[\uffff]/u, "\uffff", "\uffff"],
  [/[^\uffff]/u, "\uffff", null]
].forEach(function (entry) {
  var match = entry[0].exec(entry[1]);

  if (entry[2] === null) {
    assert (match === null);
  } else {
    assert (match[0] === entry[2]);
  }
});
// Quantifier bounds at and above 2^32 (4294967296) must still be accepted by
// the parser; the four character subject cannot satisfy them, so exec is
// expected to return null.
assert (/a{4294967296,4294967297}/.exec("aaaa") === null);
assert (/a{4294967294,4294967295}/.exec("aaaa") === null);
// Leading zeros in the bounds are accepted: this is expected to behave as {1,2}.
assert (/a{0000000000000000001,0000000000000000002}/u.exec("aaaa")[0] === 'aa');
// The escape in the pattern and the escape in the string literals are expected
// to denote the same text. NOTE(review): the string literals rely on a legacy
// octal escape, so this line only parses in sloppy-mode script code.
assert (/(\4294967297)/.exec("\4294967297")[0] === "\4294967297");
// A group that refers to itself matches the empty string.
assert (/(\1)/u.exec("aaaa")[0] === "");
// A lower bound greater than the upper bound is a SyntaxError.
try {
new RegExp("a{4294967295,4294967294}", '');
assert (false);
} catch (e) {
assert (e instanceof SyntaxError);
}
// Without the 'u' flag a class escape used as a range endpoint leaves the "-"
// as a literal member of the class. Classes made up of "-" alone match "-" in
// both modes.
[/[\d-\s]/, /[0-\s]/, /[\d-0]/, /[-]/, /[-]/u, /[--]/, /[--]/u].forEach(function (dashClass) {
  assert (dashClass.exec("-")[0] === "-");
});

// With the 'u' flag the ranges that involve a class escape are syntax errors.
["[\\d-\\s]", "[0-\\s]", "[\\d-0]"].forEach(function (source) {
  try {
    new RegExp(source, 'u').exec("-");
    assert (false);
  } catch (e) {
    assert (e instanceof SyntaxError);
  }
});
// A lone closing bracket is an ordinary character without the 'u' flag. With
// the flag it has to be escaped, and the bare form is a SyntaxError.
// Each entry is [legacy pattern, escaped unicode pattern, the bracket itself].
[
  [/}/, /\}/u, "}"],
  [/]/, /\]/u, "]"]
].forEach(function (entry) {
  var bracket = entry[2];

  assert (entry[0].exec(bracket)[0] === bracket);
  assert (entry[1].exec(bracket)[0] === bracket);
  try {
    new RegExp(bracket, 'u').exec(bracket);
    assert (false);
  } catch (e) {
    assert (e instanceof SyntaxError);
  }
});
// A lookahead may carry a quantifier when the 'u' flag is absent.
[/(?=)*/, /(?=)+/, /(?=){1,2}/].forEach(function (quantifiedLookahead) {
  assert (quantifiedLookahead.exec("")[0] === "");
});

// Quantified lookaheads are rejected in unicode mode, and a reversed
// {min,max} range is rejected regardless of the flags.
// Each entry is [pattern source, flags].
[
  ["(?=)*", 'u'],
  ["(?=)+", 'u'],
  ["(?=){1,2}", 'u'],
  ["(?=){2,1}", '']
].forEach(function (entry) {
  try {
    new RegExp(entry[0], entry[1]);
    assert (false);
  } catch (e) {
    assert (e instanceof SyntaxError);
  }
});
+3
View File
@@ -58,3 +58,6 @@ assert (r.exec("a") == "a");
// No alternative can match the single character "b" ("bb" needs two).
r = new RegExp ("a|bb|c|d");
assert (r.exec("b") == undefined);

// Neither "a" nor "b" is followed by a word boundary in the subject, so the
// match has to come from the second alternative.
r = new RegExp("(?:a|b)\\b|\\.\\w+", "g");
assert (r.exec("name.lower()")[0] === ".lower");
+3
View File
@@ -24,3 +24,6 @@ assert (r == undefined);
// The group takes part in no iteration here, so it stays undefined and the
// back reference to it matches the empty string.
r = new RegExp ("(a)*b\\1").exec("b");
assert (r[0] == "b");
assert (r[1] == undefined);
// An optional class holding a literal "[" in front of a captured "a" and a
// back reference to it.
assert (JSON.stringify (/[[]?(a)\1/.exec("aa")) === '["aa","a"]');
// A quantified back reference placed before the (empty) group it refers to;
// both the match and the capture are expected to be empty strings.
assert (JSON.stringify (/\1{2,5}()\B/.exec("asd")) === '["",""]');
+115
View File
@@ -0,0 +1,115 @@
// Copyright JS Foundation and other contributors, http://js.foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Capture results of groups nested inside quantified groups and alternations.
// Each entry is [pattern, subject, expected JSON.stringify of the exec result].
[
  [/(?:(a)*){3,}/, "aaaab", '["aaaa",null]'],
  [/((a)*){3,}/, "aaaab", '["aaaa","",null]'],
  [/((a)+){3,}/, "aaaab", '["aaaa","a","a"]'],
  [/((.)*){3,}/, "abcd", '["abcd","",null]'],
  [/((.)+){3,}/, "abcd", '["abcd","d","d"]'],
  [/((.){1,2}){1,2}/, "abc", '["abc","c","c"]'],
  [/(?:(a)*?)asd/, "aaasd", '["aaasd","a"]'],
  [/(?:(a)*)asd/, "aaasd", '["aaasd","a"]'],
  [/(.)*((a)*|(b)*)/, "ab", '["ab","b","",null,null]'],
  [/(.)*((x)|(y))+/, "xy", '["xy","x","y",null,"y"]'],
  [/(.)*((y)|(x))+/, "xy", '["xy","x","y","y",null]'],
  [/((?:a)*)/, "aaaad", '["aaaa","aaaa"]'],
  [/((y)+|x)+/, "x", '["x","x",null]'],
  [/((?:y)*|x)+/, "x", '["x","x"]'],
  [/((y)*|x)+/, "x", '["x","x",null]'],
  [/((y)*|x)*/, "x", '["x","x",null]'],
  [/(?:(y)*|x)*/, "x", '["x",null]'],
  [/(?:(y)*|(x))*/, "x", '["x",null,"x"]'],
  [/((?:a)*)asd/, "aaasd", '["aaasd","aa"]'],
  [/((?:a)+)asd/, "aaasd", '["aaasd","aa"]'],
  [/((?:a)*?)asd/, "aaasd", '["aaasd","aa"]'],
  [/((?:a)+?)asd/, "aaasd", '["aaasd","aa"]'],
  [/((y)|(z)|(a))*/, "yazx", '["yaz","z",null,"z",null]'],
  [/((y)|(z)|(.))*/, "yaz", '["yaz","z",null,"z",null]']
].forEach(function (entry) {
  assert (JSON.stringify (entry[0].exec(entry[1])) === entry[2]);
});
// Captures of alternatives that did not take part in the final iteration of a
// quantified group are reported as undefined (null once stringified).
assert (JSON.stringify (/((y)*|(z)*|(a)*)*/.exec("yazx")) === '["yaz","z",null,"z",null]');
assert (JSON.stringify (/(?:(y)|(z)|(a))*/.exec("yazx")) === '["yaz",null,"z",null]');
assert (JSON.stringify (/((y)|(z)|(a))+?/.exec("yazx")) === '["y","y","y",null,null]');
assert (JSON.stringify (/(?:(y)|(z)|(a))+?/.exec("yazx")) === '["y","y",null,null]');
assert (JSON.stringify (/(?:(x|y)*|z)*/.exec("yz")) === '["yz",null]');
assert (JSON.stringify (/((x|y)*|z)*/.exec("yz")) === '["yz","z",null]');
assert (JSON.stringify (/(((x|y)*|(v|w)*|z)*)asd/.exec("xyzwvxzasd")) === '["xyzwvxzasd","xyzwvxz","z",null,null]');

// Iterations that are only needed to reach the lower bound may match the empty
// string; the captures then reflect that last, empty iteration.
assert (JSON.stringify (/((a)*){1,3}b/.exec("ab")) === '["ab","a","a"]');
assert (JSON.stringify (/((a)*){2,3}b/.exec("ab")) === '["ab","",null]');
assert (JSON.stringify (/((a)*){3,3}b/.exec("ab")) === '["ab","",null]');
assert (JSON.stringify (/((a)*){3,}b/.exec("aaaab")) === '["aaaab","",null]');
// Backtracking into groups, greedy and lazy quantifiers, and quantified
// lookaheads that contain capturing groups.
// Each entry is [pattern, subject, expected JSON.stringify of the exec result].
[
  [/((a)*)*b/, "aaaab", '["aaaab","aaaa","a"]'],
  [/((bb?)*)*a/, "bbba", '["bbba","bbb","b"]'],
  [/((b)*)*a/, "bbba", '["bbba","bbb","b"]'],
  [/(aa|a)a/, "aa", '["aa","a"]'],
  [/(aa|a)?a/, "aa", '["aa","a"]'],
  [/(aa|a)+?a/, "aa", '["aa","a"]'],
  [/(?:aa|a)a/, "aa", '["aa"]'],
  [/(?:aa|a)?a/, "aa", '["aa"]'],
  [/(?:aa|a)+?a/, "aa", '["aa"]'],
  [/(aa|a)a/, "a", 'null'],
  [/(aa|a)?a/, "a", '["a",null]'],
  [/(aa|a)+?a/, "a", 'null'],
  [/(?:aa|a)a/, "a", 'null'],
  [/(?:aa|a)?a/, "a", '["a"]'],
  [/(?:aa|a)+?a/, "a", 'null'],
  [/a+/, "aaasd", '["aaa"]'],
  [/a+?/, "aaasd", '["a"]'],
  [/a+sd/, "aaasd", '["aaasd"]'],
  [/a+?sd/, "aaasd", '["aaasd"]'],
  [/a{2}sd/, "aaasd", '["aasd"]'],
  [/a{3}sd/, "aaasd", '["aaasd"]'],
  [/(?=a)/, "a", '[""]'],
  [/(?=a)+/, "a", '[""]'],
  [/(?=a)*/, "a", '[""]'],
  [/(?=(a))?/, "a", '["",null]'],
  [/(?=(a))+?/, "a", '["","a"]'],
  [/(?=(a))*?/, "a", '["",null]'],
  [/(?!a)/, "a", '[""]'],
  [/(?!a)+/, "a", '[""]'],
  [/(?!a)*/, "a", '[""]'],
  [/(?!(a))?/, "a", '["",null]'],
  [/(?!(a))+?/, "a", '["",null]'],
  [/(?!(a))*?/, "a", '["",null]'],
  [/al(?=(ma))*ma/, "alma", '["alma",null]'],
  [/al(?!(ma))*ma/, "alma", '["alma",null]'],
  [/al(?=(ma))+ma/, "alma", '["alma","ma"]'],
  [/al(?!(ma))+ma/, "alma", 'null'],
  [/(?=())x|/, "asd", '["",null]'],
  [/(?!())x|/, "asd", '["",null]'],
  [/(().*)+.$/, "abcdefg", '["abcdefg","abcdef",""]'],
  [/(().*)+?.$/, "abcdefg", '["abcdefg","abcdef",""]'],
  [/(?:().*)+.$/, "abcdefg", '["abcdefg",""]'],
  [/(?:().*)+?.$/, "abcdefg", '["abcdefg",""]'],
  [/((?=())|.)+^/, "a", '["","",""]'],
  [/(?:(|\b\w+?){2})+$/, "aaaa", '["aaaa","aaaa"]']
].forEach(function (entry) {
  assert (JSON.stringify (entry[0].exec(entry[1])) === entry[2]);
});
+9
View File
@@ -196,3 +196,12 @@ assert (r.exec("aa") == "aa,a");
r = new RegExp ("(a{0,1}?){0,1}a");
assert (r.exec("aa") == "aa,a");

// The empty alternative comes first in every iteration; the loop is still
// expected to consume the whole subject and to report the last character.
r = new RegExp ("(|.)+");
assert (JSON.stringify (r.exec("asdfgh")) === '["asdfgh","h"]');

// Groups that can match the empty string inside counted and lazy loops.
assert (JSON.stringify (/([^\W](){8,}?){5}/.exec("asdfghijk")) === '["asdfg","g",""]');
assert (JSON.stringify (/(()+?(.+)|){3,}./u.exec("asdfghi")) === '["asdfghi","",null,null]');
assert (JSON.stringify (/(()+?(.+)|){3,}?./u.exec("asdfghi")) === '["asdfghi","",null,null]');
assert (JSON.stringify (/(?:()+?(.+)|){3,}./u.exec("asdfghi")) === '["asdfghi",null,null]');
assert (JSON.stringify (/(?:()+?(.+)|){3,}?./u.exec("asdfghi")) === '["asdfghi",null,null]');
@@ -88,3 +88,6 @@ assert (r.exec ("\\c3") == "\\c3");
// \cI is expected to match the TAB character.
r = /\cIasd/;
assert (r.exec ("\tasd") == "\tasd");
// A lazy optional "." anchored by "$": the expected result is the last
// character of the subject rather than an empty match.
r = /.??$/;
assert (JSON.stringify (r.exec("asd")) === '["d"]');
+1 -1
View File
@@ -13,7 +13,7 @@
// limitations under the License.
try {
/(?:(?=x)){1000}xyz/.exec('xyz');
/(?:(?=x)){10000}xyz/.exec('xyz');
assert(false);
} catch (e) {
assert(e instanceof RangeError);
+2
View File
@@ -85,3 +85,5 @@ assert("\u000A\u000D\u2028\u202911".trim() === "11");
// White space and line terminators are stripped from both ends of the string.
assert("\u0009\u000B\u000C\u0020\u00A01\u0009\u000B\u000C\u0020\u00A0".trim() === "1");
assert("\u000A\u000D\u2028\u202911\u000A\u000D\u2028\u2029".trim() === "11");
// U+200B (ZERO WIDTH SPACE) is expected to be left in place by trim().
assert("\u200B".trim() === "\u200B");