mirror of
https://github.com/zebrajr/ladybird.git
synced 2025-12-06 12:20:00 +01:00
LibRegex: Fix backreferences to undefined capture groups
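Fixes handling of backreferences when the referenced capture group is undefined or hasn't participated in the match. CharacterCompareType::NamedReference is added to distinguish numbered (\1) from named (\k<name>) backreferences. Numbered backreferences look up exactly the group with that number. Named backreferences scan all capture groups that share the referenced name (duplicate named groups) and compare against the first one that participated in the match; if none participated, the backreference matches the empty string.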
This commit is contained in:
parent
9b8f6b8108
commit
c4eef822de
|
|
@ -609,12 +609,21 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
|||
}
|
||||
case CharacterCompareType::Reference: {
|
||||
auto reference_number = ((size_t)m_bytecode->at(offset++)) - 1;
|
||||
if (input.match_index >= state.capture_group_matches_size())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
if (input.match_index >= state.capture_group_matches_size()) {
|
||||
had_zero_length_match = true;
|
||||
if (current_inversion_state())
|
||||
inverse_matched = true;
|
||||
break;
|
||||
}
|
||||
|
||||
auto groups = state.capture_group_matches(input.match_index);
|
||||
if (groups.size() <= reference_number)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
if (groups.size() <= reference_number) {
|
||||
had_zero_length_match = true;
|
||||
if (current_inversion_state())
|
||||
inverse_matched = true;
|
||||
break;
|
||||
}
|
||||
|
||||
auto str = groups.at(reference_number).view;
|
||||
|
||||
|
|
@ -628,6 +637,59 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
|||
}
|
||||
break;
|
||||
}
|
||||
case CharacterCompareType::NamedReference: {
|
||||
auto reference_number = ((size_t)m_bytecode->at(offset++)) - 1;
|
||||
|
||||
if (input.match_index >= state.capture_group_matches_size()) {
|
||||
had_zero_length_match = true;
|
||||
if (current_inversion_state())
|
||||
inverse_matched = true;
|
||||
break;
|
||||
}
|
||||
|
||||
auto groups = state.capture_group_matches(input.match_index);
|
||||
|
||||
if (groups.size() <= reference_number) {
|
||||
had_zero_length_match = true;
|
||||
if (current_inversion_state())
|
||||
inverse_matched = true;
|
||||
break;
|
||||
}
|
||||
|
||||
RegexStringView str {};
|
||||
|
||||
auto reference_name_index = m_bytecode->get_group_name_index(reference_number);
|
||||
|
||||
if (reference_name_index.has_value()) {
|
||||
auto target_name_string = m_bytecode->get_string(reference_name_index.value());
|
||||
|
||||
for (size_t i = 0; i < groups.size(); ++i) {
|
||||
if (groups[i].view.is_null())
|
||||
continue;
|
||||
|
||||
auto group_name_index = m_bytecode->get_group_name_index(i);
|
||||
|
||||
if (group_name_index.has_value()) {
|
||||
auto group_name_string = m_bytecode->get_string(group_name_index.value());
|
||||
|
||||
if (group_name_string == target_name_string) {
|
||||
str = groups[i].view;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (input.view.length() < state.string_position + str.length()) {
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
}
|
||||
|
||||
if (compare_string(input, state, str, had_zero_length_match)) {
|
||||
if (current_inversion_state())
|
||||
inverse_matched = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case CharacterCompareType::Property: {
|
||||
auto property = static_cast<Unicode::Property>(m_bytecode->at(offset++));
|
||||
compare_property(input, state, property, current_inversion_state(), inverse_matched);
|
||||
|
|
@ -946,6 +1008,9 @@ Vector<CompareTypeAndValuePair> OpCode_Compare::flat_compares() const
|
|||
} else if (compare_type == CharacterCompareType::Reference) {
|
||||
auto ref = m_bytecode->at(offset++);
|
||||
result.append({ compare_type, ref });
|
||||
} else if (compare_type == CharacterCompareType::NamedReference) {
|
||||
auto ref = m_bytecode->at(offset++);
|
||||
result.append({ compare_type, ref });
|
||||
} else if (compare_type == CharacterCompareType::String) {
|
||||
auto& length = m_bytecode->at(offset++);
|
||||
for (size_t k = 0; k < length; ++k)
|
||||
|
|
@ -1028,6 +1093,24 @@ Vector<ByteString> OpCode_Compare::variable_arguments_to_byte_string(Optional<Ma
|
|||
result.empend(ByteString::formatted(" (invalid index {}, max={})", input->match_index, state().capture_group_matches_size() - 1));
|
||||
}
|
||||
}
|
||||
} else if (compare_type == CharacterCompareType::NamedReference) {
|
||||
auto ref = m_bytecode->at(offset++);
|
||||
result.empend(ByteString::formatted(" named_number={}", ref));
|
||||
if (input.has_value()) {
|
||||
if (state().capture_group_matches_size() > input->match_index) {
|
||||
auto match = state().capture_group_matches(input->match_index);
|
||||
if (match.size() > ref) {
|
||||
auto& group = match[ref];
|
||||
result.empend(ByteString::formatted(" left={}", group.left_column));
|
||||
result.empend(ByteString::formatted(" right={}", group.left_column + group.view.length_in_code_units()));
|
||||
result.empend(ByteString::formatted(" contents='{}'", group.view));
|
||||
} else {
|
||||
result.empend(ByteString::formatted(" (invalid ref {}, max={})", ref, match.size() - 1));
|
||||
}
|
||||
} else {
|
||||
result.empend(ByteString::formatted(" (invalid index {}, max={})", input->match_index, state().capture_group_matches_size() - 1));
|
||||
}
|
||||
}
|
||||
} else if (compare_type == CharacterCompareType::String) {
|
||||
auto& length = m_bytecode->at(offset++);
|
||||
StringBuilder str_builder;
|
||||
|
|
|
|||
|
|
@ -69,6 +69,7 @@ enum class OpCodeId : ByteCodeValueType {
|
|||
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
|
||||
|
|
@ -261,6 +262,11 @@ public:
|
|||
FlyString get_string(size_t index) const { return m_string_table.get(index); }
|
||||
auto const& string_table() const { return m_string_table; }
|
||||
|
||||
// Looks up group_index in m_group_name_mappings and returns the stored value wrapped in an Optional.
// Entries are written by insert_bytecode_group_capture_right() as
// (capture_groups_count - 1) -> string-table index of the group's name, so the value returned here
// is meant to be handed to get_string(); callers check has_value() before doing so.
// NOTE(review): presumably the Optional is empty for groups that were never registered with a name
// (i.e. unnamed groups) - confirm against HashMap::get.
Optional<size_t> get_group_name_index(size_t group_index) const
|
||||
{
|
||||
return m_group_name_mappings.get(group_index);
|
||||
}
|
||||
|
||||
void last_chunk() const = delete;
|
||||
void first_chunk() const = delete;
|
||||
|
||||
|
|
@ -279,6 +285,10 @@ public:
|
|||
m_string_table.m_table.set(entry.key, entry.value);
|
||||
}
|
||||
m_string_table.m_inverse_table.update(other.m_string_table.m_inverse_table);
|
||||
|
||||
for (auto const& mapping : other.m_group_name_mappings) {
|
||||
m_group_name_mappings.set(mapping.key, mapping.value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -326,8 +336,11 @@ public:
|
|||
// Appends a SaveRightNamedCaptureGroup instruction for a named capture group: the opcode id,
// then the string-table index of `name`, then `capture_groups_count`.
// It also records (capture_groups_count - 1) -> name index in m_group_name_mappings, which is what
// get_group_name_index() reads back.
// NOTE(review): this rendering of the diff shows the row that the commit replaced
// (`empend(m_string_table.set(move(name)))`) right next to its replacement rows; presumably only the
// `name_string_index` form exists in the committed file - confirm against the raw diff.
void insert_bytecode_group_capture_right(size_t capture_groups_count, FlyString name)
|
||||
{
|
||||
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup));
|
||||
empend(m_string_table.set(move(name)));
|
||||
// Intern the name once and keep the index, since it is needed both as an operand and as the map value.
auto name_string_index = m_string_table.set(move(name));
|
||||
empend(name_string_index);
|
||||
empend(capture_groups_count);
|
||||
|
||||
// Keyed by capture_groups_count - 1; the NamedReference comparison likewise subtracts one from its
// operand before calling get_group_name_index().
m_group_name_mappings.set(capture_groups_count - 1, name_string_index);
|
||||
}
|
||||
|
||||
enum class LookAroundType {
|
||||
|
|
@ -618,6 +631,7 @@ private:
|
|||
static bool s_opcodes_initialized;
|
||||
static size_t s_next_checkpoint_serial_id;
|
||||
StringTable m_string_table;
|
||||
HashMap<size_t, size_t> m_group_name_mappings;
|
||||
};
|
||||
|
||||
#define ENUMERATE_EXECUTION_RESULTS \
|
||||
|
|
|
|||
|
|
@ -131,6 +131,7 @@ static bool interpret_compares(Vector<CompareTypeAndValuePair> const& lhs, Stati
|
|||
// We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
|
||||
return false;
|
||||
case CharacterCompareType::Reference:
|
||||
case CharacterCompareType::NamedReference:
|
||||
// We've handled this before coming here.
|
||||
break;
|
||||
case CharacterCompareType::Property:
|
||||
|
|
@ -512,6 +513,7 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
|
|||
// We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
|
||||
return true;
|
||||
case CharacterCompareType::Reference:
|
||||
case CharacterCompareType::NamedReference:
|
||||
// We've handled this before coming here.
|
||||
break;
|
||||
case CharacterCompareType::Property:
|
||||
|
|
@ -755,7 +757,7 @@ static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_preconditi
|
|||
break;
|
||||
|
||||
if (any_of(compares, [&](auto& compare) {
|
||||
return compare.type == CharacterCompareType::AnyChar || compare.type == CharacterCompareType::Reference;
|
||||
return compare.type == CharacterCompareType::AnyChar || compare.type == CharacterCompareType::Reference || compare.type == CharacterCompareType::NamedReference;
|
||||
}))
|
||||
return AtomicRewritePreconditionResult::NotSatisfied;
|
||||
|
||||
|
|
@ -1835,6 +1837,7 @@ static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCod
|
|||
case CharacterCompareType::And:
|
||||
return LookupTableInsertionOutcome::FlushOnInsertion;
|
||||
case CharacterCompareType::Reference:
|
||||
case CharacterCompareType::NamedReference:
|
||||
case CharacterCompareType::Property:
|
||||
case CharacterCompareType::GeneralCategory:
|
||||
case CharacterCompareType::Script:
|
||||
|
|
|
|||
|
|
@ -496,7 +496,6 @@ bool PosixBasicParser::parse_nonduplicating_re(ByteCode& bytecode, size_t& match
|
|||
if (try_skip({ backref_name, 2 })) {
|
||||
if (!m_capture_group_seen[i - 1])
|
||||
return set_error(Error::InvalidNumber);
|
||||
match_length_minimum += m_capture_group_minimum_lengths[i - 1];
|
||||
bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)i } });
|
||||
return true;
|
||||
}
|
||||
|
|
@ -1656,8 +1655,7 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
|||
return false;
|
||||
}
|
||||
|
||||
match_length_minimum += maybe_length.value();
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)group_index } });
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, (ByteCodeValueType)group_index } });
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1379,3 +1379,93 @@ TEST_CASE(account_for_opcode_size_calculating_incoming_jump_edges)
|
|||
EXPECT_EQ(result.matches.first().view.to_byte_string(), "aa"sv);
|
||||
}
|
||||
}
|
||||
|
||||
// Exercises ECMA262 backreferences whose target group is null at the point of the reference:
// duplicate named groups where only one alternative captured, a numbered group that is null after
// a negative lookahead, and named groups where no alternative captured at all.
// Every sub-case expects the overall match to succeed and checks which capture views are null.
TEST_CASE(backreference_to_undefined_capture_groups)
|
||||
{
|
||||
{
|
||||
// Test duplicate named groups in alternatives where backreference refers to participating group
// (second alternative captures "b", so \k<x> must compare against group index 1, not the null group 0)
|
||||
Regex<ECMA262> re("(?:(?<x>a)|(?<x>b))\\k<x>"sv);
|
||||
auto result = re.match("bb"sv);
|
||||
|
||||
EXPECT_EQ(result.success, true);
|
||||
EXPECT_EQ(result.matches.size(), 1u);
|
||||
EXPECT_EQ(result.matches.first().view.to_byte_string(), "bb"sv);
|
||||
EXPECT_EQ(result.capture_group_matches.first().size(), 2u);
|
||||
EXPECT(result.capture_group_matches.first()[0].view.is_null());
|
||||
EXPECT_EQ(result.capture_group_matches.first()[1].view.to_byte_string(), "b"sv);
|
||||
}
|
||||
|
||||
{
|
||||
// Test duplicate named groups with quantifier
// (after the second repetition only the "b" group is expected to hold a capture)
|
||||
Regex<ECMA262> re("(?:(?:(?<x>a)|(?<x>b))\\k<x>){2}"sv);
|
||||
auto result = re.match("aabb"sv);
|
||||
|
||||
EXPECT_EQ(result.success, true);
|
||||
EXPECT_EQ(result.matches.size(), 1u);
|
||||
EXPECT_EQ(result.matches.first().view.to_byte_string(), "aabb"sv);
|
||||
EXPECT_EQ(result.capture_group_matches.first().size(), 2u);
|
||||
EXPECT(result.capture_group_matches.first()[0].view.is_null());
|
||||
EXPECT_EQ(result.capture_group_matches.first()[1].view.to_byte_string(), "b"sv);
|
||||
}
|
||||
|
||||
{
|
||||
// Test that first alternative works too
|
||||
Regex<ECMA262> re("(?:(?<x>a)|(?<x>b))\\k<x>"sv);
|
||||
auto result = re.match("aa"sv);
|
||||
|
||||
EXPECT_EQ(result.success, true);
|
||||
EXPECT_EQ(result.matches.size(), 1u);
|
||||
EXPECT_EQ(result.matches.first().view.to_byte_string(), "aa"sv);
|
||||
EXPECT_EQ(result.capture_group_matches.first().size(), 2u);
|
||||
EXPECT_EQ(result.capture_group_matches.first()[0].view.to_byte_string(), "a"sv);
|
||||
EXPECT(result.capture_group_matches.first()[1].view.is_null());
|
||||
}
|
||||
|
||||
{
|
||||
// Test numbered backreference to undefined group
// (group 2 only appears inside the negative lookahead and is expected to be null afterwards,
// so the trailing \2 has to match the empty string for the whole input to match)
|
||||
Regex<ECMA262> re("(.*?)a(?!(a+)b\\2c)\\2(.*)"sv);
|
||||
auto result = re.match("baaabaac"sv);
|
||||
|
||||
EXPECT_EQ(result.success, true);
|
||||
EXPECT_EQ(result.matches.size(), 1u);
|
||||
EXPECT_EQ(result.matches.first().view.to_byte_string(), "baaabaac"sv);
|
||||
EXPECT_EQ(result.capture_group_matches.first().size(), 3u);
|
||||
EXPECT_EQ(result.capture_group_matches.first()[0].view.to_byte_string(), "ba"sv);
|
||||
EXPECT(result.capture_group_matches.first()[1].view.is_null());
|
||||
EXPECT_EQ(result.capture_group_matches.first()[2].view.to_byte_string(), "abaac"sv);
|
||||
}
|
||||
|
||||
{
|
||||
// Named backreference where none of the same-named groups participates
Regex<ECMA262> re("^(?:(?<a>x)|(?<a>y)|z)\\k<a>$"sv);
|
||||
|
||||
// Third alternative matches and backreference is undefined
|
||||
auto result1 = re.match("z"sv);
|
||||
EXPECT_EQ(result1.success, true);
|
||||
EXPECT_EQ(result1.matches.size(), 1u);
|
||||
EXPECT_EQ(result1.matches.first().view.to_byte_string(), "z"sv);
|
||||
EXPECT_EQ(result1.capture_group_matches.first().size(), 2u);
|
||||
EXPECT(result1.capture_group_matches.first()[0].view.is_null());
|
||||
EXPECT(result1.capture_group_matches.first()[1].view.is_null());
|
||||
}
|
||||
|
||||
{
|
||||
// Quantified version of the above pattern
// (the expectations require both named groups to be null after the second repetition took the
// "z" alternative, even though the first repetition captured "x" or "y")
|
||||
Regex<ECMA262> re("^(?:(?<a>x)|(?<a>y)|z){2}\\k<a>$"sv);
|
||||
|
||||
auto result1 = re.match("xz"sv);
|
||||
EXPECT_EQ(result1.success, true);
|
||||
EXPECT_EQ(result1.matches.size(), 1u);
|
||||
EXPECT_EQ(result1.matches.first().view.to_byte_string(), "xz"sv);
|
||||
EXPECT_EQ(result1.capture_group_matches.first().size(), 2u);
|
||||
EXPECT(result1.capture_group_matches.first()[0].view.is_null());
|
||||
EXPECT(result1.capture_group_matches.first()[1].view.is_null());
|
||||
|
||||
auto result2 = re.match("yz"sv);
|
||||
EXPECT_EQ(result2.success, true);
|
||||
EXPECT_EQ(result2.matches.size(), 1u);
|
||||
EXPECT_EQ(result2.matches.first().view.to_byte_string(), "yz"sv);
|
||||
EXPECT_EQ(result2.capture_group_matches.first().size(), 2u);
|
||||
EXPECT(result2.capture_group_matches.first()[0].view.is_null());
|
||||
EXPECT(result2.capture_group_matches.first()[1].view.is_null());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user