LibRegex: Fix backreferences to undefined capture groups

Fixes handling of backreferences when the referenced capture group is
undefined or hasn't participated in the match.
CharacterCompareType::NamedReference is added to distinguish numbered
(\1) from named (\k<name>) backreferences. Numbered backreferences use
exact group lookup. Named backreferences search for participating
groups among duplicates.
This commit is contained in:
aplefull 2025-07-23 20:48:34 +02:00 committed by Ali Mohammad Pur
parent 9b8f6b8108
commit c4eef822de
5 changed files with 197 additions and 9 deletions

View File

@ -609,12 +609,21 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
}
case CharacterCompareType::Reference: {
auto reference_number = ((size_t)m_bytecode->at(offset++)) - 1;
if (input.match_index >= state.capture_group_matches_size())
return ExecutionResult::Failed_ExecuteLowPrioForks;
if (input.match_index >= state.capture_group_matches_size()) {
had_zero_length_match = true;
if (current_inversion_state())
inverse_matched = true;
break;
}
auto groups = state.capture_group_matches(input.match_index);
if (groups.size() <= reference_number)
return ExecutionResult::Failed_ExecuteLowPrioForks;
if (groups.size() <= reference_number) {
had_zero_length_match = true;
if (current_inversion_state())
inverse_matched = true;
break;
}
auto str = groups.at(reference_number).view;
@ -628,6 +637,59 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
}
break;
}
// Named backreference (\k<name>): the operand read from the bytecode, minus one, is the
// index of the referenced group within this match's capture groups. The text actually
// compared is taken from the first group sharing that group's name whose view is non-null.
case CharacterCompareType::NamedReference: {
auto reference_number = ((size_t)m_bytecode->at(offset++)) - 1;
// No capture groups are stored for this match index: do not fail, record a
// zero-length match instead (and flag inverse_matched when inversion is active).
if (input.match_index >= state.capture_group_matches_size()) {
had_zero_length_match = true;
if (current_inversion_state())
inverse_matched = true;
break;
}
auto groups = state.capture_group_matches(input.match_index);
// The referenced index lies beyond the stored groups: handled the same way as above.
if (groups.size() <= reference_number) {
had_zero_length_match = true;
if (current_inversion_state())
inverse_matched = true;
break;
}
// Stays default-constructed unless a same-named group with a non-null view is found below.
RegexStringView str {};
auto reference_name_index = m_bytecode->get_group_name_index(reference_number);
if (reference_name_index.has_value()) {
auto target_name_string = m_bytecode->get_string(reference_name_index.value());
// Scan every group in index order; names are compared as strings, not by
// string-table index.
for (size_t i = 0; i < groups.size(); ++i) {
// Groups whose view is null are skipped.
if (groups[i].view.is_null())
continue;
auto group_name_index = m_bytecode->get_group_name_index(i);
if (group_name_index.has_value()) {
auto group_name_string = m_bytecode->get_string(group_name_index.value());
if (group_name_string == target_name_string) {
// First same-named group with a non-null view wins.
str = groups[i].view;
break;
}
}
}
}
// Not enough input left after the current position to hold the captured text.
if (input.view.length() < state.string_position + str.length()) {
return ExecutionResult::Failed_ExecuteLowPrioForks;
}
// NOTE(review): with an empty `str` this relies on compare_string() to report the
// zero-length match through had_zero_length_match — confirm against its definition.
if (compare_string(input, state, str, had_zero_length_match)) {
if (current_inversion_state())
inverse_matched = true;
}
break;
}
case CharacterCompareType::Property: {
auto property = static_cast<Unicode::Property>(m_bytecode->at(offset++));
compare_property(input, state, property, current_inversion_state(), inverse_matched);
@ -946,6 +1008,9 @@ Vector<CompareTypeAndValuePair> OpCode_Compare::flat_compares() const
} else if (compare_type == CharacterCompareType::Reference) {
auto ref = m_bytecode->at(offset++);
result.append({ compare_type, ref });
} else if (compare_type == CharacterCompareType::NamedReference) {
auto ref = m_bytecode->at(offset++);
result.append({ compare_type, ref });
} else if (compare_type == CharacterCompareType::String) {
auto& length = m_bytecode->at(offset++);
for (size_t k = 0; k < length; ++k)
@ -1028,6 +1093,24 @@ Vector<ByteString> OpCode_Compare::variable_arguments_to_byte_string(Optional<Ma
result.empend(ByteString::formatted(" (invalid index {}, max={})", input->match_index, state().capture_group_matches_size() - 1));
}
}
} else if (compare_type == CharacterCompareType::NamedReference) {
// Textual description of a NamedReference compare: always emits the raw operand, and,
// when a match input is supplied, the columns and contents of the group at that index.
auto ref = m_bytecode->at(offset++);
result.empend(ByteString::formatted(" named_number={}", ref));
if (input.has_value()) {
if (state().capture_group_matches_size() > input->match_index) {
auto match = state().capture_group_matches(input->match_index);
// NOTE(review): execute() uses this operand minus one as the group index and then
// resolves the same-named group with a non-null view; here `ref` indexes the groups
// unadjusted — confirm whether the displayed group is the one actually compared.
if (match.size() > ref) {
auto& group = match[ref];
result.empend(ByteString::formatted(" left={}", group.left_column));
result.empend(ByteString::formatted(" right={}", group.left_column + group.view.length_in_code_units()));
result.empend(ByteString::formatted(" contents='{}'", group.view));
} else {
// NOTE(review): if match.size() is an unsigned zero, `size() - 1` wraps — TODO confirm.
result.empend(ByteString::formatted(" (invalid ref {}, max={})", ref, match.size() - 1));
}
} else {
// NOTE(review): same possible wrap-around when no capture group matches are stored.
result.empend(ByteString::formatted(" (invalid index {}, max={})", input->match_index, state().capture_group_matches_size() - 1));
}
}
} else if (compare_type == CharacterCompareType::String) {
auto& length = m_bytecode->at(offset++);
StringBuilder str_builder;

View File

@ -69,6 +69,7 @@ enum class OpCodeId : ByteCodeValueType {
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
@ -261,6 +262,11 @@ public:
FlyString get_string(size_t index) const { return m_string_table.get(index); }
auto const& string_table() const { return m_string_table; }
// Returns the result of looking up `group_index` in m_group_name_mappings.
// insert_bytecode_group_capture_right() fills that map with
// (capture_groups_count - 1) -> string-table index of the group's name, so a hit can be
// passed to get_string(). Presumably an empty Optional is returned for a group with no
// recorded name — callers check has_value() before using the result.
Optional<size_t> get_group_name_index(size_t group_index) const
{
return m_group_name_mappings.get(group_index);
}
void last_chunk() const = delete;
void first_chunk() const = delete;
@ -279,6 +285,10 @@ public:
m_string_table.m_table.set(entry.key, entry.value);
}
m_string_table.m_inverse_table.update(other.m_string_table.m_inverse_table);
for (auto const& mapping : other.m_group_name_mappings) {
m_group_name_mappings.set(mapping.key, mapping.value);
}
}
}
@ -326,8 +336,11 @@ public:
// Appends a SaveRightNamedCaptureGroup instruction for a named capture group and records
// which string-table entry holds the group's name.
//
// capture_groups_count: appended as the instruction's last value; `capture_groups_count - 1`
//                       is the key under which the name is stored in m_group_name_mappings.
// name:                 the group's name; it is moved into m_string_table.
void insert_bytecode_group_capture_right(size_t capture_groups_count, FlyString name)
{
    empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup));

    // Intern the name exactly once. `name` is moved from by this call, so it must not be
    // handed to the string table a second time, and the resulting index must be appended
    // only once or the instruction stream gains a stray value.
    auto name_string_index = m_string_table.set(move(name));
    empend(name_string_index);
    empend(capture_groups_count);

    // Remember group -> name so lookups through get_group_name_index() can find it later.
    m_group_name_mappings.set(capture_groups_count - 1, name_string_index);
}
enum class LookAroundType {
@ -618,6 +631,7 @@ private:
static bool s_opcodes_initialized;
static size_t s_next_checkpoint_serial_id;
StringTable m_string_table;
HashMap<size_t, size_t> m_group_name_mappings;
};
#define ENUMERATE_EXECUTION_RESULTS \

View File

@ -131,6 +131,7 @@ static bool interpret_compares(Vector<CompareTypeAndValuePair> const& lhs, Stati
// We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
return false;
case CharacterCompareType::Reference:
case CharacterCompareType::NamedReference:
// We've handled this before coming here.
break;
case CharacterCompareType::Property:
@ -512,6 +513,7 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
// We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
return true;
case CharacterCompareType::Reference:
case CharacterCompareType::NamedReference:
// We've handled this before coming here.
break;
case CharacterCompareType::Property:
@ -755,7 +757,7 @@ static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_preconditi
break;
if (any_of(compares, [&](auto& compare) {
return compare.type == CharacterCompareType::AnyChar || compare.type == CharacterCompareType::Reference;
return compare.type == CharacterCompareType::AnyChar || compare.type == CharacterCompareType::Reference || compare.type == CharacterCompareType::NamedReference;
}))
return AtomicRewritePreconditionResult::NotSatisfied;
@ -1835,6 +1837,7 @@ static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCod
case CharacterCompareType::And:
return LookupTableInsertionOutcome::FlushOnInsertion;
case CharacterCompareType::Reference:
case CharacterCompareType::NamedReference:
case CharacterCompareType::Property:
case CharacterCompareType::GeneralCategory:
case CharacterCompareType::Script:

View File

@ -496,7 +496,6 @@ bool PosixBasicParser::parse_nonduplicating_re(ByteCode& bytecode, size_t& match
if (try_skip({ backref_name, 2 })) {
if (!m_capture_group_seen[i - 1])
return set_error(Error::InvalidNumber);
match_length_minimum += m_capture_group_minimum_lengths[i - 1];
bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)i } });
return true;
}
@ -1656,8 +1655,7 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
return false;
}
match_length_minimum += maybe_length.value();
stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)group_index } });
stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, (ByteCodeValueType)group_index } });
return true;
}

View File

@ -1379,3 +1379,93 @@ TEST_CASE(account_for_opcode_size_calculating_incoming_jump_edges)
EXPECT_EQ(result.matches.first().view.to_byte_string(), "aa"sv);
}
}
TEST_CASE(backreference_to_undefined_capture_groups)
{
    // Duplicate named groups in alternatives: \k<x> follows the group that took part
    // in the match (here the second alternative).
    {
        Regex<ECMA262> pattern("(?:(?<x>a)|(?<x>b))\\k<x>"sv);
        auto outcome = pattern.match("bb"sv);
        EXPECT_EQ(outcome.success, true);
        EXPECT_EQ(outcome.matches.size(), 1u);
        EXPECT_EQ(outcome.matches.first().view.to_byte_string(), "bb"sv);

        auto& groups = outcome.capture_group_matches.first();
        EXPECT_EQ(groups.size(), 2u);
        EXPECT(groups[0].view.is_null());
        EXPECT_EQ(groups[1].view.to_byte_string(), "b"sv);
    }

    // Same duplicate-named pattern wrapped in a {2} quantifier.
    {
        Regex<ECMA262> pattern("(?:(?:(?<x>a)|(?<x>b))\\k<x>){2}"sv);
        auto outcome = pattern.match("aabb"sv);
        EXPECT_EQ(outcome.success, true);
        EXPECT_EQ(outcome.matches.size(), 1u);
        EXPECT_EQ(outcome.matches.first().view.to_byte_string(), "aabb"sv);

        auto& groups = outcome.capture_group_matches.first();
        EXPECT_EQ(groups.size(), 2u);
        EXPECT(groups[0].view.is_null());
        EXPECT_EQ(groups[1].view.to_byte_string(), "b"sv);
    }

    // The first alternative must work as well.
    {
        Regex<ECMA262> pattern("(?:(?<x>a)|(?<x>b))\\k<x>"sv);
        auto outcome = pattern.match("aa"sv);
        EXPECT_EQ(outcome.success, true);
        EXPECT_EQ(outcome.matches.size(), 1u);
        EXPECT_EQ(outcome.matches.first().view.to_byte_string(), "aa"sv);

        auto& groups = outcome.capture_group_matches.first();
        EXPECT_EQ(groups.size(), 2u);
        EXPECT_EQ(groups[0].view.to_byte_string(), "a"sv);
        EXPECT(groups[1].view.is_null());
    }

    // Numbered backreference to a group that ends up undefined.
    {
        Regex<ECMA262> pattern("(.*?)a(?!(a+)b\\2c)\\2(.*)"sv);
        auto outcome = pattern.match("baaabaac"sv);
        EXPECT_EQ(outcome.success, true);
        EXPECT_EQ(outcome.matches.size(), 1u);
        EXPECT_EQ(outcome.matches.first().view.to_byte_string(), "baaabaac"sv);

        auto& groups = outcome.capture_group_matches.first();
        EXPECT_EQ(groups.size(), 3u);
        EXPECT_EQ(groups[0].view.to_byte_string(), "ba"sv);
        EXPECT(groups[1].view.is_null());
        EXPECT_EQ(groups[2].view.to_byte_string(), "abaac"sv);
    }

    // The third alternative matches, so the named backreference is undefined.
    {
        Regex<ECMA262> pattern("^(?:(?<a>x)|(?<a>y)|z)\\k<a>$"sv);
        auto outcome = pattern.match("z"sv);
        EXPECT_EQ(outcome.success, true);
        EXPECT_EQ(outcome.matches.size(), 1u);
        EXPECT_EQ(outcome.matches.first().view.to_byte_string(), "z"sv);

        auto& groups = outcome.capture_group_matches.first();
        EXPECT_EQ(groups.size(), 2u);
        EXPECT(groups[0].view.is_null());
        EXPECT(groups[1].view.is_null());
    }

    // Quantified version of the pattern above: the whole subject matches and both
    // named groups end up with null views.
    {
        Regex<ECMA262> pattern("^(?:(?<a>x)|(?<a>y)|z){2}\\k<a>$"sv);
        auto expect_match_with_null_groups = [&](auto subject) {
            auto outcome = pattern.match(subject);
            EXPECT_EQ(outcome.success, true);
            EXPECT_EQ(outcome.matches.size(), 1u);
            EXPECT_EQ(outcome.matches.first().view.to_byte_string(), subject);

            auto& groups = outcome.capture_group_matches.first();
            EXPECT_EQ(groups.size(), 2u);
            EXPECT(groups[0].view.is_null());
            EXPECT(groups[1].view.is_null());
        };

        expect_match_with_null_groups("xz"sv);
        expect_match_with_null_groups("yz"sv);
    }
}