LibRegex: Add support for forward references to named capture groups

This commit implements support for forward references to named capture
groups. We now allow patterns like \k<name>(?<name>x) and
self-references like (?<name>\k<name>x).
This commit is contained in:
aplefull 2025-07-24 14:04:04 +02:00 committed by Ali Mohammad Pur
parent 25a47ceb1b
commit 4b989b8efd
3 changed files with 98 additions and 18 deletions

View File

@ -284,3 +284,47 @@ test("complex named groups ordering", () => {
expect(result3.groups.first).toBe(undefined);
expect(result3.groups.second).toBe(undefined);
});
test("forward references to named groups", () => {
    // \k<a> appears inside the very group it names.
    const selfReference = /(?<a>\k<a>\w)../.exec("bab");
    expect(selfReference).not.toBe(null);
    expect(selfReference[0]).toBe("bab");
    expect(selfReference[1]).toBe("b");
    expect(selfReference.groups.a).toBe("b");

    // \k<a> appears to the left of the group it names, and once more after it.
    const referenceBeforeGroup = /\k<a>(?<a>b)\w\k<a>/.exec("bab");
    expect(referenceBeforeGroup).not.toBe(null);
    expect(referenceBeforeGroup[0]).toBe("bab");
    expect(referenceBeforeGroup[1]).toBe("b");
    expect(referenceBeforeGroup.groups.a).toBe("b");

    // One forward reference (\k<a>) and one backward reference (\k<b>) in the same pattern.
    const forwardAndBackward = /(?<b>b)\k<a>(?<a>a)\k<b>/.exec("bab");
    expect(forwardAndBackward).not.toBe(null);
    expect(forwardAndBackward[0]).toBe("bab");
    expect(forwardAndBackward[1]).toBe("b");
    expect(forwardAndBackward[2]).toBe("a");
    expect(forwardAndBackward.groups.a).toBe("a");
    expect(forwardAndBackward.groups.b).toBe("b");

    // Plain backward reference: the group is defined before it is referenced.
    const backwardOnly = /(?<a>a)(?<b>b)\k<a>/.exec("aba");
    expect(backwardOnly).not.toBe(null);
    expect(backwardOnly[0]).toBe("aba");
    expect(backwardOnly.groups.a).toBe("a");
    expect(backwardOnly.groups.b).toBe("b");

    // Backward reference in the first alternative; the group in the second alternative stays unset.
    const withAlternation = /(?<a>a)(?<b>b)\k<a>|(?<c>c)/.exec("aba");
    expect(withAlternation).not.toBe(null);
    expect(withAlternation.groups.a).toBe("a");
    expect(withAlternation.groups.b).toBe("b");
    expect(withAlternation.groups.c).toBe(undefined);
});
test("invalid named group references", () => {
    // The pattern references a group name that is never defined anywhere in it,
    // so constructing the RegExp is expected to throw.
    const buildWithDanglingReference = () => new RegExp("(?<a>x)\\k<nonexistent>");
    expect(buildWithDanglingReference).toThrow();
});

View File

@ -173,6 +173,7 @@ ALWAYS_INLINE void Parser::reset()
m_parser_state.capture_groups_count = 0;
m_parser_state.named_capture_groups_count = 0;
m_parser_state.named_capture_groups.clear();
m_parser_state.unresolved_named_references.clear();
}
Parser::Result Parser::parse(Optional<AllOptions> regex_options)
@ -182,10 +183,13 @@ Parser::Result Parser::parse(Optional<AllOptions> regex_options)
reset();
if (regex_options.has_value())
m_parser_state.regex_options = regex_options.value();
if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum))
if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum)) {
consume(TokenType::Eof, Error::InvalidPattern);
else
if (!resolve_forward_named_references())
set_error(Error::InvalidNameForCaptureGroup);
} else {
set_error(Error::InvalidPattern);
}
auto capture_groups = m_parser_state.named_capture_groups.keys();
@ -1641,23 +1645,32 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
}
auto it = m_parser_state.named_capture_groups.find(name);
if (it == m_parser_state.named_capture_groups.end()) {
set_error(Error::InvalidNameForCaptureGroup);
return false;
if (it != m_parser_state.named_capture_groups.end()) {
// Use the first occurrence of the named group for the backreference
// This follows ECMAScript behavior where \k<name> refers to the first
// group with that name in left-to-right order, regardless of alternative
auto group_index = it->value.first().group_index;
auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(group_index);
if (maybe_length.has_value()) {
// Backward reference
stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, static_cast<ByteCodeValueType>(group_index) } });
} else {
// Self-reference or forward reference
auto placeholder_index = 0;
auto bytecode_offset = stack.size();
stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, static_cast<ByteCodeValueType>(placeholder_index) } });
m_parser_state.unresolved_named_references.append({ name, bytecode_offset + 1 });
}
} else {
// Forward reference
auto placeholder_index = 0;
auto bytecode_offset = stack.size();
stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, static_cast<ByteCodeValueType>(placeholder_index) } });
m_parser_state.unresolved_named_references.append({ name, bytecode_offset + 1 });
}
// Use the first occurrence of the named group for the backreference
// This follows ECMAScript behavior where \k<name> refers to the first
// group with that name in left-to-right order, regardless of alternative
auto group_index = it->value.first().group_index;
auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(group_index);
if (!maybe_length.has_value()) {
set_error(Error::InvalidNameForCaptureGroup);
return false;
}
stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, (ByteCodeValueType)group_index } });
return true;
}
@ -2817,4 +2830,20 @@ size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis()
return count;
}
bool Parser::resolve_forward_named_references()
{
for (auto const& unresolved_ref : m_parser_state.unresolved_named_references) {
auto it = m_parser_state.named_capture_groups.find(unresolved_ref.name);
if (it == m_parser_state.named_capture_groups.end()) {
return false;
}
auto group_index = it->value.first().group_index;
m_parser_state.bytecode.at(unresolved_ref.bytecode_offset) = (ByteCodeValueType)group_index;
}
return true;
}
}

View File

@ -90,6 +90,7 @@ public:
protected:
virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
bool resolve_forward_named_references();
ALWAYS_INLINE bool match(TokenType type) const;
ALWAYS_INLINE bool match(char ch) const;
@ -122,6 +123,12 @@ protected:
HashMap<size_t, size_t> capture_group_minimum_lengths;
OrderedHashMap<FlyString, Vector<NamedCaptureGroup>> named_capture_groups;
// Bookkeeping entry for a \k<name> reference that was emitted with a
// placeholder group index because its target could not be resolved while the
// reference itself was being parsed. Entries are consumed by
// resolve_forward_named_references() after parsing.
struct UnresolvedNamedReference {
// Name of the capture group the reference points at.
FlyString name;
// Index of the bytecode slot that resolve_forward_named_references()
// overwrites with the resolved group index.
size_t bytecode_offset;
};
Vector<UnresolvedNamedReference> unresolved_named_references;
explicit ParserState(Lexer& lexer)
: lexer(lexer)
, current_token(lexer.next())