mirror of
https://github.com/zebrajr/ladybird.git
synced 2025-12-06 00:19:53 +01:00
LibRegex: Add support for forward references to named capture groups
This commit implements support for forward references to named capture groups. We now allow patterns like \k<name>(?<name>x) and self-references like (?<name>\k<name>x).
This commit is contained in:
parent
25a47ceb1b
commit
4b989b8efd
|
|
@ -284,3 +284,47 @@ test("complex named groups ordering", () => {
|
|||
expect(result3.groups.first).toBe(undefined);
|
||||
expect(result3.groups.second).toBe(undefined);
|
||||
});
|
||||
|
||||
test("forward references to named groups", () => {
    // \k<a> used inside the very group it names. The expectations imply the
    // reference contributes nothing to the match: group "a" ends up as just "b".
    const selfReference = /(?<a>\k<a>\w)../.exec("bab");
    expect(selfReference).not.toBe(null);
    expect(selfReference[0]).toBe("bab");
    expect(selfReference[1]).toBe("b");
    expect(selfReference.groups.a).toBe("b");

    // \k<a> appears both before and after the definition of group "a".
    const referenceBeforeGroup = /\k<a>(?<a>b)\w\k<a>/.exec("bab");
    expect(referenceBeforeGroup).not.toBe(null);
    expect(referenceBeforeGroup[0]).toBe("bab");
    expect(referenceBeforeGroup[1]).toBe("b");
    expect(referenceBeforeGroup.groups.a).toBe("b");

    // One forward reference (\k<a>) combined with one backward reference (\k<b>).
    const forwardAndBackward = /(?<b>b)\k<a>(?<a>a)\k<b>/.exec("bab");
    expect(forwardAndBackward).not.toBe(null);
    expect(forwardAndBackward[0]).toBe("bab");
    expect(forwardAndBackward[1]).toBe("b");
    expect(forwardAndBackward[2]).toBe("a");
    expect(forwardAndBackward.groups.a).toBe("a");
    expect(forwardAndBackward.groups.b).toBe("b");

    // Plain backward reference: the group is defined before it is referenced.
    const backwardOnly = /(?<a>a)(?<b>b)\k<a>/.exec("aba");
    expect(backwardOnly).not.toBe(null);
    expect(backwardOnly[0]).toBe("aba");
    expect(backwardOnly.groups.a).toBe("a");
    expect(backwardOnly.groups.b).toBe("b");

    // Backward reference in the first alternative; group "c" lives in the
    // alternative that does not participate, so it must stay undefined.
    const withAlternation = /(?<a>a)(?<b>b)\k<a>|(?<c>c)/.exec("aba");
    expect(withAlternation).not.toBe(null);
    expect(withAlternation.groups.a).toBe("a");
    expect(withAlternation.groups.b).toBe("b");
    expect(withAlternation.groups.c).toBe(undefined);
});
|
||||
|
||||
test("invalid named group references", () => {
    // \k<nonexistent> names a group that is never defined anywhere in the
    // pattern, so constructing the RegExp is expected to throw.
    const compileWithUnknownGroupName = () => {
        new RegExp("(?<a>x)\\k<nonexistent>");
    };

    expect(compileWithUnknownGroupName).toThrow();
});
|
||||
|
|
|
|||
|
|
@ -173,6 +173,7 @@ ALWAYS_INLINE void Parser::reset()
|
|||
m_parser_state.capture_groups_count = 0;
|
||||
m_parser_state.named_capture_groups_count = 0;
|
||||
m_parser_state.named_capture_groups.clear();
|
||||
m_parser_state.unresolved_named_references.clear();
|
||||
}
|
||||
|
||||
Parser::Result Parser::parse(Optional<AllOptions> regex_options)
|
||||
|
|
@ -182,10 +183,13 @@ Parser::Result Parser::parse(Optional<AllOptions> regex_options)
|
|||
reset();
|
||||
if (regex_options.has_value())
|
||||
m_parser_state.regex_options = regex_options.value();
|
||||
if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum))
|
||||
if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum)) {
|
||||
consume(TokenType::Eof, Error::InvalidPattern);
|
||||
else
|
||||
if (!resolve_forward_named_references())
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
} else {
|
||||
set_error(Error::InvalidPattern);
|
||||
}
|
||||
|
||||
auto capture_groups = m_parser_state.named_capture_groups.keys();
|
||||
|
||||
|
|
@ -1641,23 +1645,32 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
|||
}
|
||||
|
||||
auto it = m_parser_state.named_capture_groups.find(name);
|
||||
if (it == m_parser_state.named_capture_groups.end()) {
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
return false;
|
||||
if (it != m_parser_state.named_capture_groups.end()) {
|
||||
|
||||
// Use the first occurrence of the named group for the backreference
|
||||
// This follows ECMAScript behavior where \k<name> refers to the first
|
||||
// group with that name in left-to-right order, regardless of alternative
|
||||
auto group_index = it->value.first().group_index;
|
||||
auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(group_index);
|
||||
if (maybe_length.has_value()) {
|
||||
// Backward reference
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, static_cast<ByteCodeValueType>(group_index) } });
|
||||
} else {
|
||||
// Self-reference or forward reference
|
||||
auto placeholder_index = 0;
|
||||
auto bytecode_offset = stack.size();
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, static_cast<ByteCodeValueType>(placeholder_index) } });
|
||||
|
||||
m_parser_state.unresolved_named_references.append({ name, bytecode_offset + 1 });
|
||||
}
|
||||
} else {
|
||||
// Forward reference
|
||||
auto placeholder_index = 0;
|
||||
auto bytecode_offset = stack.size();
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, static_cast<ByteCodeValueType>(placeholder_index) } });
|
||||
|
||||
m_parser_state.unresolved_named_references.append({ name, bytecode_offset + 1 });
|
||||
}
|
||||
|
||||
// Use the first occurrence of the named group for the backreference
|
||||
// This follows ECMAScript behavior where \k<name> refers to the first
|
||||
// group with that name in left-to-right order, regardless of alternative
|
||||
auto group_index = it->value.first().group_index;
|
||||
|
||||
auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(group_index);
|
||||
if (!maybe_length.has_value()) {
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
return false;
|
||||
}
|
||||
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, (ByteCodeValueType)group_index } });
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -2817,4 +2830,20 @@ size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis()
|
|||
return count;
|
||||
}
|
||||
|
||||
bool Parser::resolve_forward_named_references()
|
||||
{
|
||||
for (auto const& unresolved_ref : m_parser_state.unresolved_named_references) {
|
||||
auto it = m_parser_state.named_capture_groups.find(unresolved_ref.name);
|
||||
if (it == m_parser_state.named_capture_groups.end()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto group_index = it->value.first().group_index;
|
||||
|
||||
m_parser_state.bytecode.at(unresolved_ref.bytecode_offset) = (ByteCodeValueType)group_index;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -90,6 +90,7 @@ public:
|
|||
|
||||
protected:
|
||||
virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
|
||||
bool resolve_forward_named_references();
|
||||
|
||||
ALWAYS_INLINE bool match(TokenType type) const;
|
||||
ALWAYS_INLINE bool match(char ch) const;
|
||||
|
|
@ -122,6 +123,12 @@ protected:
|
|||
HashMap<size_t, size_t> capture_group_minimum_lengths;
|
||||
OrderedHashMap<FlyString, Vector<NamedCaptureGroup>> named_capture_groups;
|
||||
|
||||
// A named reference that was emitted with a placeholder group index and still
// has to be patched once the named capture groups of the pattern are known.
struct UnresolvedNamedReference {
|
||||
// Name of the capture group the reference refers to.
FlyString name;
|
||||
// Index of the bytecode slot that receives the resolved group index.
size_t bytecode_offset;
|
||||
};
|
||||
Vector<UnresolvedNamedReference> unresolved_named_references;
|
||||
|
||||
explicit ParserState(Lexer& lexer)
|
||||
: lexer(lexer)
|
||||
, current_token(lexer.next())
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user