AK: Add a couple of Utf16String factories

* Utf16String::from_utf8_with_replacement_character
* Utf16String::from_code_point
This commit is contained in:
Timothy Flynn 2025-07-24 11:39:15 -04:00 committed by Jelle Raaijmakers
parent b4435bd50c
commit f53389bab1
3 changed files with 87 additions and 0 deletions

View File

@ -13,6 +13,28 @@ namespace AK {
// Compile-time guard: a Detail::ShortString must occupy exactly as many bytes as a
// Detail::Utf16StringData pointer.
// NOTE(review): presumably the two representations share the same storage slot in
// Utf16String — confirm against the class definition.
static_assert(sizeof(Detail::ShortString) == sizeof(Detail::Utf16StringData*));
// Lossy UTF-8 -> Utf16String conversion.
//
// utf8_string:       the bytes to convert; they are not required to be well-formed UTF-8.
// with_bom_handling: when WithBOMHandling::Yes, a leading EF BB BF sequence is removed
//                    from the input before any decoding takes place.
//
// Input that passes Utf8View::validate(AllowLonelySurrogates::No) is converted directly.
// Otherwise the input is walked code point by code point through a Utf8View, and every
// surrogate code point it yields is replaced with UnicodeUtils::REPLACEMENT_CODE_POINT.
Utf16String Utf16String::from_utf8_with_replacement_character(StringView utf8_string, WithBOMHandling with_bom_handling)
{
    if (with_bom_handling == WithBOMHandling::Yes) {
        auto input_bytes = utf8_string.bytes();

        // Skip the three-byte UTF-8 byte order mark so it never reaches the output.
        if (input_bytes.starts_with({ { 0xEF, 0xBB, 0xBF } }))
            utf8_string = utf8_string.substring_view(3);
    }

    Utf8View decoded_view { utf8_string };

    if (!decoded_view.validate(AllowLonelySurrogates::No)) {
        // Slow path: rebuild the string one code point at a time, substituting the
        // replacement code point wherever a surrogate shows up.
        StringBuilder utf16_builder(StringBuilder::Mode::UTF16);

        for (auto code_point : decoded_view) {
            if (is_unicode_surrogate(code_point))
                code_point = UnicodeUtils::REPLACEMENT_CODE_POINT;

            utf16_builder.append_code_point(code_point);
        }

        return utf16_builder.to_utf16_string_without_validation();
    }

    // Fast path: the input already validated, so no substitution is needed.
    return Utf16String::from_utf8_without_validation(utf8_string);
}
Utf16String Utf16String::from_utf8_without_validation(StringView utf8_string)
{
if (utf8_string.length() <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf8_string.is_ascii()) {

View File

@ -45,6 +45,12 @@ public:
return from_utf8_without_validation(utf8_string);
}
// Selects whether from_utf8_with_replacement_character() removes a leading UTF-8
// byte order mark (bytes EF BB BF) from its input.
enum class WithBOMHandling {
// Leave the input untouched; a leading BOM is decoded like any other code point.
No,
// Drop a leading BOM before decoding.
Yes,
};
// Creates a Utf16String from UTF-8 input that is not required to be valid; input that
// fails validation is converted with replacement characters substituted rather than
// being rejected. BOM handling defaults to WithBOMHandling::Yes.
static Utf16String from_utf8_with_replacement_character(StringView, WithBOMHandling = WithBOMHandling::Yes);
ALWAYS_INLINE static ErrorOr<Utf16String> try_from_utf8(StringView utf8_string)
{
if (!Utf8View { utf8_string }.validate())
@ -81,6 +87,18 @@ public:
requires(IsOneOf<RemoveCVReference<T>, Utf16String, Utf16FlyString>)
static Utf16String from_utf16_without_validation(T&&) = delete;
// Creates a Utf16String holding exactly the given code point.
//
// The code point is handed to UnicodeUtils::code_point_to_utf16(), which reports each
// resulting UTF-16 code unit through a callback. The units are collected in a small
// fixed-size buffer and the string is built from however many were reported.
ALWAYS_INLINE static Utf16String from_code_point(u32 code_point)
{
    // Room for two code units, the size of a UTF-16 surrogate pair.
    Array<char16_t, 2> buffer;
    size_t units_written = 0;

    auto store_code_unit = [&](auto code_unit) {
        buffer[units_written] = code_unit;
        ++units_written;
    };

    // The return value is deliberately ignored; the callback keeps its own count.
    (void)UnicodeUtils::code_point_to_utf16(code_point, store_code_unit);

    return from_utf16_without_validation({ buffer.data(), units_written });
}
template<typename... Parameters>
ALWAYS_INLINE static Utf16String formatted(CheckedFormatString<Parameters...>&& format, Parameters const&... parameters)
{

View File

@ -96,6 +96,27 @@ TEST_CASE(from_utf8)
}
}
// Exercises Utf16String::from_utf8_with_replacement_character() with ill-formed input,
// well-formed input, empty input, both BOM handling modes, and an encoded surrogate.
TEST_CASE(from_utf8_with_replacement_character)
{
    {
        // F4 8F BF C0 is ill-formed (the last byte is one past the encoding of U+10FFFF);
        // each of the four bytes is expected to come out as U+FFFD.
        auto ill_formed = Utf16String::from_utf8_with_replacement_character("long string \xf4\x8f\xbf\xc0"sv, Utf16String::WithBOMHandling::No);
        EXPECT_EQ(ill_formed, u"long string \ufffd\ufffd\ufffd\ufffd"sv);
    }
    {
        // Well-formed ASCII passes through unchanged.
        auto well_formed = Utf16String::from_utf8_with_replacement_character("A valid string!"sv, Utf16String::WithBOMHandling::No);
        EXPECT_EQ(well_formed, "A valid string!"sv);
    }
    {
        // Empty input yields an empty string.
        auto empty = Utf16String::from_utf8_with_replacement_character(""sv, Utf16String::WithBOMHandling::No);
        EXPECT_EQ(empty, ""sv);
    }
    {
        // With BOM handling enabled, the leading EF BB BF is removed.
        auto bom_stripped = Utf16String::from_utf8_with_replacement_character("\xEF\xBB\xBFWHF!"sv, Utf16String::WithBOMHandling::Yes);
        EXPECT_EQ(bom_stripped, "WHF!"sv);
    }
    {
        // With BOM handling disabled, the BOM is decoded as U+FEFF and kept.
        auto bom_kept = Utf16String::from_utf8_with_replacement_character("\xEF\xBB\xBFWHF!"sv, Utf16String::WithBOMHandling::No);
        EXPECT_EQ(bom_kept, u"\ufeffWHF!"sv);
    }
    {
        // ED A0 80 is the UTF-8 form of the surrogate U+D800; it must be replaced.
        // This call relies on the default BOM handling argument.
        auto surrogate_replaced = Utf16String::from_utf8_with_replacement_character("\xED\xA0\x80WHF!"sv);
        EXPECT_EQ(surrogate_replaced, u"\ufffdWHF!"sv);
    }
}
TEST_CASE(from_utf16)
{
{
@ -235,6 +256,32 @@ TEST_CASE(from_utf32)
}
}
// Checks Utf16String::from_code_point() for every code point below the first
// supplementary plane, and for the first 10'000 code points at and above it.
TEST_CASE(from_code_point)
{
    auto const supplementary_begin = AK::UnicodeUtils::FIRST_SUPPLEMENTARY_PLANE_CODE_POINT;
    auto const supplementary_end = supplementary_begin + 10'000;

    // Below the supplementary planes: one code unit whose value equals the code point.
    for (u32 code_point = 0; code_point < supplementary_begin; ++code_point) {
        auto single_unit_string = Utf16String::from_code_point(code_point);

        EXPECT_EQ(single_unit_string.length_in_code_units(), 1uz);
        EXPECT_EQ(single_unit_string.length_in_code_points(), 1uz);

        EXPECT_EQ(single_unit_string.code_point_at(0), code_point);
        EXPECT_EQ(single_unit_string.code_unit_at(0), code_point);
    }

    // Supplementary planes: two code units that still decode to a single code point.
    for (u32 code_point = supplementary_begin; code_point < supplementary_end; ++code_point) {
        auto two_unit_string = Utf16String::from_code_point(code_point);

        EXPECT_EQ(two_unit_string.length_in_code_units(), 2uz);
        EXPECT_EQ(two_unit_string.length_in_code_points(), 1uz);
        EXPECT_EQ(two_unit_string.code_point_at(0), code_point);

        // Compare each stored code unit with what code_point_to_utf16() reports.
        size_t code_unit_index = 0;
        (void)AK::UnicodeUtils::code_point_to_utf16(code_point, [&](auto expected_code_unit) {
            EXPECT_EQ(two_unit_string.code_unit_at(code_unit_index++), expected_code_unit);
        });

        EXPECT_EQ(code_unit_index, 2uz);
    }
}
TEST_CASE(formatted)
{
{