mirror of
https://github.com/zebrajr/ladybird.git
synced 2025-12-06 00:19:53 +01:00
AK: Add a couple of Utf16String factories
* Utf16String::from_utf8_with_replacement_character
* Utf16String::from_code_point
This commit is contained in:
parent
b4435bd50c
commit
f53389bab1
|
|
@ -13,6 +13,28 @@ namespace AK {
|
|||
|
||||
static_assert(sizeof(Detail::ShortString) == sizeof(Detail::Utf16StringData*));
|
||||
|
||||
// Builds a Utf16String from UTF-8 input that is not required to be well-formed.
//
// When with_bom_handling is WithBOMHandling::Yes and the input begins with the bytes EF BB BF
// (the UTF-8 encoding of the byte order mark), those three bytes are dropped before conversion.
//
// Input that validates with lonely surrogates disallowed is handed directly to
// from_utf8_without_validation(). Otherwise the input is walked code point by code point and every
// surrogate code point is replaced with UnicodeUtils::REPLACEMENT_CODE_POINT.
Utf16String Utf16String::from_utf8_with_replacement_character(StringView utf8_string, WithBOMHandling with_bom_handling)
{
    auto const should_strip_bom = with_bom_handling == WithBOMHandling::Yes;
    if (should_strip_bom && utf8_string.bytes().starts_with({ { 0xEF, 0xBB, 0xBF } }))
        utf8_string = utf8_string.substring_view(3);

    Utf8View utf8_view { utf8_string };

    // Fast path: nothing needs replacing, so the bytes can be converted as-is.
    auto const is_well_formed = utf8_view.validate(AllowLonelySurrogates::No);
    if (is_well_formed)
        return Utf16String::from_utf8_without_validation(utf8_string);

    // Slow path: re-encode one code point at a time.
    // NOTE(review): only surrogates are substituted here; other malformed bytes are presumably
    // already surfaced as U+FFFD by Utf8View iteration (the accompanying test expects that).
    StringBuilder builder(StringBuilder::Mode::UTF16);

    for (auto code_point : utf8_view) {
        auto const sanitized = is_unicode_surrogate(code_point)
            ? UnicodeUtils::REPLACEMENT_CODE_POINT
            : code_point;
        builder.append_code_point(sanitized);
    }

    return builder.to_utf16_string_without_validation();
}
|
||||
|
||||
Utf16String Utf16String::from_utf8_without_validation(StringView utf8_string)
|
||||
{
|
||||
if (utf8_string.length() <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf8_string.is_ascii()) {
|
||||
|
|
|
|||
|
|
@ -45,6 +45,12 @@ public:
|
|||
return from_utf8_without_validation(utf8_string);
|
||||
}
|
||||
|
||||
enum class WithBOMHandling {
|
||||
No,
|
||||
Yes,
|
||||
};
|
||||
static Utf16String from_utf8_with_replacement_character(StringView, WithBOMHandling = WithBOMHandling::Yes);
|
||||
|
||||
ALWAYS_INLINE static ErrorOr<Utf16String> try_from_utf8(StringView utf8_string)
|
||||
{
|
||||
if (!Utf8View { utf8_string }.validate())
|
||||
|
|
@ -81,6 +87,18 @@ public:
|
|||
requires(IsOneOf<RemoveCVReference<T>, Utf16String, Utf16FlyString>)
|
||||
static Utf16String from_utf16_without_validation(T&&) = delete;
|
||||
|
||||
// Creates a string holding the UTF-16 encoding of a single code point.
// The code units are passed to from_utf16_without_validation(), so no validation is performed here.
ALWAYS_INLINE static Utf16String from_code_point(u32 code_point)
{
    // Two code units is the most the buffer can hold (one surrogate pair).
    Array<char16_t, 2> buffer;
    size_t used = 0;

    auto store_code_unit = [&](auto code_unit) {
        buffer[used] = code_unit;
        ++used;
    };

    // The return value is intentionally discarded; the callback tracks how many units were written.
    (void)UnicodeUtils::code_point_to_utf16(code_point, store_code_unit);

    return from_utf16_without_validation({ buffer.data(), used });
}
|
||||
|
||||
template<typename... Parameters>
|
||||
ALWAYS_INLINE static Utf16String formatted(CheckedFormatString<Parameters...>&& format, Parameters const&... parameters)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -96,6 +96,27 @@ TEST_CASE(from_utf8)
|
|||
}
|
||||
}
|
||||
|
||||
// Exercises Utf16String::from_utf8_with_replacement_character() with malformed input, valid input,
// empty input, both BOM handling modes, and an encoded surrogate.
TEST_CASE(from_utf8_with_replacement_character)
{
    using BOMHandling = Utf16String::WithBOMHandling;

    // Malformed trailing sequence F4 8F BF C0 (0xC0 is not a continuation byte):
    // each of the four bytes is expected to come out as U+FFFD.
    {
        auto malformed = Utf16String::from_utf8_with_replacement_character("long string \xf4\x8f\xbf\xc0"sv, BOMHandling::No);
        EXPECT_EQ(malformed, u"long string \ufffd\ufffd\ufffd\ufffd"sv);
    }

    // Valid input is passed through unchanged.
    {
        auto valid = Utf16String::from_utf8_with_replacement_character("A valid string!"sv, BOMHandling::No);
        EXPECT_EQ(valid, "A valid string!"sv);
    }

    // Empty input yields an empty string.
    {
        auto empty = Utf16String::from_utf8_with_replacement_character(""sv, BOMHandling::No);
        EXPECT_EQ(empty, ""sv);
    }

    // A leading BOM is stripped when BOM handling is requested...
    {
        auto bom_stripped = Utf16String::from_utf8_with_replacement_character("\xEF\xBB\xBFWHF!"sv, BOMHandling::Yes);
        EXPECT_EQ(bom_stripped, "WHF!"sv);
    }

    // ...and preserved as U+FEFF when it is not.
    {
        auto bom_kept = Utf16String::from_utf8_with_replacement_character("\xEF\xBB\xBFWHF!"sv, BOMHandling::No);
        EXPECT_EQ(bom_kept, u"\ufeffWHF!"sv);
    }

    // ED A0 80 is the UTF-8 form of the surrogate U+D800; it is replaced by a single U+FFFD.
    // This call also relies on the default BOM handling argument.
    {
        auto surrogate = Utf16String::from_utf8_with_replacement_character("\xED\xA0\x80WHF!"sv);
        EXPECT_EQ(surrogate, u"\ufffdWHF!"sv);
    }
}
|
||||
|
||||
TEST_CASE(from_utf16)
|
||||
{
|
||||
{
|
||||
|
|
@ -235,6 +256,32 @@ TEST_CASE(from_utf32)
|
|||
}
|
||||
}
|
||||
|
||||
// Checks Utf16String::from_code_point() for every code point below the first supplementary plane
// (one code unit each) and for the first 10'000 supplementary code points (two code units each).
TEST_CASE(from_code_point)
{
    auto const first_supplementary = AK::UnicodeUtils::FIRST_SUPPLEMENTARY_PLANE_CODE_POINT;

    // Below the supplementary planes: the single code unit equals the code point itself.
    for (u32 code_point = 0; code_point < first_supplementary; ++code_point) {
        auto string = Utf16String::from_code_point(code_point);

        EXPECT_EQ(string.length_in_code_units(), 1uz);
        EXPECT_EQ(string.length_in_code_points(), 1uz);
        EXPECT_EQ(string.code_point_at(0), code_point);
        EXPECT_EQ(string.code_unit_at(0), code_point);
    }

    // Supplementary planes: compare each stored code unit against what code_point_to_utf16() emits.
    for (u32 code_point = first_supplementary; code_point < first_supplementary + 10'000; ++code_point) {
        auto string = Utf16String::from_code_point(code_point);

        EXPECT_EQ(string.length_in_code_units(), 2uz);
        EXPECT_EQ(string.length_in_code_points(), 1uz);
        EXPECT_EQ(string.code_point_at(0), code_point);

        size_t code_unit_index = 0;
        (void)AK::UnicodeUtils::code_point_to_utf16(code_point, [&](auto expected_code_unit) {
            EXPECT_EQ(string.code_unit_at(code_unit_index++), expected_code_unit);
        });
        EXPECT_EQ(code_unit_index, 2uz);
    }
}
|
||||
|
||||
TEST_CASE(formatted)
|
||||
{
|
||||
{
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user