44 unknown, utf8, utf16, utf32,
45 utf7, utf1, utf_ebcdic,
53 std::endian endianness;
61 constexpr inline text_encoding utf8_encoding = { text_encoding_type::utf8, std::endian::native };
62 constexpr inline text_encoding utf16_le_encoding = { text_encoding_type::utf16, std::endian::little };
63 constexpr inline text_encoding utf16_be_encoding = { text_encoding_type::utf16, std::endian::big };
64 constexpr inline text_encoding utf32_le_encoding = { text_encoding_type::utf32, std::endian::little };
65 constexpr inline text_encoding utf32_be_encoding = { text_encoding_type::utf32, std::endian::big };
72 struct text_decode_result;
79 template <
bytelike BYTE_TYPE,
size_t N>
96 template <
bytelike_range T>
107 constexpr char32_t consume_utf8(string_view8
auto& str);
119 template <
string8 RESULT = std::
string>
125 template <
string8 RESULT = std::
string,
stringable16 STR>
131 template <
string8 RESULT = std::
string,
stringable32 STR>
135 [[
nodiscard]] std::string to_string(std::wstring_view str);
147 template <
string8 RESULT = std::
string>
153 template <std::ranges::view R>
170 template <
string16 RESULT = std::w
string>
176 template <
string16 RESULT = std::w
string,
stringable8 STR>
200 template <
typename T>
205 using in_char_type = std::ranges::range_value_t<std::remove_cvref_t<T>>;
217 template <
typename T>
227 static_assert(
string8<T>,
"Unsupported character type");
231 template <
typename TO,
typename FROM>
240 template <
typename TO,
typename FROM>
251 template <
typename T>
258 template <
typename RESULT = std::
string>
272 template <
bytelike BYTE_TYPE,
size_t N>
275 static_assert(
N == std::dynamic_extent,
"consume_bom() can only be used with dynamic spans");
291 using enum std::endian;
// Byte-order-mark signatures. Every view is built with an explicit length so that
// embedded NUL bytes (as in the UTF-32 marks) are part of the signature.

// Four-byte signatures.
static constexpr auto bom_for_gb18030  = std::string_view{ "\x84\x31\x95\x33", 4 };
static constexpr auto bom_for_utf32_be = std::string_view{ "\x00\x00\xFE\xFF", 4 };
static constexpr auto bom_for_utf32_le = std::string_view{ "\xFF\xFE\x00\x00", 4 };

// Three-byte signatures.
static constexpr auto bom_for_utf8  = std::string_view{ "\xEF\xBB\xBF", 3 };
static constexpr auto bom_for_utf7  = std::string_view{ "\x2B\x2F\x76", 3 };
static constexpr auto bom_for_utf1  = std::string_view{ "\xF7\x64\x4C", 3 };
static constexpr auto bom_for_scsu  = std::string_view{ "\x0E\xFE\xFF", 3 };
static constexpr auto bom_for_bocu1 = std::string_view{ "\xFB\xEE\x28", 3 };
324 using enum std::endian;
348 const auto as_bytes = std::string_view{ (
char*)
sv.data(),
sv.size() *
sizeof(
char16_t) };
350 if (as_bytes.starts_with(
bom_for_utf16_be)) {
sv.remove_prefix(1);
return { text_encoding_type::utf16, std::endian::big }; }
351 if (as_bytes.starts_with(
bom_for_utf16_le)) {
sv.remove_prefix(1);
return { text_encoding_type::utf16, std::endian::little }; }
// UTF-32 byte order marks, i.e. U+FEFF serialized in each byte order.
// Big-endian stores the most significant byte first (00 00 FE FF);
// little-endian stores the least significant byte first (FF FE 00 00).
// Explicit lengths keep the embedded NUL bytes in the views.
static constexpr std::string_view bom_for_utf32_be{ "\x00\x00\xFE\xFF", 4 };
static constexpr std::string_view bom_for_utf32_le{ "\xFF\xFE\x00\x00", 4 };
363 const auto as_bytes = std::string_view{ (
char*)
sv.data(),
sv.size() *
sizeof(
char32_t) };
365 if (as_bytes.starts_with(
bom_for_utf32_be)) {
sv.remove_prefix(1);
return { text_encoding_type::utf32, std::endian::big }; }
366 if (as_bytes.starts_with(
bom_for_utf32_le)) {
sv.remove_prefix(1);
return { text_encoding_type::utf32, std::endian::little }; }
375 unsupported_encoding,
382 status status = unsupported_encoding;
384 char32_t point =
static_cast<char32_t>(-1);
396 basic_multilingual_plane,
397 supplementary_multilingual_plane,
398 supplementary_ideographic_plane,
399 tertiary_ideographic_plane,
400 supplementary_special_purpose_plane = 14,
404 bmp = basic_multilingual_plane,
405 smp = supplementary_multilingual_plane,
406 sip = supplementary_ideographic_plane,
407 tip = tertiary_ideographic_plane,
408 ssp = supplementary_special_purpose_plane,
409 spua_a = private_use_plane_a, pup_a = spua_a,
410 spua_b = private_use_plane_b, pup_b = spua_b,
/// Highest value a Unicode codepoint can take.
inline constexpr char32_t last_unicode_code_point{ 0x10FFFF };

/// Inclusive bounds of the high-surrogate range.
inline constexpr char32_t first_unicode_high_surrogate{ 0xD800 };
inline constexpr char32_t last_unicode_high_surrogate{ 0xDBFF };

/// Inclusive bounds of the low-surrogate range.
inline constexpr char32_t first_unicode_low_surrogate{ 0xDC00 };
inline constexpr char32_t last_unicode_low_surrogate{ 0xDFFF };
419 constexpr bool is_high_surrogate(
char32_t cp)
noexcept {
return cp >= first_unicode_high_surrogate && cp <= last_unicode_high_surrogate; }
420 constexpr bool is_low_surrogate(
char32_t cp)
noexcept {
return cp >= first_unicode_low_surrogate && cp <= last_unicode_low_surrogate; }
421 constexpr bool is_surrogate(
char32_t cp)
noexcept {
return cp >= first_unicode_high_surrogate && cp <= last_unicode_low_surrogate; }
422 constexpr bool is_unicode(
char32_t cp)
noexcept {
return cp <= last_unicode_code_point; }
428 return 0x10000 + ((
high - first_unicode_high_surrogate) << 10) + (
low - first_unicode_low_surrogate);
431 constexpr std::pair<char32_t, char32_t> codepoint_to_surrogate_pair(
char32_t cp)
noexcept
433 return { ((cp - 0x10000) >> 10) + first_unicode_high_surrogate, ((cp - 0x10000) & 0x3FF) + first_unicode_low_surrogate };
438 template <std::
integral T>
442 std::memcpy(&result, source,
sizeof(T));
444 return byteswap(result);
451 constexpr size_t lut[33] = { 7,6,6,6,6,6,5,5,5,5,5,4,4,4,4,4,3,3,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1 };
452 return lut[std::countl_zero(std::bit_cast<uint32_t>(cp))];
459 return { text_decode_result::truncated };
463 case text_encoding_type::utf8:
465 const auto first = std::bit_cast<uint8_t>(
sv[0]);
467 return { text_decode_result::valid,
first, 1 };
471 if ((
first >> 5) == 0x6) { length = 2; value =
first & 0x1F; }
472 else if ((
first >> 4) == 0xe) { length = 3; value =
first & 0xF; }
473 else if ((
first >> 3) == 0x1e) { length = 4; value =
first & 0x7; }
475 if (length == 0 ||
sv.size() < length)
476 return { text_decode_result::truncated,
first, 1 };
481 if ((
sv[3] & 0xC0) != 0x80)
482 return { text_decode_result::invalid,
first, 1 };
483 value = (value << 6) | (
sv[3] & 0x3f);
485 if ((
sv[2] & 0xC0) != 0x80)
486 return { text_decode_result::invalid,
first, 1 };
487 value = (value << 6) | (
sv[2] & 0x3f);
489 if ((
sv[1] & 0xC0) != 0x80)
490 return { text_decode_result::invalid,
first, 1 };
491 value = (value << 6) | (
sv[1] & 0x3f);
492 return { text_decode_result::valid, value, length };
495 case text_encoding_type::utf16:
498 return { text_decode_result::truncated };
506 return { text_decode_result::truncated,
first, 2 };
513 return { text_decode_result::invalid,
first, 2 };
516 return { text_decode_result::valid,
first, 2 };
518 case text_encoding_type::utf32:
521 return { text_decode_result::truncated };
525 return { text_decode_result::valid, cp, 4 };
526 return { text_decode_result::invalid, cp, 4 };
533 template <
bytelike_range T>
538 size_t valid_points = 0;
539 size_t control_points = 0;
540 size_t plain_ascii = 0;
541 size_t whitespace = 0;
542 size_t extended_codepoints = 0;
543 float one_over_points = 0.f;
544 size_t invalid_points()
const {
return points - valid_points; }
545 float score()
const {
return (2.5f * whitespace + plain_ascii - 100.f * invalid_points() - 50.f * control_points + 5.f * extended_codepoints) * one_over_points; }
553 if (
decoded.status == text_decode_result::truncated)
558 if (
decoded.status == text_decode_result::valid)
560 stats.valid_points++;
568 else if (
decoded.point ==
'\r')
571 stats.control_points++;
579 else if (
decoded.point >= 65536)
580 stats.extended_codepoints++;
583 if (
stats.points > 0)
588 auto sv = make_sv(str);
596 sv =
sv.substr(0, 4096);
603 return utf8_encoding;
606 if (
stats8.invalid_points() == 0 &&
stats8.control_points == 0)
607 return utf8_encoding;
670 [[gsl::suppress(type.1,
es.79)]]
672 [[gsl::suppress(
"type.1",
"es.79")]]
676 using char_type =
typename std::remove_cvref_t<
decltype(str)>::value_type;
679 if (str.empty())
return 0;
680 auto it = std::to_address(str.begin());
684 if (cp < 0x80) length = 1;
685 else if ((cp >> 5) == 0x6) length = 2;
686 else if ((cp >> 4) == 0xe) length = 3;
687 else if ((cp >> 3) == 0x1e) length = 4;
704 str.remove_prefix(length);
710 using char_type =
typename std::remove_cvref_t<
decltype(str)>::value_type;
713 auto it = std::to_address(str.begin());
714 const auto end = std::to_address(str.end());
722 if ((cp >> 5) == 0x6) length = 2;
723 else if ((cp >> 4) == 0xe) length = 3;
724 else if ((cp >> 3) == 0x1e) length = 4;
734 [[gsl::suppress(type.1)]]
736 [[gsl::suppress(
"type.1")]]
740 using char_type =
typename std::remove_cvref_t<
decltype(
buffer)>::value_type;
768 else if (cp < 0x10000)
788 using dest_char =
typename std::decay_t<
decltype(
dest)>::value_type;
798 template <
typename T>
801 using dest_char =
typename std::decay_t<
decltype(
dest)>::value_type;
811 template <
typename RESULT>
828 [[gsl::suppress(type.1,
es.79)]]
830 [[gsl::suppress(
"type.1",
"es.79")]]
834 using char_type =
typename std::remove_cvref_t<
decltype(str)>::value_type;
837 if (str.empty())
return 0;
841 const int length =
int(cp >= 0xD800 && cp <= 0xDBFF) + 1;
846 cp = ((cp - 0xD800) << 10) | (*
it - 0xDC00);
848 str.remove_prefix(length);
854 if (str.empty())
return 0;
855 const auto result = str[0];
856 str.remove_prefix(1);
861 [[gsl::suppress(type.1)]]
863 [[gsl::suppress(
"type.1")]]
867 using char_type =
typename std::remove_cvref_t<
decltype(
buffer)>::value_type;
887 [[gsl::suppress(type.1)]]
889 [[gsl::suppress(
"type.1")]]
893 using char_type =
typename T::value_type;
897 return {
static_cast<char_type>((cp >> 6) | 0xc0),
static_cast<char_type>((cp & 0x3f) | 0x80) };
898 else if (cp < 0x10000)
899 return {
static_cast<char_type>((cp >> 12) | 0xe0),
static_cast<char_type>(((cp >> 6) & 0x3f) | 0x80),
static_cast<char_type>((cp & 0x3f) | 0x80) };
901 return {
static_cast<char_type>((cp >> 18) | 0xf0),
static_cast<char_type>(((cp >> 12) & 0x3f) | 0x80),
static_cast<char_type>(((cp >> 6) & 0x3f) | 0x80),
static_cast<char_type>((cp & 0x3f) | 0x80) };
904 template <
string8 RESULT,
stringable8 STR>
907 if constexpr (std::same_as<STR, RESULT>)
908 return std::forward<STR>(str);
911 using char_type =
typename RESULT::value_type;
916 template <
string8 RESULT,
stringable16 STR>
920 auto sv = make_sv(str);
927 template <
string8 RESULT,
stringable32 STR>
931 auto sv = make_sv(str);
938 [[
nodiscard]]
inline std::string to_string(std::wstring_view str)
943 template <
string16 T>
945 [[gsl::suppress(type.1)]]
947 [[gsl::suppress(
"type.1")]]
955 return {
static_cast<char_type>((cp >> 10) + 0xD800),
static_cast<char_type>((cp & 0x3FF) + 0xDC00) };
958 template <
string16 T,
stringable8 STR>
962 auto sv = make_sv(str);
973 template <std::ranges::view R>
974 struct utf8_view :
public std::ranges::view_interface<utf8_view<R>>
976 template <
typename RANGE_ITER,
typename SENTINEL>
979 using iterator_category = std::forward_iterator_tag;
984 constexpr utf8_iterator(
RANGE_ITER current,
SENTINEL end) : mCurrent(std::move(current)), mEnd(std::move(end)) {}
986 [[
nodiscard]]
constexpr value_type operator*()
const {
987 const auto length = len();
989 char32_t cp = std::bit_cast<uint8_t>(*
it);
993 ++
it; cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
996 ++
it; cp = ((cp << 12) & 0xffff) + ((*
it << 6) & 0xfff);
997 ++
it; cp += (*it) & 0x3f;
1000 ++
it; cp = ((cp << 18) & 0x1fffff) + (((*it) << 12) & 0x3ffff);
1001 ++
it; cp += ((*it) << 6) & 0xfff;
1002 ++
it; cp += (*it) & 0x3f;
1008 constexpr utf8_iterator& operator++() {
1009 std::advance(mCurrent, len());
1013 constexpr utf8_iterator operator++(
int) {
1019 constexpr auto operator<=>(utf8_iterator
const&)
const noexcept =
default;
1025 if (mCurrent >= mEnd)
1026 throw std::out_of_range(
"utf8 iterator out of range");
1028 const unsigned cp = std::bit_cast<uint8_t>(*mCurrent);
1030 if (cp < 0x80) length = 1;
1031 else if ((cp >> 5) == 0x6) length = 2;
1032 else if ((cp >> 4) == 0xe) length = 3;
1033 else if ((cp >> 3) == 0x1e) length = 4;
1035 throw std::runtime_error(
"invalid utf-8 prefix");
1037 if (mCurrent + length > mEnd)
1038 throw std::runtime_error(
"utf-8 range contains codepoint with length beyond end of range");
1054 template <
typename...
ARGS>
1055 requires std::constructible_from<std::string_view,
ARGS...>
1057 : mBase(std::forward<ARGS>(
args)...)
1061 constexpr R base()
const&
1065 constexpr R base()&&
1067 return std::move(mBase);
1070 constexpr auto begin()
const
1072 return utf8_iterator<
decltype(std::begin(mBase)),
decltype(std::end(mBase))>{std::begin(mBase), std::end(mBase)};
1075 constexpr auto end()
const
1077 return utf8_iterator<
decltype(std::end(mBase)),
decltype(std::end(mBase))>{std::end(mBase), std::end(mBase)};
Whether the type is a native char type.
The type is a string with a 16-bit char type.
The type is a string with a 32-bit char type.
The type is a string with an 8-bit char type.
The type is a string view with a 16-bit char type.
The type is a string view with a 32-bit char type.
The type is a string view with an 8-bit char type.
The type is convertible to a string view with a 16-bit char type.
The type is convertible to a string view with a 32-bit char type.
The type is convertible to a string view with an 8-bit char type.
constexpr auto bit_count
Equal to the number of bits in the type.
unicode_plane
Represents the Unicode plane.
constexpr char32_t consume_utf32(string_view32 auto &str)
Consumes (see consume()) a UTF-32 codepoint from str.
text_encoding_type
Specifies a base text-encoding, ignoring endianness for multi-byte encodings.
constexpr void transcode_unicode(FROM const &from, TO &out)
Converts a UTF-encoded string to a UTF-encoded string, of a different encoding. Decides the encodings...
constexpr void transcode_codepage_to_unicode(T &dest, stringable8 auto source, std::span< char32_t const, 128 > codepage_map)
Transcodes an Extended ASCII string source into unicode-encoded dest, according to codepage_map.
constexpr bool is_unicode_character(char32_t cp) noexcept
Returns whether cp has a value that is a valid Unicode character (ie.
constexpr size_t append_utf32(string32 auto &buffer, char32_t cp)
Appends 32-bit values to buffer by encoding cp into UTF-32.
constexpr char32_t surrogate_pair_to_codepoint(char32_t high, char32_t low) noexcept
Returns the codepoint encoded by two surrogates.
constexpr size_t count_utf8_codepoints(stringable8 auto str)
Returns the number of codepoints in the given UTF-8 string str
constexpr bool is_unicode(char32_t cp) noexcept
Returns whether cp has a value that is a valid Unicode codepoint (i.e. between 0 and 0x10FFFF).
constexpr void transcode_codepage_to_utf8(string8 auto &dest, stringable8 auto source, std::span< char32_t const, 128 > codepage_map)
Transcodes an Extended ASCII string source into UTF-8 dest, according to codepage_map
constexpr size_t append_utf8(string8 auto &buffer, char32_t cp)
Appends octets to buffer by encoding cp into UTF-8.
constexpr RESULT to_utf8(char32_t cp)
Returns cp encoded as a UTF-8 string.
constexpr char32_t consume_utf16(string_view16 auto &str)
Consumes (see consume()) a UTF-16 codepoint from str.
constexpr RESULT to_utf16(char32_t cp)
Returns cp encoded as a UTF-16 string.
constexpr bool is_surrogate(char32_t cp) noexcept
Returns whether cp is a codepoint that encodes any part of a codepoint with a more-than-16-bit value.
constexpr void append_codepoint(T &str, char32_t cp)
Appends a codepoint to a UTF-encoded string. Supports UTF-8, UTF-16 and UTF-32, decides based on char...
constexpr char32_t consume_codepoint(T &str)
Consumes a codepoint from a UTF-encoded string and returns it.
constexpr char32_t consume_utf8(string_view8 auto &str)
Consumes (see consume()) a UTF-8 codepoint from str.
constexpr size_t codepoint_utf8_count(char32_t cp) noexcept
Returns the number of UTF-8 octets necessary to encode the given codepoint.
constexpr bool is_high_surrogate(char32_t cp) noexcept
Returns whether cp is a codepoint that encodes the high part of a codepoint with a more-than-16-bit v...
text_encoding detect_encoding(T const &range)
Attempts to detect the encoding of a given bytelike range.
constexpr size_t append_utf16(string16 auto &buffer, char32_t cp)
Appends 16-bit values to buffer by encoding cp into UTF-16.
text_decode_result decode_codepoint(bytelike_range auto range, text_encoding encoding)
Attempts to decode the first codepoint in bytelike range range, assuming it is encoded in encoding.
constexpr text_encoding unknown_text_encoding
Represents an unknown text encoding (e.g. when an encoding could not be determined)
std::wstring to_wstring(std::string_view str)
Returns str (a UTF-8-encoded string) encoded as a UTF-16/32 string in a std::wstring (depending on th...
constexpr bool is_low_surrogate(char32_t cp) noexcept
Returns whether cp is a codepoint that encodes the low part of a codepoint with a more-than-16-bit va...
text_encoding consume_bom(std::span< BYTE_TYPE, N > &spn)
Consumes (see consume()) a byte order mark from the beginning of spn (a span of bytelike),...
@ invalid
Represents an invalid plane number.
Type that represents a specific text encoding - a combination of ghassanpl::string_ops::text_encoding...
The below code is based on Sun's libm library code, which is licensed under the following license:
Shamelessly stolen from https://github.com/arc80/plywood/.
uint8_t byte_count
The number of bytes this codepoint takes up in the input string.
char32_t point
The decoded codepoint (or -1 if failed)
A simple view over a UTF-8 string range with codepoint values.