header_utils
Loading...
Searching...
No Matches
unicode.h
1
4
5#pragma once
6
7#include "string_ops.h"
8#include "bytes.h"
9#include "cpp23.h"
10
12{
18
20 [[nodiscard]] constexpr bool is_high_surrogate(char32_t cp) noexcept;
21
23 [[nodiscard]] constexpr bool is_low_surrogate(char32_t cp) noexcept;
24
26 [[nodiscard]] constexpr bool is_surrogate(char32_t cp) noexcept;
27
29 [[nodiscard]] constexpr bool is_unicode(char32_t cp) noexcept;
30
33 [[nodiscard]] constexpr bool is_unicode_character(char32_t cp) noexcept;
34
36 [[nodiscard]] constexpr char32_t surrogate_pair_to_codepoint(char32_t high, char32_t low) noexcept;
37
38 enum class unicode_plane;
39 [[nodiscard]] constexpr auto get_unicode_plane(char32_t cp) noexcept -> unicode_plane;
40
/// Specifies a base text encoding, ignoring endianness for multi-byte encodings.
enum class text_encoding_type
{
	unknown, utf8, utf16, utf32,
	utf7, utf1, utf_ebcdic,
	scsu, bocu1, gb18030,
};
48
51 {
53 std::endian endianness;
54
55 [[nodiscard]] constexpr auto operator<=>(text_encoding const& other) const noexcept = default;
56 };
57
61 constexpr inline text_encoding utf8_encoding = { text_encoding_type::utf8, std::endian::native };
62 constexpr inline text_encoding utf16_le_encoding = { text_encoding_type::utf16, std::endian::little };
63 constexpr inline text_encoding utf16_be_encoding = { text_encoding_type::utf16, std::endian::big };
64 constexpr inline text_encoding utf32_le_encoding = { text_encoding_type::utf32, std::endian::little };
65 constexpr inline text_encoding utf32_be_encoding = { text_encoding_type::utf32, std::endian::big };
67 constexpr inline text_encoding unknown_text_encoding = { text_encoding_type::unknown, std::endian::native };
69
71
72 struct text_decode_result;
73
75 [[nodiscard]] text_decode_result decode_codepoint(bytelike_range auto range, text_encoding encoding);
76
79 template <bytelike BYTE_TYPE, size_t N>
80 text_encoding consume_bom(std::span<BYTE_TYPE, N>& spn);
81
84 text_encoding consume_bom(string_view8 auto& sv);
85
88 text_encoding consume_bom(string_view16 auto& sv);
89
92 text_encoding consume_bom(string_view32 auto& sv);
93
96 template <bytelike_range T>
97 [[nodiscard]] text_encoding detect_encoding(T const& range);
98
101
103 [[nodiscard]] constexpr size_t codepoint_utf8_count(char32_t cp) noexcept;
104
107 constexpr char32_t consume_utf8(string_view8 auto& str);
108
111 [[nodiscard]] constexpr size_t count_utf8_codepoints(stringable8 auto str);
112
114 constexpr size_t append_utf8(string8 auto& buffer, char32_t cp);
115
119 template <string8 RESULT = std::string>
120 [[nodiscard]] constexpr RESULT to_utf8(char32_t cp);
121
125 template <string8 RESULT = std::string, stringable16 STR>
126 [[nodiscard]] constexpr RESULT to_utf8(STR&& str);
127
131 template <string8 RESULT = std::string, stringable32 STR>
132 [[nodiscard]] constexpr RESULT to_utf8(STR&& str);
133
135 [[nodiscard]] std::string to_string(std::wstring_view str);
136
140 constexpr void transcode_codepage_to_utf8(string8 auto& dest, stringable8 auto source, std::span<char32_t const, 128> codepage_map);
141
147 template <string8 RESULT = std::string>
148 [[nodiscard]] constexpr auto transcode_codepage_to_utf8(stringable8 auto source, std::span<char32_t const, 128> codepage_map) -> RESULT;
149
151
153 template <std::ranges::view R>
154 struct utf8_view;
155
158
161 constexpr char32_t consume_utf16(string_view16 auto& str);
162
165 constexpr size_t append_utf16(string16 auto& buffer, char32_t cp);
166
170 template <string16 RESULT = std::wstring>
171 [[nodiscard]] constexpr RESULT to_utf16(char32_t cp);
172
176 template <string16 RESULT = std::wstring, stringable8 STR>
177 [[nodiscard]] constexpr RESULT to_utf16(STR str);
178
181 [[nodiscard]] std::wstring to_wstring(std::string_view str);
182
184
185
188
191 constexpr char32_t consume_utf32(string_view32 auto& str);
192
195 constexpr size_t append_utf32(string32 auto& buffer, char32_t cp);
196
198
200 template <typename T>
201 constexpr char32_t consume_codepoint(T& str)
202 {
203 if (std::empty(str))
204 return 0;
205 using in_char_type = std::ranges::range_value_t<std::remove_cvref_t<T>>;
206 if constexpr (stringable8<T>)
207 return consume_utf8(str);
208 else if constexpr (stringable16<T>)
209 return consume_utf16(str);
210 else if constexpr (stringable32<T>)
211 return consume_utf32(str);
212 else
213 static_assert(stringable8<T>, "Unsupported character type");
214 }
215
217 template <typename T>
218 constexpr void append_codepoint(T& str, char32_t cp)
219 {
220 if constexpr (string8<T>)
221 append_utf8(str, cp);
222 else if constexpr (string16<T>)
223 append_utf16(str, cp);
224 else if constexpr (string32<T>)
225 append_utf32(str, cp);
226 else
227 static_assert(string8<T>, "Unsupported character type");
228 }
229
/// Converts the UTF-encoded string `from` into `out`, re-encoding every codepoint.
/// The source and destination encodings are chosen from the character types of
/// `FROM` and `TO`. Codepoints are appended to `out`; it is not cleared first.
template <typename TO, typename FROM>
constexpr void transcode_unicode(FROM const& from, TO& out)
{
	auto from_sv = make_sv(from);
	// Decode one codepoint at a time from the source view and re-encode it into `out`.
	while (!std::empty(from_sv))
		append_codepoint(out, consume_codepoint(from_sv));
}
238
/// Returns `from` transcoded into a freshly constructed string of type `TO`.
template <typename TO, typename FROM>
[[nodiscard]] constexpr TO transcode_unicode(FROM const& from)
{
	TO transcoded{};
	transcode_unicode(from, transcoded);
	return transcoded;
}
247
251 template <typename T>
252 constexpr void transcode_codepage_to_unicode(T& dest, stringable8 auto source, std::span<char32_t const, 128> codepage_map);
253
258 template <typename RESULT = std::string>
259 [[nodiscard]] constexpr auto transcode_codepage_to_unicode(stringable8 auto source, std::span<char32_t const, 128> codepage_map) -> RESULT;
260
262
271
272 template <bytelike BYTE_TYPE, size_t N>
273 inline text_encoding consume_bom(std::span<BYTE_TYPE, N>& sv)
274 {
275 static_assert(N == std::dynamic_extent, "consume_bom() can only be used with dynamic spans");
276
277 static constexpr auto bom_for_gb18030 = std::array{ static_cast<BYTE_TYPE>(0x84), static_cast<BYTE_TYPE>(0x31), static_cast<BYTE_TYPE>(0x95), static_cast<BYTE_TYPE>(0x33) };
278 static constexpr auto bom_for_utf32_be = std::array{ static_cast<BYTE_TYPE>(0x00), static_cast<BYTE_TYPE>(0x00), static_cast<BYTE_TYPE>(0xFE), static_cast<BYTE_TYPE>(0xFF) };
279 static constexpr auto bom_for_utf32_le = std::array{ static_cast<BYTE_TYPE>(0xFF), static_cast<BYTE_TYPE>(0xFE), static_cast<BYTE_TYPE>(0x00), static_cast<BYTE_TYPE>(0x00) };
280 static constexpr auto bom_for_utf_ebcdic = std::array{ static_cast<BYTE_TYPE>(0xDD), static_cast<BYTE_TYPE>(0x73), static_cast<BYTE_TYPE>(0x66), static_cast<BYTE_TYPE>(0x73) };
281 static constexpr auto bom_for_utf8 = std::array{ static_cast<BYTE_TYPE>(0xEF), static_cast<BYTE_TYPE>(0xBB), static_cast<BYTE_TYPE>(0xBF) };
282 static constexpr auto bom_for_utf7 = std::array{ static_cast<BYTE_TYPE>(0x2B), static_cast<BYTE_TYPE>(0x2F), static_cast<BYTE_TYPE>(0x76) };
283 static constexpr auto bom_for_utf1 = std::array{ static_cast<BYTE_TYPE>(0xF7), static_cast<BYTE_TYPE>(0x64), static_cast<BYTE_TYPE>(0x4C) };
284 static constexpr auto bom_for_scsu = std::array{ static_cast<BYTE_TYPE>(0x0E), static_cast<BYTE_TYPE>(0xFE), static_cast<BYTE_TYPE>(0xFF) };
285 static constexpr auto bom_for_bocu1 = std::array{ static_cast<BYTE_TYPE>(0xFB), static_cast<BYTE_TYPE>(0xEE), static_cast<BYTE_TYPE>(0x28) };
286 static constexpr auto bom_for_utf16_be = std::array{ static_cast<BYTE_TYPE>(0xFE), static_cast<BYTE_TYPE>(0xFF) };
287 static constexpr auto bom_for_utf16_le = std::array{ static_cast<BYTE_TYPE>(0xFF), static_cast<BYTE_TYPE>(0xFE) };
288
289 if (!sv.empty())
290 {
291 using enum std::endian;
293 if (starts_with(sv, std::span<BYTE_TYPE const>{ bom_for_gb18030 })) { sv = sv.subspan(bom_for_gb18030.size()); return { gb18030, native }; }
294 if (starts_with(sv, std::span<BYTE_TYPE const>{ bom_for_utf32_be })) { sv = sv.subspan(bom_for_utf32_be.size()); return { utf32, big }; }
295 if (starts_with(sv, std::span<BYTE_TYPE const>{ bom_for_utf32_le })) { sv = sv.subspan(bom_for_utf32_le.size()); return { utf32, little }; }
296 if (starts_with(sv, std::span<BYTE_TYPE const>{ bom_for_utf_ebcdic })) { sv = sv.subspan(bom_for_utf_ebcdic.size()); return { utf_ebcdic, native }; }
297 if (starts_with(sv, std::span<BYTE_TYPE const>{ bom_for_utf8 })) { sv = sv.subspan(bom_for_utf8.size()); return { utf8, native }; }
298 if (starts_with(sv, std::span<BYTE_TYPE const>{ bom_for_utf7 })) { sv = sv.subspan(bom_for_utf7.size()); return { utf7, native }; }
299 if (starts_with(sv, std::span<BYTE_TYPE const>{ bom_for_utf1 })) { sv = sv.subspan(bom_for_utf1.size()); return { utf1, native }; }
300 if (starts_with(sv, std::span<BYTE_TYPE const>{ bom_for_scsu })) { sv = sv.subspan(bom_for_scsu.size()); return { scsu, native }; }
301 if (starts_with(sv, std::span<BYTE_TYPE const>{ bom_for_bocu1 })) { sv = sv.subspan(bom_for_bocu1.size()); return { bocu1, native }; }
302 if (starts_with(sv, std::span<BYTE_TYPE const>{ bom_for_utf16_be })) { sv = sv.subspan(bom_for_utf16_be.size()); return { utf16, big }; }
303 if (starts_with(sv, std::span<BYTE_TYPE const>{ bom_for_utf16_le })) { sv = sv.subspan(bom_for_utf16_le.size()); return { utf16, little }; }
304 }
306 }
307
309 {
310 static constexpr std::string_view bom_for_gb18030{ "\x84\x31\x95\x33", 4 };
311 static constexpr std::string_view bom_for_utf32_be{ "\x00\x00\xFE\xFF", 4 };
312 static constexpr std::string_view bom_for_utf32_le{ "\xFF\xFE\x00\x00", 4 };
313 static constexpr std::string_view bom_for_utf_ebcdic{ "\xDD\x73\x66\x73", 4 };
314 static constexpr std::string_view bom_for_utf8{ "\xEF\xBB\xBF", 3 };
315 static constexpr std::string_view bom_for_utf7{ "\x2B\x2F\x76", 3 };
316 static constexpr std::string_view bom_for_utf1{ "\xF7\x64\x4C", 3 };
317 static constexpr std::string_view bom_for_scsu{ "\x0E\xFE\xFF", 3 };
318 static constexpr std::string_view bom_for_bocu1{ "\xFB\xEE\x28", 3 };
319 static constexpr std::string_view bom_for_utf16_be{ "\xFE\xFF", 2 };
320 static constexpr std::string_view bom_for_utf16_le{ "\xFF\xFE", 2 };
321
322 if (!sv.empty())
323 {
324 using enum std::endian;
326 if (sv.starts_with(bom_for_gb18030)) { sv.remove_prefix(bom_for_gb18030.size()); return { gb18030, native }; }
327 if (sv.starts_with(bom_for_utf32_be)) { sv.remove_prefix(bom_for_utf32_be.size()); return { utf32, big }; }
328 if (sv.starts_with(bom_for_utf32_le)) { sv.remove_prefix(bom_for_utf32_le.size()); return { utf32, little }; }
329 if (sv.starts_with(bom_for_utf_ebcdic)) { sv.remove_prefix(bom_for_utf_ebcdic.size()); return { utf_ebcdic, native }; }
330 if (sv.starts_with(bom_for_utf8)) { sv.remove_prefix(bom_for_utf8.size()); return { utf8, native }; }
331 if (sv.starts_with(bom_for_utf7)) { sv.remove_prefix(bom_for_utf7.size()); return { utf7, native }; }
332 if (sv.starts_with(bom_for_utf1)) { sv.remove_prefix(bom_for_utf1.size()); return { utf1, native }; }
333 if (sv.starts_with(bom_for_scsu)) { sv.remove_prefix(bom_for_scsu.size()); return { scsu, native }; }
334 if (sv.starts_with(bom_for_bocu1)) { sv.remove_prefix(bom_for_bocu1.size()); return { bocu1, native }; }
335 if (sv.starts_with(bom_for_utf16_be)) { sv.remove_prefix(bom_for_utf16_be.size()); return { utf16, big }; }
336 if (sv.starts_with(bom_for_utf16_le)) { sv.remove_prefix(bom_for_utf16_le.size()); return { utf16, little }; }
337 }
339 }
340
342 {
343 static constexpr std::string_view bom_for_utf16_be{ "\xFE\xFF", 2 };
344 static constexpr std::string_view bom_for_utf16_le{ "\xFF\xFE", 2 };
345
346 if (!sv.empty())
347 {
348 const auto as_bytes = std::string_view{ (char*)sv.data(), sv.size() * sizeof(char16_t) };
349
350 if (as_bytes.starts_with(bom_for_utf16_be)) { sv.remove_prefix(1); return { text_encoding_type::utf16, std::endian::big }; }
351 if (as_bytes.starts_with(bom_for_utf16_le)) { sv.remove_prefix(1); return { text_encoding_type::utf16, std::endian::little }; }
352 }
354 }
355
357 {
358 static constexpr std::string_view bom_for_utf32_be{ "\xFF\xFE\x00\x00", 4 };
359 static constexpr std::string_view bom_for_utf32_le{ "\x00\x00\xFE\xFF", 4 };
360
361 if (!sv.empty())
362 {
363 const auto as_bytes = std::string_view{ (char*)sv.data(), sv.size() * sizeof(char32_t) };
364
365 if (as_bytes.starts_with(bom_for_utf32_be)) { sv.remove_prefix(1); return { text_encoding_type::utf32, std::endian::big }; }
366 if (as_bytes.starts_with(bom_for_utf32_le)) { sv.remove_prefix(1); return { text_encoding_type::utf32, std::endian::little }; }
367 }
369 }
370
/// The outcome of decoding a single codepoint from a byte range.
struct text_decode_result
{
	/// How the decoding attempt ended.
	enum status : uint8_t {
		unsupported_encoding, ///< The requested encoding has no decoder.
		truncated,            ///< The input ended before a complete codepoint was read.
		invalid,              ///< The input bytes do not form a valid codepoint.
		valid,                ///< A codepoint was decoded successfully.
	};

	/// The status of the decoding attempt.
	/// (The elaborated `enum status` keeps the type name usable although the member
	/// shares its name.)
	enum status status = unsupported_encoding;
	/// The decoded codepoint (or the offending code unit for some failures);
	/// all-ones when nothing was decoded.
	char32_t point = static_cast<char32_t>(-1);
	/// The number of bytes the caller should skip past.
	size_t byte_count = 0;
};
388
/// Represents the Unicode plane. The value of each valid enumerator equals the
/// plane number, i.e. the codepoint shifted right by 16 bits.
enum class unicode_plane
{
	/// Returned for values that are not Unicode codepoints.
	invalid = -1,

	basic_multilingual_plane = 0,
	supplementary_multilingual_plane = 1,
	supplementary_ideographic_plane = 2,
	tertiary_ideographic_plane = 3,
	// Planes 4 through 13 have no named enumerator.
	supplementary_special_purpose_plane = 14,
	private_use_plane_a = 15,
	private_use_plane_b = 16,

	// Short aliases.
	bmp = basic_multilingual_plane,
	smp = supplementary_multilingual_plane,
	sip = supplementary_ideographic_plane,
	tip = tertiary_ideographic_plane,
	ssp = supplementary_special_purpose_plane,
	spua_a = private_use_plane_a,
	pup_a = spua_a,
	spua_b = private_use_plane_b,
	pup_b = spua_b,
};
412
/// Boundaries of the Unicode codespace and of the UTF-16 surrogate ranges.
inline constexpr char32_t last_unicode_code_point = 0x10FFFF;
inline constexpr char32_t first_unicode_high_surrogate = 0xD800;
inline constexpr char32_t last_unicode_high_surrogate = 0xDBFF;
inline constexpr char32_t first_unicode_low_surrogate = 0xDC00;
inline constexpr char32_t last_unicode_low_surrogate = 0xDFFF;

/// Whether `cp` lies in the high (leading) surrogate range.
constexpr bool is_high_surrogate(char32_t cp) noexcept
{
	return first_unicode_high_surrogate <= cp && cp <= last_unicode_high_surrogate;
}

/// Whether `cp` lies in the low (trailing) surrogate range.
constexpr bool is_low_surrogate(char32_t cp) noexcept
{
	return first_unicode_low_surrogate <= cp && cp <= last_unicode_low_surrogate;
}

/// Whether `cp` lies anywhere between the first high and the last low surrogate.
constexpr bool is_surrogate(char32_t cp) noexcept
{
	return first_unicode_high_surrogate <= cp && cp <= last_unicode_low_surrogate;
}

/// Whether `cp` is within the Unicode codespace (at most `last_unicode_code_point`).
constexpr bool is_unicode(char32_t cp) noexcept
{
	return !(cp > last_unicode_code_point);
}
423 constexpr auto get_unicode_plane(char32_t cp) noexcept -> unicode_plane { return is_unicode(cp) ? unicode_plane(cp >> 16) : unicode_plane::invalid; }
424 constexpr bool is_unicode_character(char32_t cp) noexcept { return is_unicode(cp) && ((cp & 0xFFFE) != 0xFFFE) && !(cp >= 0xFDD0 && cp <= 0xFDEF); }
425
426 constexpr char32_t surrogate_pair_to_codepoint(char32_t high, char32_t low) noexcept
427 {
428 return 0x10000 + ((high - first_unicode_high_surrogate) << 10) + (low - first_unicode_low_surrogate);
429 }
430
431 constexpr std::pair<char32_t, char32_t> codepoint_to_surrogate_pair(char32_t cp) noexcept
432 {
433 return { ((cp - 0x10000) >> 10) + first_unicode_high_surrogate, ((cp - 0x10000) & 0x3FF) + first_unicode_low_surrogate };
434 }
435
436 namespace detail
437 {
438 template <std::integral T>
439 T get_val(const void* source, std::endian source_endianness)
440 {
441 T result{};
442 std::memcpy(&result, source, sizeof(T));
443 if (std::endian::native != source_endianness)
444 return byteswap(result);
445 return result;
446 }
447 }
448
449 [[nodiscard]] constexpr size_t codepoint_utf8_count(char32_t cp) noexcept
450 {
451 constexpr size_t lut[33] = { 7,6,6,6,6,6,5,5,5,5,5,4,4,4,4,4,3,3,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1 };
452 return lut[std::countl_zero(std::bit_cast<uint32_t>(cp))];
453 }
454
456 {
457 auto sv = make_sv(_str);
458 if (sv.empty())
459 return { text_decode_result::truncated };
460
461 switch (encoding.type)
462 {
463 case text_encoding_type::utf8:
464 {
465 const auto first = std::bit_cast<uint8_t>(sv[0]);
466 if (first < 0x80)
467 return { text_decode_result::valid, first, 1 };
468
469 char32_t value = 0;
470 uint8_t length = 0;
471 if ((first >> 5) == 0x6) { length = 2; value = first & 0x1F; }
472 else if ((first >> 4) == 0xe) { length = 3; value = first & 0xF; }
473 else if ((first >> 3) == 0x1e) { length = 4; value = first & 0x7; }
474
475 if (length == 0 || sv.size() < length)
476 return { text_decode_result::truncated, first, 1 };
477
478 switch (length)
479 {
480 case 4:
481 if ((sv[3] & 0xC0) != 0x80)
482 return { text_decode_result::invalid, first, 1 };
483 value = (value << 6) | (sv[3] & 0x3f);
484 case 3:
485 if ((sv[2] & 0xC0) != 0x80)
486 return { text_decode_result::invalid, first, 1 };
487 value = (value << 6) | (sv[2] & 0x3f);
488 case 2:
489 if ((sv[1] & 0xC0) != 0x80)
490 return { text_decode_result::invalid, first, 1 };
491 value = (value << 6) | (sv[1] & 0x3f);
492 return { text_decode_result::valid, value, length };
493 }
494 }
495 case text_encoding_type::utf16:
496 {
497 if (sv.size() < 2)
498 return { text_decode_result::truncated };
499
500 const uint16_t first = detail::get_val<uint16_t>(sv.data(), encoding.endianness);
501 if (is_surrogate(first))
502 {
504 {
505 if (sv.size() < 4)
506 return { text_decode_result::truncated, first, 2 };
507
508 const uint16_t second = detail::get_val<uint16_t>(sv.data() + 2, encoding.endianness);
510 return { text_decode_result::valid, surrogate_pair_to_codepoint(first, second), 4 };
511 }
512
513 return { text_decode_result::invalid, first, 2 };
514 }
515
516 return { text_decode_result::valid, first, 2 };
517 }
518 case text_encoding_type::utf32:
519 {
520 if (sv.size() < 4)
521 return { text_decode_result::truncated };
522
523 const uint32_t cp = detail::get_val<uint32_t>(sv.data(), encoding.endianness);
524 if (is_unicode(cp))
525 return { text_decode_result::valid, cp, 4 };
526 return { text_decode_result::invalid, cp, 4 };
527 }
528 default:
529 return {};
530 }
531 }
532
533 template <bytelike_range T>
534 [[nodiscard]] inline text_encoding detect_encoding(T const& str)
535 {
536 struct TextFileStats {
537 size_t points = 0;
538 size_t valid_points = 0;
539 size_t control_points = 0;
540 size_t plain_ascii = 0;
541 size_t whitespace = 0;
542 size_t extended_codepoints = 0;
543 float one_over_points = 0.f;
544 size_t invalid_points() const { return points - valid_points; }
545 float score() const { return (2.5f * whitespace + plain_ascii - 100.f * invalid_points() - 50.f * control_points + 5.f * extended_codepoints) * one_over_points; }
546 };
547
548 static const auto calculate_stats = [](TextFileStats& stats, bytelike_range auto sv, text_encoding encoding) {
549 size_t numBytes = 0;
550 while (!sv.empty())
551 {
553 if (decoded.status == text_decode_result::truncated)
554 break;
555 sv.remove_prefix(decoded.byte_count);
556 numBytes += decoded.byte_count;
557 stats.points++;
558 if (decoded.status == text_decode_result::valid)
559 {
560 stats.valid_points++;
561 if (decoded.point < 32)
562 {
563 if (decoded.point == '\n' || decoded.point == '\t')
564 {
565 stats.plain_ascii++;
566 stats.whitespace++;
567 }
568 else if (decoded.point == '\r')
569 stats.plain_ascii++;
570 else
571 stats.control_points++;
572 }
573 else if (decoded.point < 127)
574 {
575 stats.plain_ascii++;
576 if (decoded.point == ' ')
577 stats.whitespace++;
578 }
579 else if (decoded.point >= 65536)
580 stats.extended_codepoints++;
581 }
582 }
583 if (stats.points > 0)
584 stats.one_over_points = 1.f / stats.points;
585 return numBytes;
586 };
587
588 auto sv = make_sv(str);
589
590 if (sv.empty())
592
594 return encoding;
595
596 sv = sv.substr(0, 4096);
597
599
601 size_t numBytesRead = calculate_stats(stats8, sv, utf8_encoding);
602 if (numBytesRead == 0)
603 return utf8_encoding;
604
606 if (stats8.invalid_points() == 0 && stats8.control_points == 0)
607 return utf8_encoding;
608
611 text_encoding encoding8 = utf8_encoding;
612 if (const auto numHighBytes = numBytesRead - stats8.plain_ascii - stats8.control_points;
613 stats8.invalid_points() >= numHighBytes * 0.2f)
614 {
617 stats8.points = numBytesRead;
618 stats8.valid_points = numBytesRead;
619 }
620
623 calculate_stats(stats16_le, sv, utf16_le_encoding);
624
626 calculate_stats(stats16_be, sv, utf16_be_encoding);
627
630 text_encoding encoding16 = utf16_le_encoding;
631 if (stats16_be.score() > stats16_le.score())
632 {
634 encoding16 = utf16_be_encoding;
635 }
636
639 calculate_stats(stats32_le, sv, utf32_le_encoding);
640
642 calculate_stats(stats32_be, sv, utf32_be_encoding);
643
646 text_encoding encoding32 = utf32_le_encoding;
647 if (stats32_be.score() > stats32_le.score())
648 {
650 encoding32 = utf32_be_encoding;
651 }
652
654 const auto score8 = stats8.score();
655 const auto score16 = stats16->score();
656 const auto score32 = stats32->score();
657 if (score8 >= score32)
658 {
659 if (score16 >= score8)
660 return encoding16;
661 return encoding8;
662 }
663
664 if (score32 >= score16)
665 return encoding32;
666 return encoding16;
667 }
668
669#ifndef __clang__
670 [[gsl::suppress(type.1, es.79)]]
671#else
672 [[gsl::suppress("type.1", "es.79")]]
673#endif
674 [[nodiscard]] constexpr char32_t consume_utf8(string_view8 auto& str)
675 {
676 using char_type = typename std::remove_cvref_t<decltype(str)>::value_type;
677 using unsigned_char_type = std::make_unsigned_t<char_type>;
678
679 if (str.empty()) return 0;
680 auto it = std::to_address(str.begin());
681 char32_t cp = static_cast<unsigned_char_type>(*it);
682
683 int length = 0;
684 if (cp < 0x80) length = 1;
685 else if ((cp >> 5) == 0x6) length = 2;
686 else if ((cp >> 4) == 0xe) length = 3;
687 else if ((cp >> 3) == 0x1e) length = 4;
688 else return 0;
689
690 switch (length) {
691 case 2:
693 break;
694 case 3:
696 ++it; cp += static_cast<unsigned_char_type>(*it) & 0x3f;
697 break;
698 case 4:
700 ++it; cp += (static_cast<unsigned_char_type>(*it) << 6) & 0xfff;
701 ++it; cp += static_cast<unsigned_char_type>(*it) & 0x3f;
702 break;
703 }
704 str.remove_prefix(length);
705 return cp;
706 }
707
708 [[nodiscard]] constexpr size_t count_utf8_codepoints(stringable8 auto str)
709 {
710 using char_type = typename std::remove_cvref_t<decltype(str)>::value_type;
711 using unsigned_char_type = std::make_unsigned_t<char_type>;
712
713 auto it = std::to_address(str.begin());
714 const auto end = std::to_address(str.end());
715
716 size_t result = 0;
717 while (it < end)
718 {
719 char32_t cp = static_cast<unsigned_char_type>(*it);
720
721 int length = 1;
722 if ((cp >> 5) == 0x6) length = 2;
723 else if ((cp >> 4) == 0xe) length = 3;
724 else if ((cp >> 3) == 0x1e) length = 4;
725
726 it += length;
727 result++;
728 }
729 return result;
730 }
731
732
733 #ifndef __clang__
734 [[gsl::suppress(type.1)]]
735#else
736 [[gsl::suppress("type.1")]]
737#endif
738 constexpr size_t append_utf8(string8 auto& buffer, char32_t cp)
739 {
740 using char_type = typename std::remove_cvref_t<decltype(buffer)>::value_type;
741#if 1
742 const size_t cp_bytes = codepoint_utf8_count(cp);
743 std::decay_t<decltype(buffer)> bytes(cp_bytes, 0);
744 switch (cp_bytes)
745 {
746 case 7: bytes[cp_bytes - 6] = static_cast<char_type>(0x80 | ((cp >> 30) & 0x3F)); [[fallthrough]];
747 case 6: bytes[cp_bytes - 5] = static_cast<char_type>(0x80 | ((cp >> 24) & 0x3F)); [[fallthrough]];
748 case 5: bytes[cp_bytes - 4] = static_cast<char_type>(0x80 | ((cp >> 18) & 0x3F)); [[fallthrough]];
749 case 4: bytes[cp_bytes - 3] = static_cast<char_type>(0x80 | ((cp >> 12) & 0x3F)); [[fallthrough]];
750 case 3: bytes[cp_bytes - 2] = static_cast<char_type>(0x80 | ((cp >> 6) & 0x3F)); [[fallthrough]];
751 case 2: bytes[cp_bytes - 1] = static_cast<char_type>(0x80 | ((cp >> 0) & 0x3F)); bytes[0] = static_cast<char_type>((std::uint_least16_t(0xFF00uL) >> cp_bytes) | (uint64_t(cp) >> (6 * cp_bytes - 6))); break;
752 case 1: bytes[0] = static_cast<char_type>(cp); break;
753 }
754 buffer += bytes;
755 return cp_bytes;
756#else
757 if (cp < 0x80)
758 {
759 buffer += static_cast<char_type>(cp);
760 return 1;
761 }
762 else if (cp < 0x800)
763 {
764 buffer += static_cast<char_type>((cp >> 6) | 0xc0);
765 buffer += static_cast<char_type>((cp & 0x3f) | 0x80);
766 return 2;
767 }
768 else if (cp < 0x10000)
769 {
770 buffer += static_cast<char_type>((cp >> 12) | 0xe0);
771 buffer += static_cast<char_type>(((cp >> 6) & 0x3f) | 0x80);
772 buffer += static_cast<char_type>((cp & 0x3f) | 0x80);
773 return 3;
774 }
775 else
776 {
777 buffer += static_cast<char_type>((cp >> 18) | 0xf0);
778 buffer += static_cast<char_type>(((cp >> 12) & 0x3f) | 0x80);
779 buffer += static_cast<char_type>(((cp >> 6) & 0x3f) | 0x80);
780 buffer += static_cast<char_type>((cp & 0x3f) | 0x80);
781 return 4;
782 }
783#endif
784 }
785
786 constexpr void transcode_codepage_to_utf8(string8 auto& dest, stringable8 auto source, std::span<char32_t const, 128> codepage_map)
787 {
788 using dest_char = typename std::decay_t<decltype(dest)>::value_type;
789 for (uint8_t cp : source)
790 {
791 if (cp < 0x80)
792 dest += static_cast<dest_char>(cp);
793 else
794 append_utf8(dest, codepage_map[static_cast<size_t>(cp) - 0x80]);
795 }
796 }
797
798 template <typename T>
799 constexpr void transcode_codepage_to_unicode(T& dest, stringable8 auto source, std::span<char32_t const, 128> codepage_map)
800 {
801 using dest_char = typename std::decay_t<decltype(dest)>::value_type;
802 for (uint8_t cp : source)
803 {
804 if (cp < 0x80)
805 dest += static_cast<dest_char>(cp);
806 else
807 append_codepoint(dest, codepage_map[static_cast<size_t>(cp) - 0x80]);
808 }
809 }
810
811 template <typename RESULT>
812 [[nodiscard]] constexpr auto transcode_codepage_to_unicode(stringable8 auto source, std::span<char32_t const, 128> codepage_map) -> RESULT
813 {
814 RESULT result{};
816 return result;
817 }
818
819 template <string8 T>
820 constexpr auto transcode_codepage_to_utf8(stringable8 auto source, std::span<char32_t const, 128> codepage_map) -> T
821 {
822 T result{};
824 return result;
825 }
826
827#ifndef __clang__
828 [[gsl::suppress(type.1, es.79)]]
829#else
830 [[gsl::suppress("type.1", "es.79")]]
831#endif
832 [[nodiscard]] constexpr char32_t consume_utf16(string_view16 auto& str)
833 {
834 using char_type = typename std::remove_cvref_t<decltype(str)>::value_type;
835 using unsigned_char_type = std::make_unsigned_t<char_type const>;
836
837 if (str.empty()) return 0;
838 auto it = (unsigned_char_type*)std::to_address(str.begin());
839 char32_t cp = *it;
840
841 const int length = int(cp >= 0xD800 && cp <= 0xDBFF) + 1;
842
843 if (length == 2)
844 {
845 ++it;
846 cp = ((cp - 0xD800) << 10) | (*it - 0xDC00);
847 }
848 str.remove_prefix(length);
849 return cp;
850 }
851
852 [[nodiscard]] constexpr char32_t consume_utf32(string_view32 auto& str)
853 {
854 if (str.empty()) return 0;
855 const auto result = str[0];
856 str.remove_prefix(1);
857 return result;
858 }
859
860#ifndef __clang__
861 [[gsl::suppress(type.1)]]
862#else
863 [[gsl::suppress("type.1")]]
864#endif
865 constexpr size_t append_utf16(string16 auto& buffer, char32_t cp)
866 {
867 using char_type = typename std::remove_cvref_t<decltype(buffer)>::value_type;
868 if (cp <= 0xFFFF)
869 {
870 buffer += static_cast<char_type>(cp);
871 return 1;
872 }
873
874 buffer += static_cast<char_type>((cp >> 10) + 0xD800);
875 buffer += static_cast<char_type>((cp & 0x3FF) + 0xDC00);
876 return 2;
877 }
878
879 constexpr size_t append_utf32(string32 auto& buffer, char32_t cp)
880 {
881 buffer += cp;
882 return 1;
883 }
884
885 template <string8 T>
886#ifndef __clang__
887 [[gsl::suppress(type.1)]]
888#else
889 [[gsl::suppress("type.1")]]
890#endif
891 [[nodiscard]] constexpr T to_utf8(char32_t cp)
892 {
893 using char_type = typename T::value_type;
894 if (cp < 0x80)
895 return { static_cast<char_type>(cp) };
896 else if (cp < 0x800)
897 return { static_cast<char_type>((cp >> 6) | 0xc0), static_cast<char_type>((cp & 0x3f) | 0x80) };
898 else if (cp < 0x10000)
899 return { static_cast<char_type>((cp >> 12) | 0xe0), static_cast<char_type>(((cp >> 6) & 0x3f) | 0x80), static_cast<char_type>((cp & 0x3f) | 0x80) };
900 else
901 return { static_cast<char_type>((cp >> 18) | 0xf0), static_cast<char_type>(((cp >> 12) & 0x3f) | 0x80), static_cast<char_type>(((cp >> 6) & 0x3f) | 0x80), static_cast<char_type>((cp & 0x3f) | 0x80) };
902 }
903
904 template <string8 RESULT, stringable8 STR>
905 [[nodiscard]] constexpr RESULT to_utf8(STR&& str)
906 {
907 if constexpr (std::same_as<STR, RESULT>)
908 return std::forward<STR>(str);
909 else
910 {
911 using char_type = typename RESULT::value_type;
912 return RESULT{ string_view_cast<char_type>(make_sv(std::forward<STR>(str))) };
913 }
914 }
915
916 template <string8 RESULT, stringable16 STR>
917 [[nodiscard]] constexpr RESULT to_utf8(STR&& str)
918 {
919 RESULT result{};
920 auto sv = make_sv(str);
921 while (!sv.empty())
922 append_utf8(result, consume_utf16(sv));
923 return result;
924 }
925
927 template <string8 RESULT, stringable32 STR>
928 [[nodiscard]] constexpr RESULT to_utf8(STR&& str)
929 {
930 RESULT result{};
931 auto sv = make_sv(str);
932 while (!sv.empty())
933 append_utf8(result, consume_utf32(sv));
934 return result;
935 }
936
938 [[nodiscard]] inline std::string to_string(std::wstring_view str)
939 {
940 return to_utf8<std::string>(str);
941 }
942
943 template <string16 T>
944#ifndef __clang__
945 [[gsl::suppress(type.1)]]
946#else
947 [[gsl::suppress("type.1")]]
948#endif
949 [[nodiscard]] constexpr T to_utf16(char32_t cp)
950 {
951 using char_type = T::value_type;
952 if (cp <= 0xFFFF)
953 return { static_cast<char_type>(cp) };
954 else
955 return { static_cast<char_type>((cp >> 10) + 0xD800), static_cast<char_type>((cp & 0x3FF) + 0xDC00) };
956 }
957
958 template <string16 T, stringable8 STR>
959 [[nodiscard]] constexpr T to_utf16(STR str)
960 {
961 T result{};
962 auto sv = make_sv(str);
963 while (!sv.empty())
964 append_utf16(result, consume_utf8(sv));
965 return result;
966 }
967
968 [[nodiscard]] inline std::wstring to_wstring(std::string_view str)
969 {
971 }
972
973 template <std::ranges::view R>
974 struct utf8_view : public std::ranges::view_interface<utf8_view<R>>
975 {
976 template <typename RANGE_ITER, typename SENTINEL>
977 struct utf8_iterator
978 {
979 using iterator_category = std::forward_iterator_tag;
980 using value_type = char32_t;
981 using difference_type = ptrdiff_t;
982 using reference = char32_t;
983
984 constexpr utf8_iterator(RANGE_ITER current, SENTINEL end) : mCurrent(std::move(current)), mEnd(std::move(end)) {}
985
986 [[nodiscard]] constexpr value_type operator*() const {
987 const auto length = len();
988 auto it = mCurrent;
989 char32_t cp = std::bit_cast<uint8_t>(*it);
990
991 switch (length) {
992 case 2:
993 ++it; cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
994 break;
995 case 3:
996 ++it; cp = ((cp << 12) & 0xffff) + ((*it << 6) & 0xfff);
997 ++it; cp += (*it) & 0x3f;
998 break;
999 case 4:
1000 ++it; cp = ((cp << 18) & 0x1fffff) + (((*it) << 12) & 0x3ffff);
1001 ++it; cp += ((*it) << 6) & 0xfff;
1002 ++it; cp += (*it) & 0x3f;
1003 break;
1004 }
1005 return cp;
1006 }
1007
1008 constexpr utf8_iterator& operator++() {
1009 std::advance(mCurrent, len());
1010 return *this;
1011 }
1012
1013 constexpr utf8_iterator operator++(int) {
1014 auto copy = *this;
1015 ++* this;
1016 return copy;
1017 }
1018
1019 constexpr auto operator<=>(utf8_iterator const&) const noexcept = default;
1020
1021 private:
1022
1023 size_t len() const
1024 {
1025 if (mCurrent >= mEnd)
1026 throw std::out_of_range("utf8 iterator out of range");
1027
1028 const unsigned cp = std::bit_cast<uint8_t>(*mCurrent);
1029 size_t length = 0;
1030 if (cp < 0x80) length = 1;
1031 else if ((cp >> 5) == 0x6) length = 2;
1032 else if ((cp >> 4) == 0xe) length = 3;
1033 else if ((cp >> 3) == 0x1e) length = 4;
1034 else
1035 throw std::runtime_error("invalid utf-8 prefix");
1036
1037 if (mCurrent + length > mEnd)
1038 throw std::runtime_error("utf-8 range contains codepoint with length beyond end of range");
1039
1040 return length;
1041 }
1042
1043 RANGE_ITER mCurrent;
1044 SENTINEL mEnd;
1045 };
1046
1047 utf8_view() = default;
1048
1049 constexpr utf8_view(R base)
1050 : mBase(base)
1051 {
1052 }
1053
1054 template <typename... ARGS>
1055 requires std::constructible_from<std::string_view, ARGS...>
1056 explicit constexpr utf8_view(ARGS&&... args)
1057 : mBase(std::forward<ARGS>(args)...)
1058 {
1059 }
1060
1061 constexpr R base() const&
1062 {
1063 return mBase;
1064 }
1065 constexpr R base()&&
1066 {
1067 return std::move(mBase);
1068 }
1069
1070 constexpr auto begin() const
1071 {
1072 return utf8_iterator<decltype(std::begin(mBase)), decltype(std::end(mBase))>{std::begin(mBase), std::end(mBase)};
1073 }
1074
1075 constexpr auto end() const
1076 {
1077 return utf8_iterator<decltype(std::end(mBase)), decltype(std::end(mBase))>{std::end(mBase), std::end(mBase)};
1078 }
1079
1080 private:
/// The underlying range of UTF-8 code units that begin()/end() iterate over; value-initialized by default.
1081 R mBase{};
1082 };
1083
1084}
Whether the type is a native char type.
Definition string_ops.h:98
The type is a string with a 16-bit char type.
Definition string_ops.h:59
The type is a string with a 32-bit char type.
Definition string_ops.h:69
The type is a string with an 8-bit char type.
Definition string_ops.h:49
The type is a string view with a 16-bit char type.
Definition string_ops.h:65
The type is a string view with a 32-bit char type.
Definition string_ops.h:75
The type is a string view with an 8-bit char type.
Definition string_ops.h:55
The type is convertible to a string view with a 16-bit char type.
Definition string_ops.h:62
The type is convertible to a string view with a 32-bit char type.
Definition string_ops.h:72
The type is convertible to a string view with an 8-bit char type.
Definition string_ops.h:52
constexpr auto bit_count
Equal to the number of bits in the type.
Definition bits.h:33
unicode_plane
Represents the Unicode plane.
Definition unicode.h:393
constexpr char32_t consume_utf32(string_view32 auto &str)
Consumes (see consume()) a UTF-32 codepoint from str.
Definition unicode.h:852
text_encoding_type
Specifies a base text-encoding, ignoring endianness for multi-byte encodings.
Definition unicode.h:43
constexpr void transcode_unicode(FROM const &from, TO &out)
Converts a UTF-encoded string to a UTF-encoded string, of a different encoding. Decides the encodings...
Definition unicode.h:232
constexpr void transcode_codepage_to_unicode(T &dest, stringable8 auto source, std::span< char32_t const, 128 > codepage_map)
Transcodes an Extended ASCII string source into unicode-encoded dest, according to codepage_map.
Definition unicode.h:799
constexpr bool is_unicode_character(char32_t cp) noexcept
Returns whether cp has a value that is a valid Unicode character (ie.
Definition unicode.h:424
constexpr size_t append_utf32(string32 auto &buffer, char32_t cp)
Appends 32-bit values to buffer by encoding cp into UTF-32.
Definition unicode.h:879
constexpr char32_t surrogate_pair_to_codepoint(char32_t high, char32_t low) noexcept
Returns the codepoint encoded by two surrogates.
Definition unicode.h:426
constexpr size_t count_utf8_codepoints(stringable8 auto str)
Returns the number of codepoints in the given UTF-8 string str
Definition unicode.h:708
constexpr bool is_unicode(char32_t cp) noexcept
Returns whether cp has a value that is a valid Unicode codepoint (i.e. between 0 and 0x10FFFF).
Definition unicode.h:422
constexpr void transcode_codepage_to_utf8(string8 auto &dest, stringable8 auto source, std::span< char32_t const, 128 > codepage_map)
Transcodes an Extended ASCII string source into UTF-8 dest, according to codepage_map
Definition unicode.h:786
constexpr size_t append_utf8(string8 auto &buffer, char32_t cp)
Appends octets to buffer by encoding cp into UTF-8.
Definition unicode.h:738
constexpr RESULT to_utf8(char32_t cp)
Returns cp encoded as a UTF-8 string.
Definition unicode.h:891
constexpr char32_t consume_utf16(string_view16 auto &str)
Consumes (see consume()) a UTF-16 codepoint from str.
Definition unicode.h:832
constexpr RESULT to_utf16(char32_t cp)
Returns cp encoded as a UTF-16 string.
Definition unicode.h:949
constexpr bool is_surrogate(char32_t cp) noexcept
Returns whether cp is a codepoint that encodes any part of a codepoint with a more-than-16-bit value.
Definition unicode.h:421
constexpr void append_codepoint(T &str, char32_t cp)
Appends a codepoint to a UTF-encoded string. Supports UTF-8, UTF-16 and UTF-32, decides based on char...
Definition unicode.h:218
constexpr char32_t consume_codepoint(T &str)
Consumes a codepoint from a UTF-encoded string and returns it.
Definition unicode.h:201
constexpr char32_t consume_utf8(string_view8 auto &str)
Consumes (see consume()) a UTF-8 codepoint from str.
Definition unicode.h:674
constexpr size_t codepoint_utf8_count(char32_t cp) noexcept
Returns the number of UTF-8 octets necessary to encode the given codepoint.
Definition unicode.h:449
constexpr bool is_high_surrogate(char32_t cp) noexcept
Returns whether cp is a codepoint that encodes the high part of a codepoint with a more-than-16-bit v...
Definition unicode.h:419
text_encoding detect_encoding(T const &range)
Attempts to detect the encoding of a given bytelike range.
Definition unicode.h:534
constexpr size_t append_utf16(string16 auto &buffer, char32_t cp)
Appends 16-bit values to buffer by encoding cp into UTF-16.
Definition unicode.h:865
text_decode_result decode_codepoint(bytelike_range auto range, text_encoding encoding)
Attempts to decode the first codepoint in bytelike range range, assuming it is encoded in encoding.
Definition unicode.h:455
constexpr text_encoding unknown_text_encoding
Represents an unknown text encoding (e.g. when an encoding could not be determined)
Definition unicode.h:67
std::wstring to_wstring(std::string_view str)
Returns str (a UTF-8-encoded string) encoded as a UTF-16/32 string in a std::wstring (depending on th...
Definition unicode.h:968
constexpr bool is_low_surrogate(char32_t cp) noexcept
Returns whether cp is a codepoint that encodes the low part of a codepoint with a more-than-16-bit va...
Definition unicode.h:420
text_encoding consume_bom(std::span< BYTE_TYPE, N > &spn)
Consumes (see consume()) a byte order mark from the beginning of spn (a span of bytelike),...
Definition unicode.h:273
@ invalid
Represents an invalid plane number.
Type that represents a specific text encoding - a combination of ghassanpl::string_ops::text_encoding...
Definition unicode.h:51
The below code is based on Sun's libm library code, which is licensed under the following license:
Shamelessly stolen from https://github.com/arc80/plywood/.
Definition unicode.h:373
uint8_t byte_count
The number of bytes this codepoint takes up in the input string.
Definition unicode.h:386
char32_t point
The decoded codepoint (or -1 if failed)
Definition unicode.h:384
A simple view over a UTF-8 string range with codepoint values.
Definition unicode.h:975