header_utils
Loading...
Searching...
No Matches

Functions and types that operate on Unicode codepoints and strings. More...

Classes

struct  ghassanpl::string_ops::text_encoding
 Type that represents a specific text encoding - a combination of ghassanpl::string_ops::text_encoding_type and endianness. More...
 
struct  ghassanpl::string_ops::utf8_view< R >
 A simple view over an UTF8 string range with codepoint values. More...
 

Enumerations

enum class  ghassanpl::string_ops::text_encoding_type {
  unknown , utf8 , utf16 , utf32 ,
  utf7 , utf1 , utf_ebcdic , scsu ,
  bocu1 , gb18030
}
 Specifies a base text-encoding, ignoring endianness for multi-byte encodings. More...
 
enum class  ghassanpl::string_ops::unicode_plane {
  ghassanpl::string_ops::unicode_plane::invalid , basic_multilingual_plane , supplementary_multilingual_plane , supplementary_ideographic_plane ,
  tertiary_ideographic_plane , supplementary_special_purpose_plane , private_use_plane_a , private_use_plane_b ,
  bmp , smp , sip , tip ,
  ssp , spua_a , pup_a , spua_b ,
  pup_b
}
 Represents the Unicode plane. More...
 

Functions

constexpr bool ghassanpl::string_ops::is_high_surrogate (char32_t cp) noexcept
 Returns whether cp is a codepoint that encodes the high part of a codepoint with a more-than-16-bit value.
 
constexpr bool ghassanpl::string_ops::is_low_surrogate (char32_t cp) noexcept
 Returns whether cp is a codepoint that encodes the low part of a codepoint with a more-than-16-bit value.
 
constexpr bool ghassanpl::string_ops::is_surrogate (char32_t cp) noexcept
 Returns whether cp is a codepoint that encodes any part of a codepoint with a more-than-16-bit value.
 
constexpr bool ghassanpl::string_ops::is_unicode (char32_t cp) noexcept
 Returns whether cp has a value that is a valid Unicode codepoint (ie. between 0 and 0x10FFFF).
 
constexpr bool ghassanpl::string_ops::is_unicode_character (char32_t cp) noexcept
 Returns whether cp has a value that is a valid Unicode character (i.e. a value that encodes a (part of a) character).
 
constexpr char32_t ghassanpl::string_ops::surrogate_pair_to_codepoint (char32_t high, char32_t low) noexcept
 Returns the codepoint encoded by two surrogates.
 
constexpr auto ghassanpl::string_ops::get_unicode_plane (char32_t cp) noexcept -> unicode_plane
 
text_decode_result ghassanpl::string_ops::decode_codepoint (bytelike_range auto range, text_encoding encoding)
 Attempts to decode the first codepoint in bytelike range range, assuming it is encoded in encoding.
 
template<bytelike BYTE_TYPE, size_t N>
text_encoding ghassanpl::string_ops::consume_bom (std::span< BYTE_TYPE, N > &spn)
 Consumes (see consume()) a byte order mark from the beginning of spn (a span of bytelike), and returns the encoding that the BOM represents (or unknown_text_encoding if no BOM).
 
text_encoding ghassanpl::string_ops::consume_bom (string_view8 auto &sv)
 Consumes (see consume()) a byte order mark from the beginning of sv, and returns the encoding that the BOM represents (or unknown_text_encoding if no BOM).
 
text_encoding ghassanpl::string_ops::consume_bom (string_view16 auto &sv)
 Consumes (see consume()) a byte order mark from the beginning of sv, and returns the UTF-16 encoding that the BOM represents (or unknown_text_encoding if no BOM).
 
text_encoding ghassanpl::string_ops::consume_bom (string_view32 auto &sv)
 Consumes (see consume()) a byte order mark from the beginning of sv, and returns the UTF-32 encoding that the BOM represents (or unknown_text_encoding if no BOM).
 
template<bytelike_range T>
text_encoding ghassanpl::string_ops::detect_encoding (T const &range)
 Attempts to detect the encoding of a given bytelike range.
 
template<typename T >
constexpr char32_t ghassanpl::string_ops::consume_codepoint (T &str)
 Consumes a codepoint from a UTF-encoded string and returns it.
 
template<typename T >
constexpr void ghassanpl::string_ops::append_codepoint (T &str, char32_t cp)
 Appends a codepoint to a UTF-encoded string. Supports UTF-8, UTF-16 and UTF-32, decides based on char type of str.
 
template<typename TO , typename FROM >
constexpr void ghassanpl::string_ops::transcode_unicode (FROM const &from, TO &out)
 Converts a UTF-encoded string to a UTF-encoded string, of a different encoding. Decides the encodings based on the char type of TO and FROM.
 
template<typename TO , typename FROM >
constexpr TO ghassanpl::string_ops::transcode_unicode (FROM const &from)
 Converts a UTF-encoded string to a UTF-encoded string, of a different encoding. Decides the encodings based on the char type of TO and FROM.
 
template<typename T >
constexpr void ghassanpl::string_ops::transcode_codepage_to_unicode (T &dest, stringable8 auto source, std::span< char32_t const, 128 > codepage_map)
 Transcodes an Extended ASCII string source into unicode-encoded dest, according to codepage_map.
 
template<typename RESULT = std::string>
constexpr auto ghassanpl::string_ops::transcode_codepage_to_unicode (stringable8 auto source, std::span< char32_t const, 128 > codepage_map) -> RESULT
 Transcodes an Extended ASCII string source into a unicode encoding, according to codepage_map
 

Encodings

Values representing UTF encodings

constexpr text_encoding ghassanpl::string_ops::utf8_encoding
 
constexpr text_encoding ghassanpl::string_ops::utf16_le_encoding
 
constexpr text_encoding ghassanpl::string_ops::utf16_be_encoding
 
constexpr text_encoding ghassanpl::string_ops::utf32_le_encoding
 
constexpr text_encoding ghassanpl::string_ops::utf32_be_encoding
 
constexpr text_encoding ghassanpl::string_ops::unknown_text_encoding
 Represents an unknown text encoding (e.g. when an encoding could not be determined)
 

UTF-8 functions

constexpr size_t ghassanpl::string_ops::codepoint_utf8_count (char32_t cp) noexcept
 Returns the number of UTF-8 octets necessary to encode the given codepoint.
 
constexpr char32_t ghassanpl::string_ops::consume_utf8 (string_view8 auto &str)
 Consumes (see consume()) a UTF-8 codepoint from str.
 
constexpr size_t ghassanpl::string_ops::count_utf8_codepoints (stringable8 auto str)
 Returns the number of codepoints in the given UTF-8 string str
 
constexpr size_t ghassanpl::string_ops::append_utf8 (string8 auto &buffer, char32_t cp)
 Appends octets to buffer by encoding cp into UTF-8.
 
template<string8 RESULT = std::string>
constexpr RESULT ghassanpl::string_ops::to_utf8 (char32_t cp)
 Returns cp encoded as a UTF-8 string.
 
template<string8 RESULT = std::string, stringable16 STR>
constexpr RESULT ghassanpl::string_ops::to_utf8 (STR &&str)
 Returns str (a UTF-16-encoded string) encoded as a UTF-8 string.
 
std::string ghassanpl::string_ops::to_string (std::wstring_view str)
 Returns str (a UTF-16-encoded string) encoded as a UTF-8 string.
 
constexpr void ghassanpl::string_ops::transcode_codepage_to_utf8 (string8 auto &dest, stringable8 auto source, std::span< char32_t const, 128 > codepage_map)
 Transcodes an Extended ASCII string source into UTF-8 dest, according to codepage_map
 
template<string8 RESULT = std::string>
constexpr auto ghassanpl::string_ops::transcode_codepage_to_utf8 (stringable8 auto source, std::span< char32_t const, 128 > codepage_map) -> RESULT
 Transcodes an Extended ASCII string source into UTF-8, according to codepage_map
 

UTF-16 functions

constexpr char32_t ghassanpl::string_ops::consume_utf16 (string_view16 auto &str)
 Consumes (see consume()) a UTF-16 codepoint from str.
 
constexpr size_t ghassanpl::string_ops::append_utf16 (string16 auto &buffer, char32_t cp)
 Appends 16-bit values to buffer by encoding cp into UTF-16.
 
template<string16 RESULT = std::wstring>
constexpr RESULT ghassanpl::string_ops::to_utf16 (char32_t cp)
 Returns cp encoded as a UTF-16 string.
 
template<string16 RESULT = std::wstring, stringable8 STR>
constexpr RESULT ghassanpl::string_ops::to_utf16 (STR str)
 Returns str (a UTF-8-encoded string) encoded as a UTF-16 string.
 
std::wstring ghassanpl::string_ops::to_wstring (std::string_view str)
 Returns str (a UTF-8-encoded string) encoded as a UTF-16/32 string in a std::wstring (depending on the size of wchar_t)
 

UTF-32 functions

constexpr char32_t ghassanpl::string_ops::consume_utf32 (string_view32 auto &str)
 Consumes (see consume()) a UTF-32 codepoint from str.
 
constexpr size_t ghassanpl::string_ops::append_utf32 (string32 auto &buffer, char32_t cp)
 Appends 32-bit values to buffer by encoding cp into UTF-32.
 

Detailed Description

Functions and types that operate on Unicode codepoints and strings.

This code uses char32_t to represent single Unicode codepoints (as UTF-32 code units).


Class Documentation

◆ ghassanpl::string_ops::text_encoding

struct ghassanpl::string_ops::text_encoding

Type that represents a specific text encoding - a combination of ghassanpl::string_ops::text_encoding_type and endianness.

Definition at line 50 of file unicode.h.

Class Members
endian endianness
text_encoding_type type

Enumeration Type Documentation

◆ text_encoding_type

Specifies a base text-encoding, ignoring endianness for multi-byte encodings.

Definition at line 42 of file unicode.h.

◆ unicode_plane

Represents the Unicode plane.

Value equals the actual number of the unicode plane

Enumerator
invalid 

Represents an invalid plane number.

Definition at line 392 of file unicode.h.

Function Documentation

◆ append_codepoint()

template<typename T >
constexpr void ghassanpl::string_ops::append_codepoint ( T &  str,
char32_t  cp 
)
constexpr

Appends a codepoint to a UTF-encoded string. Supports UTF-8, UTF-16 and UTF-32, decides based on char type of str.

Definition at line 218 of file unicode.h.

◆ append_utf16()

constexpr size_t ghassanpl::string_ops::append_utf16 ( string16 auto &  buffer,
char32_t  cp 
)
constexpr

Appends 16-bit values to buffer by encoding cp into UTF-16.

Returns
the number of 16-bit code units appended

Definition at line 865 of file unicode.h.

◆ append_utf32()

constexpr size_t ghassanpl::string_ops::append_utf32 ( string32 auto &  buffer,
char32_t  cp 
)
constexpr

Appends 32-bit values to buffer by encoding cp into UTF-32.

Returns
the number of 32-bit code units appended

Definition at line 879 of file unicode.h.

◆ append_utf8()

constexpr size_t ghassanpl::string_ops::append_utf8 ( string8 auto &  buffer,
char32_t  cp 
)
constexpr

Appends octets to buffer by encoding cp into UTF-8.

Definition at line 738 of file unicode.h.

◆ codepoint_utf8_count()

constexpr size_t ghassanpl::string_ops::codepoint_utf8_count ( char32_t  cp)
constexpr noexcept

Returns the number of UTF-8 octets necessary to encode the given codepoint.

Definition at line 449 of file unicode.h.

◆ consume_bom() [1/4]

template<bytelike BYTE_TYPE, size_t N>
text_encoding ghassanpl::string_ops::consume_bom ( std::span< BYTE_TYPE, N > &  spn)
inline

Consumes (see consume()) a byte order mark from the beginning of spn (a span of bytelike), and returns the encoding that the BOM represents (or unknown_text_encoding if no BOM).

Definition at line 273 of file unicode.h.

◆ consume_bom() [2/4]

text_encoding ghassanpl::string_ops::consume_bom ( string_view16 auto &  sv)
inline

Consumes (see consume()) a byte order mark from the beginning of sv, and returns the UTF-16 encoding that the BOM represents (or unknown_text_encoding if no BOM).

Definition at line 341 of file unicode.h.

◆ consume_bom() [3/4]

text_encoding ghassanpl::string_ops::consume_bom ( string_view32 auto &  sv)
inline

Consumes (see consume()) a byte order mark from the beginning of sv, and returns the UTF-32 encoding that the BOM represents (or unknown_text_encoding if no BOM).

Definition at line 356 of file unicode.h.

◆ consume_bom() [4/4]

text_encoding ghassanpl::string_ops::consume_bom ( string_view8 auto &  sv)
inline

Consumes (see consume()) a byte order mark from the beginning of sv, and returns the encoding that the BOM represents (or unknown_text_encoding if no BOM).

Definition at line 308 of file unicode.h.

◆ consume_codepoint()

template<typename T >
constexpr char32_t ghassanpl::string_ops::consume_codepoint ( T &  str)
constexpr

Consumes a codepoint from a UTF-encoded string and returns it.

Definition at line 201 of file unicode.h.

◆ consume_utf16()

constexpr char32_t ghassanpl::string_ops::consume_utf16 ( string_view16 auto &  str)
constexpr

Consumes (see consume()) a UTF-16 codepoint from str.

Precondition
str must be valid UTF-16

Definition at line 832 of file unicode.h.

◆ consume_utf32()

constexpr char32_t ghassanpl::string_ops::consume_utf32 ( string_view32 auto &  str)
constexpr

Consumes (see consume()) a UTF-32 codepoint from str.

Precondition
str must be valid UTF-32

Definition at line 852 of file unicode.h.

◆ consume_utf8()

constexpr char32_t ghassanpl::string_ops::consume_utf8 ( string_view8 auto &  str)
constexpr

Consumes (see consume()) a UTF-8 codepoint from str.

Precondition
str must be valid UTF-8

Definition at line 674 of file unicode.h.

◆ count_utf8_codepoints()

constexpr size_t ghassanpl::string_ops::count_utf8_codepoints ( stringable8 auto  str)
constexpr

Returns the number of codepoints in the given UTF-8 string str

Precondition
str must be valid UTF-8

Definition at line 708 of file unicode.h.

◆ decode_codepoint()

text_decode_result ghassanpl::string_ops::decode_codepoint ( bytelike_range auto  range,
text_encoding  encoding 
)
inline

Attempts to decode the first codepoint in bytelike range range, assuming it is encoded in encoding.

Definition at line 455 of file unicode.h.

◆ detect_encoding()

template<bytelike_range T>
text_encoding ghassanpl::string_ops::detect_encoding ( T const &  range)
inline

Attempts to detect the encoding of a given bytelike range.

Note
If no BOM is present, only detects UTF encodings.

Definition at line 534 of file unicode.h.

◆ get_unicode_plane()

constexpr auto ghassanpl::string_ops::get_unicode_plane ( char32_t  cp) -> unicode_plane
constexpr noexcept

Definition at line 423 of file unicode.h.

◆ is_high_surrogate()

constexpr bool ghassanpl::string_ops::is_high_surrogate ( char32_t  cp)
constexpr noexcept

Returns whether cp is a codepoint that encodes the high part of a codepoint with a more-than-16-bit value.

Definition at line 419 of file unicode.h.

◆ is_low_surrogate()

constexpr bool ghassanpl::string_ops::is_low_surrogate ( char32_t  cp)
constexpr noexcept

Returns whether cp is a codepoint that encodes the low part of a codepoint with a more-than-16-bit value.

Definition at line 420 of file unicode.h.

◆ is_surrogate()

constexpr bool ghassanpl::string_ops::is_surrogate ( char32_t  cp)
constexpr noexcept

Returns whether cp is a codepoint that encodes any part of a codepoint with a more-than-16-bit value.

Definition at line 421 of file unicode.h.

◆ is_unicode()

constexpr bool ghassanpl::string_ops::is_unicode ( char32_t  cp)
constexpr noexcept

Returns whether cp has a value that is a valid Unicode codepoint (ie. between 0 and 0x10FFFF).

Definition at line 422 of file unicode.h.

◆ is_unicode_character()

constexpr bool ghassanpl::string_ops::is_unicode_character ( char32_t  cp)
constexpr noexcept

Returns whether cp has a value that is a valid Unicode character (i.e. a value that encodes a (part of a) character). Specifically, byte order marks are not characters, but surrogates technically are part of a character.

Definition at line 424 of file unicode.h.

◆ surrogate_pair_to_codepoint()

constexpr char32_t ghassanpl::string_ops::surrogate_pair_to_codepoint ( char32_t  high,
char32_t  low 
)
constexpr noexcept

Returns the codepoint encoded by two surrogates.

Definition at line 426 of file unicode.h.

◆ to_string()

std::string ghassanpl::string_ops::to_string ( std::wstring_view  str)
inline

Returns str (a UTF-16-encoded string) encoded as a UTF-8 string.

Precondition
str must be valid UTF-16

Definition at line 938 of file unicode.h.

◆ to_utf16() [1/2]

template<string16 RESULT = std::wstring>
constexpr RESULT ghassanpl::string_ops::to_utf16 ( char32_t  cp)
constexpr

Returns cp encoded as a UTF-16 string.

Template Parameters
RESULT: the type of string to return (std::wstring by default)
Precondition
cp must be a valid Unicode codepoint

Definition at line 949 of file unicode.h.

◆ to_utf16() [2/2]

template<string16 RESULT = std::wstring, stringable8 STR>
constexpr RESULT ghassanpl::string_ops::to_utf16 ( STR  str)
constexpr

Returns str (a UTF-8-encoded string) encoded as a UTF-16 string.

Template Parameters
RESULT: the type of string to return (std::wstring by default)
Precondition
str must be valid UTF-8

Definition at line 959 of file unicode.h.

◆ to_utf8() [1/2]

template<string8 RESULT = std::string>
constexpr RESULT ghassanpl::string_ops::to_utf8 ( char32_t  cp)
constexpr

Returns cp encoded as a UTF-8 string.

Template Parameters
RESULT: the type of string to return (std::string by default)
Precondition
cp must be a valid Unicode codepoint

Definition at line 891 of file unicode.h.

◆ to_utf8() [2/2]

template<string8 RESULT = std::string, stringable16 STR>
constexpr RESULT ghassanpl::string_ops::to_utf8 ( STR &&  str)
constexpr

Returns str (a UTF-16-encoded string) encoded as a UTF-8 string.

Returns str (a UTF-32-encoded string) encoded as a UTF-8 string.

Template Parameters
RESULT: the type of string to return (std::string by default)
Precondition
str must be valid UTF-16
Template Parameters
RESULT: the type of string to return (std::string by default)
Precondition
str must be valid UTF-32
str must be valid UTF-16

Definition at line 905 of file unicode.h.

◆ to_wstring()

std::wstring ghassanpl::string_ops::to_wstring ( std::string_view  str)
inline

Returns str (a UTF-8-encoded string) encoded as a UTF-16/32 string in a std::wstring (depending on the size of wchar_t)

Precondition
str must be valid UTF-8

Definition at line 968 of file unicode.h.

◆ transcode_codepage_to_unicode() [1/2]

template<typename RESULT = std::string>
constexpr auto ghassanpl::string_ops::transcode_codepage_to_unicode ( stringable8 auto  source,
std::span< char32_t const, 128 >  codepage_map 
) -> RESULT
constexpr

Transcodes an Extended ASCII string source into a unicode encoding, according to codepage_map

Template Parameters
RESULT: the type of string to return (std::string by default)
Parameters
codepage_map: A span of 128 Unicode codepoints that will be substituted for EASCII values 128-255
Returns
a Unicode-encoded string of type RESULT; encoding is decided based on the char type of RESULT

Definition at line 812 of file unicode.h.

◆ transcode_codepage_to_unicode() [2/2]

template<typename T >
constexpr void ghassanpl::string_ops::transcode_codepage_to_unicode ( T &  dest,
stringable8 auto  source,
std::span< char32_t const, 128 >  codepage_map 
)
constexpr

Transcodes an Extended ASCII string source into unicode-encoded dest, according to codepage_map.

Destination encoding will be decided based on the char type of dest.

Parameters
codepage_map: A span of 128 Unicode codepoints that will be substituted for EASCII values 128-255

Definition at line 799 of file unicode.h.

◆ transcode_codepage_to_utf8() [1/2]

constexpr void ghassanpl::string_ops::transcode_codepage_to_utf8 ( string8 auto &  dest,
stringable8 auto  source,
std::span< char32_t const, 128 >  codepage_map 
)
constexpr

Transcodes an Extended ASCII string source into UTF-8 dest, according to codepage_map

Parameters
codepage_map: A span of 128 Unicode codepoints that will be substituted for EASCII values 128-255. TODO: Is this needed since we have transcode_codepage_to_unicode?

Definition at line 786 of file unicode.h.

◆ transcode_codepage_to_utf8() [2/2]

template<string8 RESULT = std::string>
constexpr auto ghassanpl::string_ops::transcode_codepage_to_utf8 ( stringable8 auto  source,
std::span< char32_t const, 128 >  codepage_map 
) -> RESULT
constexpr

Transcodes an Extended ASCII string source into UTF-8, according to codepage_map

Template Parameters
RESULT: the type of string to return (std::string by default)
Parameters
codepage_map: A span of 128 Unicode codepoints that will be substituted for EASCII values 128-255
Returns
a UTF-8-encoded string of type RESULT. TODO: Is this needed since we have transcode_codepage_to_unicode?

Definition at line 820 of file unicode.h.

◆ transcode_unicode() [1/2]

template<typename TO , typename FROM >
constexpr TO ghassanpl::string_ops::transcode_unicode ( FROM const &  from)
constexpr

Converts a UTF-encoded string to a UTF-encoded string, of a different encoding. Decides the encodings based on the char type of TO and FROM.

Definition at line 241 of file unicode.h.

◆ transcode_unicode() [2/2]

template<typename TO , typename FROM >
constexpr void ghassanpl::string_ops::transcode_unicode ( FROM const &  from,
TO &  out 
)
constexpr

Converts a UTF-encoded string to a UTF-encoded string, of a different encoding. Decides the encodings based on the char type of TO and FROM.

Definition at line 232 of file unicode.h.

Variable Documentation

◆ unknown_text_encoding

constexpr text_encoding ghassanpl::string_ops::unknown_text_encoding
inline constexpr

Represents an unknown text encoding (e.g. when an encoding could not be determined)

Definition at line 67 of file unicode.h.

◆ utf16_be_encoding

constexpr text_encoding ghassanpl::string_ops::utf16_be_encoding
inline constexpr

Definition at line 63 of file unicode.h.

◆ utf16_le_encoding

constexpr text_encoding ghassanpl::string_ops::utf16_le_encoding
inline constexpr

Definition at line 62 of file unicode.h.

◆ utf32_be_encoding

constexpr text_encoding ghassanpl::string_ops::utf32_be_encoding
inline constexpr

Definition at line 65 of file unicode.h.

◆ utf32_le_encoding

constexpr text_encoding ghassanpl::string_ops::utf32_le_encoding
inline constexpr

Definition at line 64 of file unicode.h.

◆ utf8_encoding

constexpr text_encoding ghassanpl::string_ops::utf8_encoding
inline constexpr

Definition at line 61 of file unicode.h.