3.5. UTF-8 utilities

The UTF8_UTILS module provides Unicode UTF-8 string utilities including character iteration, codepoint extraction, byte length calculation, and validation of UTF-8 encoded text.

All functions and symbols are in the “utf8_utils” module; use require to get access to it.

require daslib/utf8_utils

3.5.1. Constants

s_utf8d = fixed_array<uint>(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, 0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x0, 0xc, 0x18, 0x24, 0x3c, 0x60, 0x54, 0xc, 0xc, 0xc, 0x30, 0x48, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0x0, 0xc, 0xc, 0xc, 0xc, 0xc, 0x0, 0xc, 0x0, 0xc, 0xc, 0xc, 0x18, 0xc, 0xc, 0xc, 0xc, 0xc, 0x18, 0xc, 0x18, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0x18, 0xc, 0xc, 0xc, 0xc, 0xc, 0x18, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0x18, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0x24, 0xc, 0x24, 0xc, 0xc, 0xc, 0x24, 0xc, 0xc, 0xc, 0xc, 0xc, 0x24, 0xc, 0x24, 0xc, 0xc, 0xc, 0x24, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc )

s_utf8d:uint const[364]

UTF8_ACCEPT = 0x0

UTF8_ACCEPT:uint const

_UTF32_NON_WORD_RANGES = fixed_array<uint>(0x0, 0x2f, 0x3a, 0x40, 0x5b, 0x5e, 0x60, 0x60, 0x7b, 0xa9, 0xab, 0xb4, 0xb6, 0xb9, 0xbb, 0xbf, 0xd7, 0xd7, 0xf7, 0xf7, 0x2000, 0x206f, 0x2190, 0x2bff, 0x3000, 0x3004, 0x3008, 0x3011, 0x3014, 0x301f, 0xfe30, 0xfe6f, 0xff01, 0xff0f, 0xff1a, 0xff20, 0xff3b, 0xff40, 0xff5b, 0xff65 )

_UTF32_NON_WORD_RANGES:uint const[40]

_UTF32_LATIN1_LOWER = fixed_array<uint>(0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff )

_UTF32_LATIN1_LOWER:uint const[64]

3.5.2. Encoding and decoding

decode_unicode_escape (str: string) : string

utf16_to_utf32 (high: uint; low: uint) : uint

utf8_decode (source_utf8_string: array<uint8>) : array<uint>

utf8_decode (source_utf8_string: string) : array<uint>

utf8_decode (var dest_utf32_string: array<uint>; source_utf8_string: array<uint8>)

utf8_decode (var dest_utf32_string: array<uint>; source_utf8_string: string)

utf8_encode (ch: uint) : array<uint8>

utf8_encode (source_utf32_string: array<uint>) : array<uint8>

utf8_encode (var dest_array: array<uint8>; ch: uint)

utf8_encode (var dest_array: array<uint8>; source_utf32_string: array<uint>)

decode_unicode_escape(str: string ): string 

def decode_unicode_escape (str: string) : string

Arguments:

str : string

utf16_to_utf32(high: uint; low: uint ): uint 

def utf16_to_utf32 (high: uint; low: uint) : uint

Arguments:

high : uint
low : uint

3.5.2.1. utf8_decode

utf8_decode(source_utf8_string: array<uint8> ): array<uint> 

def utf8_decode (source_utf8_string: array<uint8>) : array<uint>

Arguments:

source_utf8_string : array<uint8> implicit

utf8_decode(source_utf8_string: string ): array<uint>

utf8_decode(dest_utf32_string: array<uint>; source_utf8_string: array<uint8> )

utf8_decode(dest_utf32_string: array<uint>; source_utf8_string: string )

3.5.2.2. utf8_encode

utf8_encode(ch: uint ): array<uint8> 

def utf8_encode (ch: uint) : array<uint8>

Arguments:

ch : uint

utf8_encode(source_utf32_string: array<uint> ): array<uint8>

utf8_encode(dest_array: array<uint8>; ch: uint )

utf8_encode(dest_array: array<uint8>; source_utf32_string: array<uint> )

3.5.3. Iteration

each_word (text: string; var out: array<string>)

each_word(text: string; out: array<string> )

def each_word (text: string; var out: array<string>)

Arguments:

text : string
out : array<string>

3.5.4. Length and measurement

utf8_length (utf8_string: array<uint8>) : int

utf8_length (utf8_string: string) : int

3.5.4.1. utf8_length

utf8_length(utf8_string: array<uint8> ): int 

def utf8_length (utf8_string: array<uint8>) : int

Arguments:

utf8_string : array<uint8> implicit

utf8_length(utf8_string: string ): int

3.5.5. Validation

contains_utf8_bom (utf8_string: array<uint8>) : bool

contains_utf8_bom (utf8_string: string) : bool

is_first_byte_of_utf8_char (ch: uint8) : bool

is_utf8_string_valid (utf8_string: array<uint8>) : bool

is_utf8_string_valid (utf8_string: string) : bool

3.5.5.1. contains_utf8_bom

contains_utf8_bom(utf8_string: array<uint8> ): bool 

def contains_utf8_bom (utf8_string: array<uint8>) : bool

Arguments:

utf8_string : array<uint8> implicit

contains_utf8_bom(utf8_string: string ): bool

is_first_byte_of_utf8_char(ch: uint8 ): bool 

def is_first_byte_of_utf8_char (ch: uint8) : bool

Arguments:

ch : uint8

3.5.5.2. is_utf8_string_valid

is_utf8_string_valid(utf8_string: array<uint8> ): bool 

def is_utf8_string_valid (utf8_string: array<uint8>) : bool

Arguments:

utf8_string : array<uint8> implicit

is_utf8_string_valid(utf8_string: string ): bool

3.5.6. Unicode word/case primitives

utf32_is_word_char (cp: uint) : bool

utf32_to_lower (cp: uint) : uint

utf32_is_word_char(cp: uint ): bool 

def utf32_is_word_char (cp: uint) : bool

Arguments:

cp : uint

utf32_to_lower(cp: uint ): uint 

def utf32_to_lower (cp: uint) : uint

Arguments:

cp : uint