Tokenizer Utils#
-
struct codepointFlags#
Unicode codepoint flags.
Bitfield structure for Unicode codepoint properties.
Public Types
-
enum CategoryFlags#
Category flag constants.
Values:
-
enumerator UNDEFINED = 0x0001#
Undefined category.
-
enumerator NUMBER = 0x0002#
Number category (\p{N})
-
enumerator LETTER = 0x0004#
Letter category (\p{L})
-
enumerator SEPARATOR = 0x0008#
Separator category (\p{Z})
-
enumerator ACCENT_MARK = 0x0010#
Accent mark category (\p{M})
-
enumerator PUNCTUATION = 0x0020#
Punctuation category (\p{P})
-
enumerator SYMBOL = 0x0040#
Symbol category (\p{S})
-
enumerator CONTROL = 0x0080#
Control character category (\p{C})
-
enumerator MASK_CATEGORIES = 0x00FF#
Mask covering all category flag bits (0x0001 through 0x0080).
-
enumerator UNDEFINED = 0x0001#
Public Functions
-
inline codepointFlags(uint16_t const flags = 0)#
Construct from a raw uint16_t flags value.
- Parameters:
flags – Raw 16-bit flag value (defaults to 0)
-
inline uint16_t asUint() const#
Convert the flags to a uint16_t value.
- Returns:
The flags as a uint16_t
-
inline uint16_t categoryFlag() const#
Get category flag.
- Returns:
Category flag value
Public Members
-
uint16_t isUndefined#
Is undefined.
-
uint16_t isNumber#
Is number (\p{N})
-
uint16_t isLetter#
Is letter (\p{L})
-
uint16_t isSeparator#
Is separator (\p{Z})
-
uint16_t isAccentMark#
Is accent mark (\p{M})
-
uint16_t isPunctuation#
Is punctuation (\p{P})
-
uint16_t isSymbol#
Is symbol (\p{S})
-
uint16_t isControl#
Is control character (\p{C})
-
uint16_t isWhitespace#
Is whitespace (\s)
-
uint16_t isLowercase#
Is lowercase.
-
uint16_t isUppercase#
Is uppercase.
-
uint16_t isNfd#
Has an NFD (Unicode Normalization Form D) form.
-
enum CategoryFlags#
- bool trt_edgellm::tokenizer::validateFileSize(
- std::filesystem::path const &filePath,
- size_t maxSizeBytes
- )#
Validate file size before reading.
- Parameters:
filePath – Path to the file to check
maxSizeBytes – Maximum allowed file size in bytes
- Returns:
true if the file exists, its size can be determined, and the size is within the limit; false if the file does not exist, its size cannot be determined, or the size exceeds the limit