Pre Tokenizer#

class PreTokenizer#

Base class for pretokenizer steps.

Subclassed by trt_edgellm::tokenizer::RegexSplit, trt_edgellm::tokenizer::Sequence

Public Functions

virtual ~PreTokenizer() noexcept = default#

virtual std::vector<std::string> process( std::string const &text ) const = 0#

Process text and return split pieces.

Parameters: text – Input text to process
Returns: Vector of text pieces after processing

virtual std::string getTypeName() const = 0#

Get the type name of this step.

Returns: String identifying the step type

class RegexSplit : public trt_edgellm::tokenizer::PreTokenizer #

RegexSplit step that splits text using a regex pattern.

Public Functions

explicit RegexSplit(std::string const &pattern)#

Constructor with regex pattern.

Parameters:

pattern – Regex pattern for splitting text

Throws:

std::invalid_argument – if the pattern is empty
std::runtime_error – if the pattern is invalid

~RegexSplit() override = default#

virtual std::vector<std::string> process( std::string const &text ) const override#

Process text and return split pieces.

Parameters:

text – Input text to process

Throws:

std::runtime_error – if the text is too large for regex processing
std::runtime_error – if Unicode text collapse fails
std::runtime_error – if Unicode regex split fails

Returns:

Vector of text pieces after processing

inline virtual std::string getTypeName() const override#

Get the type name of this step.

Returns: String identifying the step type

inline std::string const &getPattern() const noexcept#

Get the regex pattern.

Returns: Reference to the pattern string

class Sequence : public trt_edgellm::tokenizer::PreTokenizer #

PreTokenizer class for splitting text before main tokenization. Now supports a sequence of processing steps.

Public Functions

Sequence() noexcept = default#

Default constructor – creates an empty sequence (acts as a pass-through).

explicit Sequence( std::vector<std::unique_ptr<PreTokenizer>> steps ) noexcept#

Constructor with sequence of pretokenizer steps.

Parameters: steps – Vector of pretokenizer steps to apply in order

~Sequence() noexcept = default#

virtual std::vector<std::string> process( std::string const &text ) const override#

Process text and return split pieces.

Parameters: text – Input text to process
Throws: std::runtime_error – if the text is too large for processing
Returns: Vector of text pieces after processing

inline virtual std::string getTypeName() const override#

Get the type name of this step.

Returns: String identifying the step type

void addStep(std::unique_ptr<PreTokenizer> step)#

Add a processing step to the sequence.

Parameters: step – Unique pointer to the step to add
Throws: std::invalid_argument – if the step is null

inline size_t getStepCount() const noexcept#

Get the number of processing steps.

Returns: Number of steps in the sequence

PreTokenizer const *getStep(size_t index) const noexcept#

Get step at specified index.

Parameters: index – Index of the step
Returns: Pointer to the step, or nullptr if index is invalid