Pre Tokenizer#

class PreTokenizer#

Base class for pretokenizer steps.

Subclassed by trt_edgellm::tokenizer::RegexSplit, trt_edgellm::tokenizer::Sequence

Public Functions

virtual ~PreTokenizer() = default#
virtual std::vector<std::string> process(
std::string const &text
) const = 0#

Process text and return split pieces.

Parameters:

text – Input text to process

Returns:

Vector of text pieces after processing

virtual std::string getTypeName() const = 0#

Get the type name of this step.

Returns:

String identifying the step type

class RegexSplit : public trt_edgellm::tokenizer::PreTokenizer#

RegexSplit step that splits text using a regex pattern.

Public Functions

explicit RegexSplit(std::string const &pattern)#

Constructor with regex pattern.

Parameters:

pattern – Regex pattern for splitting text

~RegexSplit() override = default#
virtual std::vector<std::string> process(
std::string const &text
) const override#

Process text and return split pieces.

Parameters:

text – Input text to process

Returns:

Vector of text pieces after processing

inline virtual std::string getTypeName() const override#

Get the type name of this step.

Returns:

String identifying the step type

inline std::string const &getPattern() const noexcept#

Get the regex pattern.

Returns:

Reference to the pattern string

class Sequence : public trt_edgellm::tokenizer::PreTokenizer#

PreTokenizer class for splitting text before main tokenization. Now supports a sequence of processing steps.

Public Functions

Sequence() = default#

Default constructor - creates an empty sequence (acts as a pass-through).

explicit Sequence(
std::vector<std::unique_ptr<PreTokenizer>> steps
)#

Constructor with sequence of pretokenizer steps.

Parameters:

steps – Vector of pretokenizer steps to apply in order

~Sequence() = default#
virtual std::vector<std::string> process(
std::string const &text
) const override#

Process text and return split pieces.

Parameters:

text – Input text to process

Returns:

Vector of text pieces after processing

inline virtual std::string getTypeName() const override#

Get the type name of this step.

Returns:

String identifying the step type

void addStep(std::unique_ptr<PreTokenizer> step)#

Add a processing step to the sequence.

Parameters:

step – Unique pointer to the step to add

inline size_t getStepCount() const noexcept#

Get the number of processing steps.

Returns:

Number of steps in the sequence

PreTokenizer const *getStep(size_t index) const noexcept#

Get the step at the specified index.

Parameters:

index – Index of the step

Returns:

Pointer to the step, or nullptr if index is invalid