Pre Tokenizer#
-
class PreTokenizer#
Base class for pretokenizer steps.
Subclassed by trt_edgellm::tokenizer::RegexSplit, trt_edgellm::tokenizer::Sequence
Public Functions
-
virtual ~PreTokenizer() = default#
- virtual std::vector<std::string> process(std::string const &text)
Process text and return split pieces.
- Parameters:
text – Input text to process
- Returns:
Vector of text pieces after processing
-
virtual std::string getTypeName() const = 0#
Get the type name of this step.
- Returns:
String identifying the step type
-
virtual ~PreTokenizer() = default#
-
class RegexSplit : public trt_edgellm::tokenizer::PreTokenizer#
RegexSplit step that splits text using a regex pattern.
Public Functions
-
explicit RegexSplit(std::string const &pattern)#
Constructor with regex pattern.
- Parameters:
pattern – Regex pattern for splitting text
-
~RegexSplit() override = default#
- virtual std::vector<std::string> process(std::string const &text)
Process text and return split pieces.
- Parameters:
text – Input text to process
- Returns:
Vector of text pieces after processing
-
inline virtual std::string getTypeName() const override#
Get the type name of this step.
- Returns:
String identifying the step type
-
inline std::string const &getPattern() const noexcept#
Get the regex pattern.
- Returns:
Reference to the pattern string
-
explicit RegexSplit(std::string const &pattern)#
-
class Sequence : public trt_edgellm::tokenizer::PreTokenizer#
PreTokenizer class for splitting text before main tokenization. Now supports a sequence of processing steps.
Public Functions
-
Sequence() = default#
Default constructor - creates empty sequence (acts as pass-through)
- explicit Sequence(std::vector<std::unique_ptr<PreTokenizer>> steps)
Constructor with sequence of pretokenizer steps.
- Parameters:
steps – Vector of pretokenizer steps to apply in order
-
~Sequence() = default#
- virtual std::vector<std::string> process(std::string const &text)
Process text and return split pieces.
- Parameters:
text – Input text to process
- Returns:
Vector of text pieces after processing
-
inline virtual std::string getTypeName() const override#
Get the type name of this step.
- Returns:
String identifying the step type
-
void addStep(std::unique_ptr<PreTokenizer> step)#
Add a processing step to the sequence.
- Parameters:
step – Unique pointer to the step to add
-
inline size_t getStepCount() const noexcept#
Get the number of processing steps.
- Returns:
Number of steps in the sequence
-
PreTokenizer const *getStep(size_t index) const noexcept#
Get step at specified index.
- Parameters:
index – Index of the step
- Returns:
Pointer to the step, or nullptr if index is invalid
-
Sequence() = default#