Pre Tokenizer#
-
class PreTokenizer#
Base class for pretokenizer steps.
Subclassed by trt_edgellm::tokenizer::RegexSplit, trt_edgellm::tokenizer::Sequence
Public Functions
-
virtual ~PreTokenizer() noexcept = default#
- virtual std::vector<std::string> process(std::string const &text)
Process text and return split pieces.
- Parameters:
text – Input text to process
- Returns:
Vector of text pieces after processing
-
virtual std::string getTypeName() const = 0#
Get the type name of this step.
- Returns:
String identifying the step type
-
virtual ~PreTokenizer() noexcept = default#
-
class RegexSplit : public trt_edgellm::tokenizer::PreTokenizer#
RegexSplit step that splits text using a regex pattern.
Public Functions
-
explicit RegexSplit(std::string const &pattern)#
Constructor with regex pattern.
- Parameters:
pattern – Regex pattern for splitting text
- Throws:
std::invalid_argument – if the pattern is empty
std::runtime_error – if the pattern is invalid
-
~RegexSplit() override = default#
- virtual std::vector<std::string> process(std::string const &text)
Process text and return split pieces.
- Parameters:
text – Input text to process
- Throws:
std::runtime_error – if the text is too large for regex processing
std::runtime_error – if Unicode text collapse fails
std::runtime_error – if Unicode regex split fails
- Returns:
Vector of text pieces after processing
-
inline virtual std::string getTypeName() const override#
Get the type name of this step.
- Returns:
String identifying the step type
-
inline std::string const &getPattern() const noexcept#
Get the regex pattern.
- Returns:
Reference to the pattern string
-
explicit RegexSplit(std::string const &pattern)#
-
class Sequence : public trt_edgellm::tokenizer::PreTokenizer#
PreTokenizer class for splitting text before main tokenization. Now supports a sequence of processing steps.
Public Functions
-
Sequence() noexcept = default#
Default constructor: creates an empty sequence (acts as a pass-through).
- explicit Sequence(std::vector<std::unique_ptr<PreTokenizer>> steps)
Constructor with sequence of pretokenizer steps.
- Parameters:
steps – Vector of pretokenizer steps to apply in order
-
~Sequence() noexcept = default#
- virtual std::vector<std::string> process(std::string const &text)
Process text and return split pieces.
- Parameters:
text – Input text to process
- Throws:
std::runtime_error – if the text is too large for processing
- Returns:
Vector of text pieces after processing
-
inline virtual std::string getTypeName() const override#
Get the type name of this step.
- Returns:
String identifying the step type
-
void addStep(std::unique_ptr<PreTokenizer> step)#
Add a processing step to the sequence.
- Parameters:
step – Unique pointer to the step to add
- Throws:
std::invalid_argument – if the step is null
-
inline size_t getStepCount() const noexcept#
Get the number of processing steps.
- Returns:
Number of steps in the sequence
-
PreTokenizer const *getStep(size_t index) const noexcept#
Get step at specified index.
- Parameters:
index – Index of the step
- Returns:
Pointer to the step, or nullptr if index is invalid
-
Sequence() noexcept = default#