@dataclass(slots=True, kw_only=True)
class GuidedDecodingParams:
    """Guided decoding parameters for text generation. At most one of the fields may be set.

    Args:
        json (str, BaseModel, dict, optional): The generated text is amenable to JSON format
            with additional user-specified restrictions, namely the schema. Defaults to None.
        regex (str, optional): The generated text is amenable to the user-specified regular
            expression. Defaults to None.
        grammar (str, optional): The generated text is amenable to the user-specified extended
            Backus-Naur form (EBNF) grammar. Defaults to None.
        json_object (bool): If True, the generated text is amenable to JSON format. Defaults to False.
    """
    json: Optional[Union[str, BaseModel, dict]] = None
    regex: Optional[str] = None
    grammar: Optional[str] = None
    json_object: bool = False

    @property
    def num_guides(self):
        num_guides = 0
        for field in fields(self):
            num_guides += bool(getattr(self, field.name))
        return num_guides

    def _validate(self):
        if (num_guides := self.num_guides) > 1:
            raise ValueError(
                f"Only one guide can be used for a request, but got {num_guides}.")
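
# Illustrative usage sketch (editor's addition, not part of the library): at most one
# guide may be set on a GuidedDecodingParams instance; _validate() raises if more than
# one is given. The helper function below is hypothetical and exists only for illustration.
def _guided_decoding_params_example():
    # Constrain generation to a JSON document matching a user-supplied schema.
    params = GuidedDecodingParams(json='{"type": "object"}')
    assert params.num_guides == 1
    params._validate()  # passes: exactly one guide is set

    # Setting two guides at once is rejected.
    conflicting = GuidedDecodingParams(regex=r"\d+", json_object=True)
    try:
        conflicting._validate()
    except ValueError as e:
        print(e)  # "Only one guide can be used for a request, but got 2."
    return params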
@dataclass(slots=True, kw_only=True)
class AdditionalModelOutput:
    """An additional output to gather from the model.

    Args:
        name (str): The name of the additional output to gather from the model.
        gather_context (bool): A value indicating whether or not to gather the additional
            output from the context too. Defaults to False.
    """
    name: str
    gather_context: bool
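
# Illustrative usage sketch (editor's addition, not part of the library): request an extra
# named tensor from the model. The output name used here is a hypothetical placeholder;
# valid names depend on the deployed model and runtime.
def _additional_model_output_example():
    extra = AdditionalModelOutput(name="hidden_states", gather_context=True)
    # Typically passed as SamplingParams(additional_model_outputs=[extra]).
    return [extra]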
@dataclass(slots=True, kw_only=True)
class SamplingParams:
    """Sampling parameters for text generation.

    Args:
        end_id (int, optional): The end token id. Defaults to None.
        pad_id (int, optional): The pad token id. Defaults to None.
        max_tokens (int): The maximum number of tokens to generate. Defaults to 32.
        max_new_tokens (int, optional): The maximum number of tokens to generate. This argument is being deprecated; please use max_tokens instead. Defaults to None.
        bad (str, List[str], optional): A string or a list of strings that redirect the generation when they are generated, so that the bad strings are excluded from the returned output. Defaults to None.
        bad_token_ids (List[int], optional): A list of token ids that redirect the generation when they are generated, so that the bad ids are excluded from the returned output. Defaults to None.
        stop (str, List[str], optional): A string or a list of strings that stop the generation when they are generated. The returned output will not contain the stop strings unless include_stop_str_in_output is True. Defaults to None.
        stop_token_ids (List[int], optional): A list of token ids that stop the generation when they are generated. Defaults to None.
        include_stop_str_in_output (bool): Whether to include the stop strings in the output text. Defaults to False.
        embedding_bias (torch.Tensor, optional): The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]. Defaults to None.
        external_draft_tokens_config (ExternalDraftTokensConfig, optional): The speculative decoding configuration. Defaults to None.
        logits_post_processor_name (str, optional): The logits postprocessor name. Must correspond to one of the logits postprocessor names provided to the ExecutorConfig. Defaults to None.
        n (int): The number of sequences to generate. Defaults to 1.
        best_of (int, optional): The number of sequences to consider for best output. Defaults to None.
        use_beam_search (bool): Whether to use beam search. Defaults to False.
        beam_width (int): The beam width. Setting 1 disables beam search. This parameter will be deprecated from the LLM API in a future release; please use n/best_of/use_beam_search instead. Defaults to 1.
        num_return_sequences (int, optional): The number of sequences to return. If set to None, it defaults to the value of beam_width. This parameter will be deprecated from the LLM API in a future release; please use n/best_of/use_beam_search instead. Defaults to None.
        top_k (int): Controls the number of logits to sample from. Default is 0 (all logits).
        top_p (float): Controls the top-P probability to sample from. Default is 0.0.
        top_p_min (float): Controls decay in the top-P algorithm; top_p_min is the lower bound. Default is 1e-6.
        top_p_reset_ids (int): Controls decay in the top-P algorithm; indicates where to reset the decay. Default is 1.
        top_p_decay (float): Controls decay in the top-P algorithm; the decay value. Default is 1.0.
        seed (int): Controls the random seed used by the random number generator in sampling.
        random_seed (int): Controls the random seed used by the random number generator in sampling. This argument is being deprecated; please use seed instead.
        temperature (float): Controls the modulation of logits when sampling new tokens. It can have values > 0.0. Default is 1.0.
        min_tokens (int): Lower bound on the number of tokens to generate. Values < 1 have no effect. Default is 1.
        min_length (int): Lower bound on the number of tokens to generate. Values < 1 have no effect. This argument is being deprecated; please use min_tokens instead. Default is 1.
        beam_search_diversity_rate (float): Controls the diversity in beam search.
        repetition_penalty (float): Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.0. Values < 1.0 encourage repetition, values > 1.0 discourage it. Default is 1.0.
        presence_penalty (float): Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any value. Values < 0.0 encourage repetition, values > 0.0 discourage it. Default is 0.0.
        frequency_penalty (float): Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any value. Values < 0.0 encourage repetition, values > 0.0 discourage it. Default is 0.0.
        length_penalty (float): Controls how to penalize longer sequences in beam search. Default is 0.0.
        early_stopping (int): Controls whether the generation process finishes once beam_width sentences are generated (each ending with end_token).
        no_repeat_ngram_size (int): Controls how many repeat ngram sizes are acceptable. Default is 1 << 30.
        return_log_probs (bool): Controls if the Result should contain log probabilities. Default is False.
        return_context_logits (bool): Controls if the Result should contain the context logits. Default is False.
        return_generation_logits (bool): Controls if the Result should contain the generation logits. Default is False.
        exclude_input_from_output (bool): Controls if output tokens in the Result should include the input tokens. Default is True.
        return_encoder_output (bool): Controls if the Result should contain encoder output hidden states (for encoder-only and encoder-decoder models). Default is False.
        return_perf_metrics (bool): Controls if the Result should contain the performance metrics for this request. Default is False.
        additional_model_outputs (list[AdditionalModelOutput], optional): The additional outputs to gather from the model. Defaults to None.
        lookahead_config (LookaheadDecodingConfig, optional): Lookahead decoding config. Defaults to None.
        guided_decoding (GuidedDecodingParams, optional): Guided decoding params. Defaults to None.
        ignore_eos (bool): Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. Defaults to False.
        detokenize (bool): Whether to detokenize the output. Defaults to True.
        add_special_tokens (bool): Whether to add special tokens to the prompt. Defaults to True.
        truncate_prompt_tokens (int, optional): If set to an integer k, will use only the last k tokens from the prompt (i.e., left truncation). Defaults to None.
        skip_special_tokens (bool): Whether to skip special tokens in the output. Defaults to True.
        spaces_between_special_tokens (bool): Whether to add spaces between special tokens in the output. Defaults to True.
"""# [TO DEVELOPER] This class provides an interface to LLMAPI users.# Internally, it manages and dispatches fields to Python bindings of C++ objects, currently including:# (1) all fields of tllme.SamplingConfig;# (2) all fields of tllme.OutputConfig;# (3) some fields of tllme.Request.# If you changed the implementation of C++ objects and corresponding Python bindings, please update:# (1) the fields and corresponding docstring of this class, and# (2) the expected_fields defined in _get_xxx_config methods.end_id:Optional[int]=Nonepad_id:Optional[int]=Nonemax_tokens:int=32max_new_tokens:Optional[int]=Nonebad:Optional[Union[str,List[str]]]=Nonebad_token_ids:Optional[List[int]]=None_bad_word_ids:Optional[List[List[int]]]=field(default=None,init=False,repr=False)stop:Optional[Union[str,List[str]]]=Nonestop_token_ids:Optional[List[int]]=Noneinclude_stop_str_in_output:bool=False_stop_word_ids:Optional[List[List[int]]]=field(default=None,init=False,repr=False)embedding_bias:Optional[torch.Tensor]=Noneexternal_draft_tokens_config:Optional[tllme.ExternalDraftTokensConfig]=Nonelogits_post_processor_name:Optional[str]=Nonen:int=1best_of:Optional[int]=Noneuse_beam_search:bool=False# Keep the below fields in sync with tllme.SamplingConfig or maintin the mapping table.beam_width:int=1num_return_sequences:Optional[int]=Nonetop_k:Optional[int]=Nonetop_p:Optional[float]=Nonetop_p_min:Optional[float]=Nonetop_p_reset_ids:Optional[int]=Nonetop_p_decay:Optional[float]=Noneseed:Optional[int]=Nonerandom_seed:Optional[int]=Nonetemperature:Optional[float]=Nonemin_tokens:Optional[int]=Nonemin_length:Optional[int]=Nonebeam_search_diversity_rate:Optional[float]=Nonerepetition_penalty:Optional[float]=Nonepresence_penalty:Optional[float]=Nonefrequency_penalty:Optional[float]=Nonelength_penalty:Optional[float]=Noneearly_stopping:Optional[int]=Noneno_repeat_ngram_size:Optional[int]=None# Keep the below fields in sync with tllme.OutputConfigreturn_log_probs:bool=Falsereturn_context_logits:bool=Falsereturn_generation_logits:bool=Falseexclude_input_from_output:bool=Truereturn_encoder_output:bool=Falsereturn_perf_metrics:bool=Falseadditional_model_outputs:Optional[list[AdditionalModelOutput]]=None# Lookahead decoding configlookahead_config:Optional[tllme.LookaheadDecodingConfig]=None# Guided decoding paramsguided_decoding:Optional[GuidedDecodingParams]=None# Tokenizer-related configsignore_eos:bool=Falsedetokenize:bool=Trueadd_special_tokens:bool=Truetruncate_prompt_tokens:Optional[int]=Noneskip_special_tokens:bool=Truespaces_between_special_tokens:bool=Truedef__post_init__(self):ifself.pad_idisNone:self.pad_id=self.end_id# Handle the compatibility between OpenAI and HF style-parameters.hf_style=self.beam_width>1orself.num_return_sequencesopenai_style=self.n>1orself.best_oforself.use_beam_searchifhf_styleandopenai_style:ambiguous_params={'beam_width':self.beam_width,'num_return_sequences':self.num_return_sequences,'n':self.n,'best_of':self.best_of,'use_beam_search':self.use_beam_search,}raiseValueError('Got ambiguous parameters. Please specify either Hugging Face ''style parameters (beam_width or num_return_sequences) or ''OpenAI style parameters (n, best_of, or use_beam_search), 'f'but not both: {ambiguous_params}. It is recommended to use ''OpenAI style parameters (n, best_of, use_beam_search).')ifhf_style:logger.warning("Please use 'n' and 'best_of' for the LLM API. 
The use of ""'beam_width' and 'num_return_sequences' will be deprecated ""in a future release.")self.n=self.beam_widthself.best_of=self.num_return_sequencesself.use_beam_search=self.beam_width>1self.best_of=self.best_oforself.nif(notself.use_beam_searchandself.n<self.best_ofandnotself.return_log_probs):logger.info(f"Enable 'return_log_probs' to trim the {self.n}-best among "f"{self.best_of} outputs under sampling decoding.")self.return_log_probs=Trueself._validate()def_validate(self):''' Verify the sampling parameters. This function verifies the sampling parameters in the LLM API, which may have stricter requirements than the Executor class of C++ runtime. For instance, while the greedy decoding with n > 1 is capable in the Executor class of C++ runtime, the LLM API disallows such combination. '''ifself.best_ofisnotNone:ifself.best_of>1andself.best_of<self.n:raiseValueError(f'In beam search, beam_width ({self.beam_width}) must be 'f'greater than or equal to num_return_sequences 'f'({self.num_return_sequences}).')if(self.best_of>1andself.greedy_decodingandnotos.environ.get('TLLM_ALLOW_N_GREEDY_DECODING',None)):raiseValueError(f'Greedy decoding in the LLM API does not allow multiple 'f'returns. Please set to best_of=1, got best_of={self.best_of}. 'f'Please set to best_of=1 or set an environment variable 'f'TLLM_ALLOW_N_GREEDY_DECODING=1 to allow best_of > 1 'f'under the greedy decoding.')ifself.truncate_prompt_tokensisnotNoneandself.truncate_prompt_tokens<1:raiseValueError(f"truncate_prompt_tokens must be >= 1, got {self.truncate_prompt_tokens}")ifself.guided_decodingisnotNone:self.guided_decoding._validate()@propertydefgreedy_decoding(self)->bool:return(notself.use_beam_searchand(self.top_kisNoneorself.top_k==1)and(self.top_pisNoneorself.top_p==0.0))
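
    # Note (editor's addition): greedy_decoding is True only when beam search is off and
    # top_k/top_p are unset (or top_k == 1 / top_p == 0.0); combined with best_of > 1, it
    # triggers the TLLM_ALLOW_N_GREEDY_DECODING check in _validate above.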
    def _get_bad_words(self) -> List[List[int]]:
        words = []
        if self.bad_token_ids is not None:
            words = [[i] for i in self.bad_token_ids]

        if self.bad is None:
            return words
        else:
            if self._bad_word_ids is None:
                raise RuntimeError(
                    f"{self.__class__.__name__}.bad ({self.bad}) is not processed by the tokenizer, "
                    "please call the setup method.")
            return words + self._bad_word_ids

    def _get_stop_words(self) -> List[List[int]]:
        words = []
        if self.stop_token_ids is not None:
            words = [[i] for i in self.stop_token_ids]

        if self.stop is None:
            return words
        else:
            if self._stop_word_ids is None:
                raise RuntimeError(
                    f"{self.__class__.__name__}.stop ({self.stop}) is not processed by the tokenizer, "
                    "please call the setup method.")
            return words + self._stop_word_ids

    def _get_stop_reasons_and_words(self) -> List[Tuple[Union[str, int], List[int]]]:
        stop_reasons = []
        if self.stop_token_ids is not None:
            stop_reasons.extend(self.stop_token_ids)
        if self.stop is not None:
            if isinstance(self.stop, str):
                stop_reasons.append(self.stop)
            else:
                stop_reasons.extend(self.stop)
        stop_words = self._get_stop_words()
        if len(stop_reasons) != len(stop_words):
            raise RuntimeError(
                f"The number of {self.__class__.__name__}.stop_token_ids ({self.stop_token_ids}) "
                f"and {self.__class__.__name__}.stop ({self.stop}) are inconsistent with the "
                f"processed stop_words ({stop_words}).")
        return list(zip(stop_reasons, stop_words))

    def _get_sampling_config(self) -> tllme.SamplingConfig:
        expected_fields = {
            "beam_width", "top_k", "top_p", "top_p_min", "top_p_reset_ids",
            "top_p_decay", "seed", "random_seed", "temperature", "min_tokens",
            "min_length", "beam_search_diversity_rate", "repetition_penalty",
            "presence_penalty", "frequency_penalty", "length_penalty",
            "early_stopping", "no_repeat_ngram_size", "num_return_sequences"
        }
        found_fields = {
            f for f in dir(tllme.SamplingConfig) if not f.startswith('__')
        }
        if found_fields != expected_fields:
            raise RuntimeError(
                "Found fields in `tllme.SamplingConfig` different than expected; "
                f"if `tllme.SamplingConfig` is changed, please update {self.__class__.__name__} accordingly. "
                "See [TO DEVELOPER] comments for detailed instructions.")

        # A map from the SamplingConfig fields of the LLM API to their corresponding
        # field names of the Executor of the TRT-LLM C++ runtime.
        # In sampling, there is no parameter that directly matches 'best_of',
        # so outputs must be trimmed during postprocessing.
        # |             | LLM API         | TRT-LLM Executor       |
        # |-------------|-----------------|------------------------|
        # | Beam search | use_beam_search | beam_width > 1         |
        # | Beam search | n               | num_return_sequences   |
        # | Beam search | best_of         | beam_width             |
        # |-------------|-----------------|------------------------|
        # | Sampling    | use_beam_search | beam_width == 1        |
        # | Sampling    | n               | num_return_sequences   |
        # | Sampling    | best_of         | no corresponding param |
        unmatched_params = [
            'num_return_sequences', 'beam_width', 'n', 'best_of', 'use_beam_search'
        ]
        llmapi_to_rt_param_map = {
            f: getattr(self, f)
            for f in expected_fields if f not in unmatched_params
        }
        if self.use_beam_search:
            llmapi_to_rt_param_map['num_return_sequences'] = self.n
            llmapi_to_rt_param_map['beam_width'] = self.best_of
        else:
            llmapi_to_rt_param_map['num_return_sequences'] = self.best_of
            llmapi_to_rt_param_map['beam_width'] = 1

        return tllme.SamplingConfig(**llmapi_to_rt_param_map)

    def _get_output_config(self) -> tllme.OutputConfig:
        expected_fields = [
            "return_log_probs", "return_context_logits",
            "return_generation_logits", "exclude_input_from_output",
            "return_encoder_output", "return_perf_metrics",
            "additional_model_outputs"
        ]
        found_fields = [
            f for f in dir(tllme.OutputConfig) if not f.startswith('__')
        ]
        if set(found_fields) != set(expected_fields):
            raise RuntimeError(
                "Found fields in `tllme.OutputConfig` different than expected; "
                f"if `tllme.OutputConfig` is changed, please update {self.__class__.__name__} accordingly. "
                "See [TO DEVELOPER] comments for detailed instructions.")
        return tllme.OutputConfig(
            **{f: getattr(self, f) for f in expected_fields})

    def _get_guided_decoding_params(self) -> tllme.GuidedDecodingParams:
        if self.guided_decoding is None or self.guided_decoding.num_guides == 0:
            return None

        if self.guided_decoding.json_object:
            return tllme.GuidedDecodingParams(
                tllme.GuidedDecodingParams.GuideType.JSON)
        elif self.guided_decoding.json is not None:
            json_schema = self.guided_decoding.json
            if isinstance(json_schema, BaseModel):
                json_schema = json_schema.model_json_schema()
            if isinstance(json_schema, dict):
                json_schema = json.dumps(json_schema)
            return tllme.GuidedDecodingParams(
                tllme.GuidedDecodingParams.GuideType.JSON_SCHEMA, json_schema)
        elif self.guided_decoding.regex is not None:
            return tllme.GuidedDecodingParams(
                tllme.GuidedDecodingParams.GuideType.REGEX,
                self.guided_decoding.regex)
        else:
            return tllme.GuidedDecodingParams(
                tllme.GuidedDecodingParams.GuideType.EBNF_GRAMMAR,
                self.guided_decoding.grammar)