SSD Varlen Metadata

void trt_edgellm::mamba::fillUniformValidLens(int32_t *d_valid_lens, int32_t batch, int32_t seq_len, cudaStream_t stream): Fill d_valid_lens[0..batch) with the uniform value seq_len. Used by the runner when the caller’s context_lengths is null (uniform batch); the kernel’s padded_mode end-of-sequence clamp still requires a valid_lens tensor.

void trt_edgellm::mamba::buildSSDVarlenMetadata(int32_t *d_seq_idx, int32_t *d_chunk_indices, int32_t *d_chunk_offsets, int32_t *d_seq_chunk_cumsum, int32_t const *d_context_lengths, int32_t batch, int32_t seq_len, int32_t chunk_size, cudaStream_t stream): Build varlen metadata fully on-device (CUDA-graph-compatible — no host sync). The logical chunk upper bound is batch * (ceil(seq_len / chunk_size) + maybe_unaligned_chunk); any trailing slack in chunk_indices is filled with the sentinel value -1, so the kernel’s chunk_indices[physical_chunk + 1] lookup is safe for every physical chunk up to that upper bound.