GDN Kernel Utils

void trt_edgellm::launchGdnCalCuSeqLens(void const *context_lengths, void *cu_seqlens, int32_t batchSize, cudaStream_t stream): Launch the context_lengths → cu_seqlens prefix-sum kernel.

void trt_edgellm::launchGdnL2NormQK(void *q, void *k, int32_t n, int32_t seqLen, int32_t h, int32_t headDim, cudaStream_t stream): L2-normalize Q and K in-place along the head dimension. Q, K: (N, seqLen, H, headDim) float16 — each token-head vector is divided by its L2 norm. Required preprocessing for the Blackwell GDN prefill kernel.

void trt_edgellm::launchGdnDDTreePrecompute(void const *q, void const *k, void const *a, void const *b, void const *a_log, void const *dtBias, void *qkScales, void *gateValues, int32_t n, int32_t seqLen, int32_t h, int32_t hv, int32_t headDim, cudaStream_t stream): Precompute DDTree split-v scalar inputs. qkScales: (N, seqLen, H, 2) float32, stores q_scale and k_scale. gateValues: (N, seqLen, HV, 2) float32, stores decay gate g and beta.

void trt_edgellm::launchGdnStateTranspose(void const *src, void *dst, int32_t numBlocks, int32_t dim, cudaStream_t stream): Transpose the last two dimensions of the GDN state tensor (out-of-place). The Blackwell GDN prefill MMA produces state in V-major (d_v, d_k) order, while the sequential/decode kernels use K-major (d_k, d_v). src: (numBlocks, dim, dim) float32 — row-major 2-D blocks. dst: (numBlocks, dim, dim) float32 — each block transposed. numBlocks = n * hv; dim = head_dim (128).