MoE Marlin Indices Kernels

void trt_edgellm::kernel::launchBuildMarlinIndicesKernel(
int32_t const *slotsByExpertWorkspace,
int32_t const *slotsPerExpertWorkspace,
int32_t const *paddedCounts,
int32_t const *paddedOffsets,
float const *topkWeights,
int32_t *sortedTokenIds,
float *topkWeightsFlat,
int32_t *expertIds,
int32_t numTokens,
int32_t topK,
int32_t numExperts,
int32_t moeBlockSize,
cudaStream_t stream
)

Launches the kernel that builds the Marlin indices from the per-expert slot lists.

void trt_edgellm::kernel::launchAggregateSlotOutputsKernel(
void const *slotOutputs,
void *aggregatedOutput,
int32_t numTokens,
int32_t topK,
int32_t outDim,
cudaStream_t stream
)

Launches the kernel that aggregates slot outputs back to tokens by summing over each token's topK slots, in slot order.