CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
device/gemm_batched.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/arch/arch.h"
#include "cutlass/device_kernel.h"

#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
#include "cutlass/gemm/kernel/gemm_batched.h"

#include "cutlass/gemm/kernel/default_gemm.h"
#include "cutlass/gemm/device/default_gemm_configuration.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace gemm {
namespace device {

/////////////////////////////////////////////////////////////////////////////////////////////////
template <
    /// Element type for A matrix operand
    typename ElementA_,
    /// Layout type for A matrix operand
    typename LayoutA_,
    /// Element type for B matrix operand
    typename ElementB_,
    /// Layout type for B matrix operand
    typename LayoutB_,
    /// Element type for C and D matrix operands
    typename ElementC_,
    /// Layout type for C and D matrix operands
    typename LayoutC_,
    /// Element type for internal accumulation
    typename ElementAccumulator_ = ElementC_,
    /// Operator class tag
    typename OperatorClass_ = arch::OpClassSimt,
    /// Tag indicating architecture to tune for
    typename ArchTag_ = arch::Sm70,
    /// Threadblock-level tile size (concept: GemmShape)
    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
        ElementAccumulator_>::ThreadblockShape,
    /// Warp-level tile size (concept: GemmShape)
    typename WarpShape_ = typename DefaultGemmConfiguration<
        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
        ElementAccumulator_>::WarpShape,
    /// Instruction-level tile size (concept: GemmShape)
    typename InstructionShape_ = typename DefaultGemmConfiguration<
        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
        ElementAccumulator_>::InstructionShape,
    /// Epilogue output operator
    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
        ElementAccumulator_>::EpilogueOutputOp,
    /// Threadblock-level swizzling operator
    typename ThreadblockSwizzle_ = threadblock::GemmBatchedIdentityThreadblockSwizzle,
    /// Number of stages used in the pipelined mainloop
    int Stages =
        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
                                 ElementC_, ElementAccumulator_>::kStages,
    /// Access granularity of A matrix in units of elements
    int AlignmentA =
        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
                                 ElementC_, ElementAccumulator_>::kAlignmentA,
    /// Access granularity of B matrix in units of elements
    int AlignmentB =
        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
                                 ElementC_, ElementAccumulator_>::kAlignmentB,
    /// Operation performed by GEMM
    typename Operator_ = typename DefaultGemmConfiguration<
        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
        ElementAccumulator_>::Operator
>
class GemmBatched {
 public:

  using ElementA = ElementA_;
  using LayoutA = LayoutA_;
  using ElementB = ElementB_;
  using LayoutB = LayoutB_;
  using ElementC = ElementC_;
  using LayoutC = LayoutC_;
  using ElementAccumulator = ElementAccumulator_;
  using OperatorClass = OperatorClass_;
  using ArchTag = ArchTag_;
  using ThreadblockShape = ThreadblockShape_;
  using WarpShape = WarpShape_;
  using InstructionShape = InstructionShape_;
  using EpilogueOutputOp = EpilogueOutputOp_;
  using ThreadblockSwizzle = ThreadblockSwizzle_;
  static int const kStages = Stages;
  static int const kAlignmentA = AlignmentA;
  static int const kAlignmentB = AlignmentB;
  static int const kAlignmentC = EpilogueOutputOp::kCount;
  using Operator = Operator_;
  /// Define the kernel
  using DefaultGemmKernel = typename kernel::DefaultGemm<
    ElementA,
    LayoutA,
    kAlignmentA,
    ElementB,
    LayoutB,
    kAlignmentB,
    ElementC,
    LayoutC,
    ElementAccumulator,
    OperatorClass,
    ArchTag,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    EpilogueOutputOp,
    ThreadblockSwizzle,
    kStages,
    false,
    Operator,
    false
  >::GemmKernel;

  using GemmKernel = kernel::GemmBatched<
    typename DefaultGemmKernel::Mma,
    typename DefaultGemmKernel::Epilogue,
    ThreadblockSwizzle
  >;
  /// Argument structure
  struct Arguments {

    //
    // Data members
    //

    GemmCoord problem_size;
    TensorRef<ElementA const, LayoutA> ref_A;
    int64_t stride_A;
    TensorRef<ElementB const, LayoutB> ref_B;
    int64_t stride_B;
    TensorRef<ElementC const, LayoutC> ref_C;
    int64_t stride_C;
    TensorRef<ElementC, LayoutC> ref_D;
    int64_t stride_D;
    typename EpilogueOutputOp::Params epilogue;
    int batch_count;

    //
    // Methods
    //

    /// Default ctor
    CUTLASS_HOST_DEVICE
    Arguments() { }

    /// Constructs an Arguments structure
    CUTLASS_HOST_DEVICE
    Arguments(
      GemmCoord problem_size_,
      TensorRef<ElementA const, LayoutA> ref_A_,
      int64_t stride_A_,
      TensorRef<ElementB const, LayoutB> ref_B_,
      int64_t stride_B_,
      TensorRef<ElementC const, LayoutC> ref_C_,
      int64_t stride_C_,
      TensorRef<ElementC, LayoutC> ref_D_,
      int64_t stride_D_,
      typename EpilogueOutputOp::Params epilogue_,
      int batch_count_
    ):
      problem_size(problem_size_),
      ref_A(ref_A_),
      stride_A(stride_A_),
      ref_B(ref_B_),
      stride_B(stride_B_),
      ref_C(ref_C_),
      stride_C(stride_C_),
      ref_D(ref_D_),
      stride_D(stride_D_),
      epilogue(epilogue_),
      batch_count(batch_count_) { }
  };
private:

  /// Kernel parameters object
  typename GemmKernel::Params params_;

public:
  /// Constructs the GEMM.
  GemmBatched() { }

  /// Determines whether the GEMM can execute the given problem.
  static Status can_implement(Arguments const &args) {

    if (!TensorRef_aligned(args.ref_A, kAlignmentA) || (args.stride_A % kAlignmentA)) {
      return Status::kErrorMisalignedOperand;
    }

    if (!TensorRef_aligned(args.ref_B, kAlignmentB) || (args.stride_B % kAlignmentB)) {
      return Status::kErrorMisalignedOperand;
    }

    if (!TensorRef_aligned(args.ref_C, kAlignmentC) || (args.stride_C % kAlignmentC)) {
      return Status::kErrorMisalignedOperand;
    }

    if (!TensorRef_aligned(args.ref_D, kAlignmentC) || (args.stride_D % kAlignmentC)) {
      return Status::kErrorMisalignedOperand;
    }

    if ((args.problem_size.m() % kAlignmentA) || (args.problem_size.k() % kAlignmentA) ||
        (args.problem_size.n() % kAlignmentB) || (args.problem_size.k() % kAlignmentB) ||
        (args.problem_size.m() % kAlignmentC) || (args.problem_size.n() % kAlignmentC)) {

      return Status::kErrorMisalignedOperand;
    }

    return Status::kSuccess;
  }
  /// Gets the workspace size
  static size_t get_workspace_size(Arguments const &args) {
    return 0;
  }

  /// Initializes GEMM state from arguments.
  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {

    // Determine grid shape
    ThreadblockSwizzle threadblock_swizzle;

    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
      args.problem_size,
      args.batch_count,
      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK});

    // Initialize the Params structure
    params_ = typename GemmKernel::Params{
      args.problem_size,
      grid_shape,
      args.ref_A.non_const_ref(),
      args.stride_A,
      args.ref_B.non_const_ref(),
      args.stride_B,
      args.ref_C.non_const_ref(),
      args.stride_C,
      args.ref_D,
      args.stride_D,
      args.epilogue,
      args.batch_count
    };

    return Status::kSuccess;
  }

  /// Lightweight update given a subset of arguments.
  Status update(Arguments const &args, void *workspace = nullptr) {

    params_.ref_A.reset(args.ref_A.non_const_ref().data());
    params_.ref_B.reset(args.ref_B.non_const_ref().data());
    params_.ref_C.reset(args.ref_C.non_const_ref().data());
    params_.ref_D.reset(args.ref_D.data());

    return Status::kSuccess;
  }
  /// Runs the kernel using initialized state.
  Status run(cudaStream_t stream = nullptr) {

    ThreadblockSwizzle threadblock_swizzle;

    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
    dim3 block(GemmKernel::kThreadCount, 1, 1);

    cudaError_t result;

    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
    if (smem_size >= (48 << 10)) {
      result = cudaFuncSetAttribute(Kernel<GemmKernel>,
                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
                                    smem_size);

      if (result != cudaSuccess) {
        return Status::kErrorInternal;
      }

      result = cudaFuncSetAttribute(
          Kernel<GemmKernel>,
          cudaFuncAttributePreferredSharedMemoryCarveout, 100);

      if (result != cudaSuccess) {
        return Status::kErrorInternal;
      }
    }

    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);

    result = cudaGetLastError();

    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
  }

  /// Runs the kernel using initialized state.
  Status operator()(cudaStream_t stream = nullptr) {
    return run(stream);
  }
  /// Runs the kernel using initialized state.
  Status operator()(
    Arguments const &args,
    void *workspace = nullptr,
    cudaStream_t stream = nullptr) {

    Status status = initialize(args, workspace);

    if (status == Status::kSuccess) {
      status = run(stream);
    }

    return status;
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization for column-major output exchanges problem size and operands.
template <
    /// Element type for A matrix operand
    typename ElementA_,
    /// Layout type for A matrix operand
    typename LayoutA_,
    /// Element type for B matrix operand
    typename ElementB_,
    /// Layout type for B matrix operand
    typename LayoutB_,
    /// Element type for C and D matrix operands
    typename ElementC_,
    /// Element type for internal accumulation
    typename ElementAccumulator_,
    /// Operator class tag
    typename OperatorClass_,
    /// Tag indicating architecture to tune for
    typename ArchTag_,
    /// Threadblock-level tile size (concept: GemmShape)
    typename ThreadblockShape_,
    /// Warp-level tile size (concept: GemmShape)
    typename WarpShape_,
    /// Instruction-level tile size (concept: GemmShape)
    typename InstructionShape_,
    /// Epilogue output operator
    typename EpilogueOutputOp_,
    /// Threadblock-level swizzling operator
    typename ThreadblockSwizzle_,
    /// Number of stages used in the pipelined mainloop
    int Stages,
    /// Access granularity of A matrix in units of elements
    int AlignmentA,
    /// Access granularity of B matrix in units of elements
    int AlignmentB,
    /// Operation performed by GEMM
    typename Operator_
>
class GemmBatched<
  ElementA_,
  LayoutA_,
  ElementB_,
  LayoutB_,
  ElementC_,
  layout::ColumnMajor,
  ElementAccumulator_,
  OperatorClass_,
  ArchTag_,
  ThreadblockShape_,
  WarpShape_,
  InstructionShape_,
  EpilogueOutputOp_,
  ThreadblockSwizzle_,
  Stages,
  AlignmentA,
  AlignmentB,
  Operator_
> {
public:

  using ElementA = ElementA_;
  using LayoutA = LayoutA_;
  using ElementB = ElementB_;
  using LayoutB = LayoutB_;
  using ElementC = ElementC_;
  using LayoutC = layout::ColumnMajor;
  using ElementAccumulator = ElementAccumulator_;
  using OperatorClass = OperatorClass_;
  using ArchTag = ArchTag_;
  using ThreadblockShape = ThreadblockShape_;
  using WarpShape = WarpShape_;
  using InstructionShape = InstructionShape_;
  using EpilogueOutputOp = EpilogueOutputOp_;
  using ThreadblockSwizzle = ThreadblockSwizzle_;
  static int const kStages = Stages;

  static int const kAlignmentA = AlignmentA;
  static int const kAlignmentB = AlignmentB;
  static int const kAlignmentC = EpilogueOutputOp::kCount;
  static bool const kSplitKSerial = false;

  // Define the underlying GEMM operator on the transposed problem
  using UnderlyingOperator = GemmBatched<
    ElementB,
    typename layout::LayoutTranspose<LayoutB>::type,
    ElementA,
    typename layout::LayoutTranspose<LayoutA>::type,
    ElementC,
    layout::RowMajor,
    ElementAccumulator,
    OperatorClass,
    ArchTag,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    EpilogueOutputOp,
    ThreadblockSwizzle,
    Stages,
    kAlignmentB,
    kAlignmentA
  >;

  using UnderlyingArguments = typename UnderlyingOperator::Arguments;
  using GemmKernel = typename UnderlyingOperator::GemmKernel;
  /// Argument structure
  struct Arguments {

    //
    // Data members
    //

    GemmCoord problem_size;
    TensorRef<ElementA const, LayoutA> ref_A;
    int64_t stride_A;
    TensorRef<ElementB const, LayoutB> ref_B;
    int64_t stride_B;
    TensorRef<ElementC const, LayoutC> ref_C;
    int64_t stride_C;
    TensorRef<ElementC, LayoutC> ref_D;
    int64_t stride_D;
    typename EpilogueOutputOp::Params epilogue;
    int batch_count;

    //
    // Methods
    //

    /// Default ctor
    CUTLASS_HOST_DEVICE
    Arguments() { }

    /// Constructs an Arguments structure
    CUTLASS_HOST_DEVICE
    Arguments(
      GemmCoord problem_size_,
      TensorRef<ElementA const, LayoutA> ref_A_,
      int64_t stride_A_,
      TensorRef<ElementB const, LayoutB> ref_B_,
      int64_t stride_B_,
      TensorRef<ElementC const, LayoutC> ref_C_,
      int64_t stride_C_,
      TensorRef<ElementC, LayoutC> ref_D_,
      int64_t stride_D_,
      typename EpilogueOutputOp::Params epilogue_,
      int batch_count_
    ):
      problem_size(problem_size_),
      ref_A(ref_A_),
      stride_A(stride_A_),
      ref_B(ref_B_),
      stride_B(stride_B_),
      ref_C(ref_C_),
      stride_C(stride_C_),
      ref_D(ref_D_),
      stride_D(stride_D_),
      epilogue(epilogue_),
      batch_count(batch_count_) { }
  };
private:

  UnderlyingOperator underlying_operator_;

public:
  /// Constructs the GEMM.
  GemmBatched() { }

  /// Helper to construct a transposed equivalent for the underlying GEMM operator
  static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
    return UnderlyingArguments(
      {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
      {args.ref_B.data(), args.ref_B.stride(0)},
      args.stride_B,
      {args.ref_A.data(), args.ref_A.stride(0)},
      args.stride_A,
      {args.ref_C.data(), args.ref_C.stride(0)},
      args.stride_C,
      {args.ref_D.data(), args.ref_D.stride(0)},
      args.stride_D,
      args.epilogue,
      args.batch_count
    );
  }
  /// Determines whether the GEMM can execute the given problem.
  static Status can_implement(Arguments const &args) {

    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
  }

  /// Gets the workspace size
  static size_t get_workspace_size(Arguments const &args) {

    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
  }

  /// Initializes GEMM state from arguments.
  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {

    return underlying_operator_.initialize(to_underlying_arguments(args), workspace);
  }

  /// Lightweight update given a subset of arguments.
  Status update(Arguments const &args, void *workspace = nullptr) {

    return underlying_operator_.update(to_underlying_arguments(args), workspace);
  }

  /// Runs the kernel using initialized state.
  Status run(cudaStream_t stream = nullptr) {

    return underlying_operator_.run(stream);
  }

  /// Runs the kernel using initialized state.
  Status operator()(cudaStream_t stream = nullptr) {
    return run(stream);
  }
  /// Runs the kernel using initialized state.
  Status operator()(
    Arguments const &args,
    void *workspace = nullptr,
    cudaStream_t stream = nullptr) {

    Status status = initialize(args, workspace);

    if (status == Status::kSuccess) {
      status = run(stream);
    }

    return status;
  }

};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace device
} // namespace gemm
} // namespace cutlass
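
The header above only defines the device-level GemmBatched template. As a minimal usage sketch (modeled on the strided batched SGEMM pattern in CUTLASS's batched GEMM example; the function name run_batched_sgemm and all pointer, leading-dimension, and batch-stride arguments are illustrative, not part of this header), a host-side launch might look like the following. Because LayoutC here is column-major, the call is served by the partial specialization, which transposes the problem and exchanges the A and B operands internally.

#include <cuda_runtime.h>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm_batched.h"

// Strided batched SGEMM: D = alpha * A * B + beta * C for each of batch_count problems.
// Each operand i lives at base_ptr + i * batch_stride.
cudaError_t run_batched_sgemm(
    int m, int n, int k, int batch_count,
    float alpha,
    float const *A, int lda, int64_t batch_stride_A,
    float const *B, int ldb, int64_t batch_stride_B,
    float beta,
    float *C, int ldc, int64_t batch_stride_C) {

  // Instantiate the device-level operator with column-major operands; all remaining
  // template parameters fall back to DefaultGemmConfiguration.
  using BatchedGemm = cutlass::gemm::device::GemmBatched<
      float, cutlass::layout::ColumnMajor,
      float, cutlass::layout::ColumnMajor,
      float, cutlass::layout::ColumnMajor>;

  BatchedGemm gemm_op;

  // Arguments mirror the struct defined above: problem size, a TensorRef plus a batch
  // stride per operand, the epilogue scalars, and the batch count. D aliases C here,
  // so the result is written in place.
  cutlass::Status status = gemm_op({
      {m, n, k},
      {A, lda}, batch_stride_A,
      {B, ldb}, batch_stride_B,
      {C, ldc}, batch_stride_C,
      {C, ldc}, batch_stride_C,
      {alpha, beta},
      batch_count});

  return (status == cutlass::Status::kSuccess) ? cudaSuccess : cudaErrorUnknown;
}

Calling can_implement() with the same Arguments before launch is the usual way to verify that the chosen alignments and strides satisfy the checks shown in the class above.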