cutlass/include_2cutlass_2gemm_2device_2gemm_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/arch/arch.h"
 #include "cutlass/device_kernel.h"

 #include "cutlass/gemm/threadblock/threadblock_swizzle.h"
 #include "cutlass/gemm/kernel/gemm.h"

 #include "cutlass/gemm/kernel/default_gemm.h"
 #include "cutlass/gemm/device/default_gemm_configuration.h"


 namespace cutlass {
 namespace gemm {
 namespace device {


 /// Device-level GEMM operator.
 ///
 /// Host-side wrapper that selects a kernel through kernel::DefaultGemm, translates an
 /// Arguments structure into the kernel's Params, and launches cutlass::Kernel<GemmKernel>.
 ///
 /// Typical usage visible from this interface:
 ///   1. can_implement(args)      - check whether the problem is supported
 ///   2. get_workspace_size(args) - query the number of workspace bytes needed
 ///   3. initialize(args, workspace, stream)
 ///   4. run(stream)              - may be repeated; update() swaps tensor pointers in between
 /// or simply operator()(args, workspace, stream), which performs steps 3 and 4.
 template <
     /// Element type of the A operand
     typename ElementA_,
     /// Layout of the A operand
     typename LayoutA_,
     /// Element type of the B operand
     typename ElementB_,
     /// Layout of the B operand
     typename LayoutB_,
     /// Element type of the C (source) and D (destination) operands
     typename ElementC_,
     /// Layout of the C and D operands
     typename LayoutC_,
     /// Element type used for accumulation (defaults to ElementC_)
     typename ElementAccumulator_ = ElementC_,
     /// Operator class tag (defaults to arch::OpClassSimt)
     typename OperatorClass_ = arch::OpClassSimt,
     /// Architecture tag (defaults to arch::Sm70)
     typename ArchTag_ = arch::Sm70,
     /// Threadblock-level tile shape; default taken from DefaultGemmConfiguration
     typename ThreadblockShape_ = typename DefaultGemmConfiguration<
         OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
         ElementAccumulator_>::ThreadblockShape,
     /// Warp-level tile shape; default taken from DefaultGemmConfiguration
     typename WarpShape_ = typename DefaultGemmConfiguration<
         OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
         ElementAccumulator_>::WarpShape,
     /// Instruction-level tile shape; default taken from DefaultGemmConfiguration
     typename InstructionShape_ = typename DefaultGemmConfiguration<
         OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
         ElementAccumulator_>::InstructionShape,
     /// Epilogue output operator; default taken from DefaultGemmConfiguration
     typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
         OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
         ElementAccumulator_>::EpilogueOutputOp,
     /// Functor mapping the tiled problem shape to a launch grid
     typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle,
     /// Number of stages; default taken from DefaultGemmConfiguration
     int Stages =
         DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
                                  ElementC_, ElementAccumulator_>::kStages,
     /// Alignment of the A operand; default taken from DefaultGemmConfiguration
     int AlignmentA =
         DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
                                  ElementC_, ElementAccumulator_>::kAlignmentA,
     /// Alignment of the B operand; default taken from DefaultGemmConfiguration
     int AlignmentB =
         DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
                                  ElementC_, ElementAccumulator_>::kAlignmentB,
     /// If true, the operator accepts split_k_slices > 1 and requires a workspace
     bool SplitKSerial = false,
     /// Operation tag forwarded to kernel::DefaultGemm; default from DefaultGemmConfiguration
     typename Operator_ = typename DefaultGemmConfiguration<
         OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
         ElementAccumulator_>::Operator,
     /// Forwarded unchanged to kernel::DefaultGemm.
     /// NOTE(review): presumably marks the source C as unused when beta == 0 - confirm
     bool IsBetaZero = false>
 class Gemm {
  public:

   using ElementA = ElementA_;
   using LayoutA = LayoutA_;
   using TensorRefA = TensorRef<ElementA const, LayoutA>;
   using ElementB = ElementB_;
   using LayoutB = LayoutB_;
   using TensorRefB = TensorRef<ElementB const, LayoutB>;
   using ElementC = ElementC_;
   using LayoutC = LayoutC_;
   using TensorRefC = TensorRef<ElementC const, LayoutC>;
   using TensorRefD = TensorRef<ElementC, LayoutC>;
   using ElementAccumulator = ElementAccumulator_;
   using OperatorClass = OperatorClass_;
   using ArchTag = ArchTag_;
   using ThreadblockShape = ThreadblockShape_;
   using WarpShape = WarpShape_;
   using InstructionShape = InstructionShape_;
   using EpilogueOutputOp = EpilogueOutputOp_;
   using ThreadblockSwizzle = ThreadblockSwizzle_;
   using Operator = Operator_;
   static int const kStages = Stages;
   static int const kAlignmentA = AlignmentA;
   static int const kAlignmentB = AlignmentB;
   /// Alignment of the C operand, taken from EpilogueOutputOp::kCount
   static int const kAlignmentC = EpilogueOutputOp::kCount;
   static bool const kSplitKSerial = SplitKSerial;
   static bool const kIsBetaZero = IsBetaZero;

   /// Define the kernel
   using GemmKernel = typename kernel::DefaultGemm<
     ElementA,
     LayoutA,
     kAlignmentA,
     ElementB,
     LayoutB,
     kAlignmentB,
     ElementC,
     LayoutC,
     ElementAccumulator,
     OperatorClass,
     ArchTag,
     ThreadblockShape,
     WarpShape,
     InstructionShape,
     EpilogueOutputOp,
     ThreadblockSwizzle,
     kStages,
     kSplitKSerial,
     Operator,
     kIsBetaZero
   >::GemmKernel;

   /// Argument structure
   struct Arguments {

     //
     // Data members
     //

     /// GEMM problem size
     GemmCoord problem_size;
     /// Reference to the (read-only) A operand
     TensorRef<ElementA const, LayoutA> ref_A;
     /// Reference to the (read-only) B operand
     TensorRef<ElementB const, LayoutB> ref_B;
     /// Reference to the (read-only) source operand C
     TensorRef<ElementC const, LayoutC> ref_C;
     /// Reference to the destination operand D
     TensorRef<ElementC, LayoutC> ref_D;
     /// Parameters of the epilogue output operator
     typename EpilogueOutputOp::Params epilogue;
     /// Number of slices along K; values > 1 are only accepted when kSplitKSerial is true
     int split_k_slices;

     //
     // Methods
     //

     /// Default ctor: zero-sized problem, a single K slice
     CUTLASS_HOST_DEVICE
     Arguments(): problem_size(0, 0, 0), split_k_slices(1) {

     }

     /// Constructs an Arguments structure
     CUTLASS_HOST_DEVICE
     Arguments(
       GemmCoord problem_size_,
       TensorRef<ElementA const, LayoutA> ref_A_,
       TensorRef<ElementB const, LayoutB> ref_B_,
       TensorRef<ElementC const, LayoutC> ref_C_,
       TensorRef<ElementC, LayoutC> ref_D_,
       typename EpilogueOutputOp::Params epilogue_ =
         typename EpilogueOutputOp::Params(),
       int split_k_slices = 1
     ):
       problem_size(problem_size_),
       ref_A(ref_A_),
       ref_B(ref_B_),
       ref_C(ref_C_),
       ref_D(ref_D_),
       epilogue(epilogue_),
       split_k_slices(split_k_slices) {

     }
   };

 private:

   /// Kernel parameters object, populated by initialize() and patched by update()
   typename GemmKernel::Params params_;

 public:

   /// Constructs the GEMM.
   Gemm() { }

   /// Determines whether the GEMM can execute the given problem.
   ///
   /// Returns Status::kErrorInvalidProblem if more than one K slice is requested while
   /// kSplitKSerial is false; otherwise returns whatever GemmKernel::can_implement reports.
   static Status can_implement(Arguments const &args) {

     if (!kSplitKSerial && args.split_k_slices > 1) {
       return Status::kErrorInvalidProblem;
     }

     // All remaining checks are delegated to the kernel.
     return GemmKernel::can_implement(
       args.problem_size,
       args.ref_A.non_const_ref(),
       args.ref_B.non_const_ref(),
       args.ref_C.non_const_ref(),
       args.ref_D
     );
   }

   /// Gets the workspace size in bytes.
   ///
   /// A workspace is only needed for serial split-K with more than one slice; it then
   /// holds one int per threadblock tile of the output (tiled M x tiled N). Otherwise 0.
   static size_t get_workspace_size(Arguments const &args) {

     if (kSplitKSerial && args.split_k_slices > 1) {

       // Determine grid shape
       ThreadblockSwizzle threadblock_swizzle;

       cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
         args.problem_size,
         {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
         args.split_k_slices);

       return sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
     }

     return 0;
   }

   /// Initializes GEMM state from arguments.
   ///
   /// \param args       problem description
   /// \param workspace  device buffer of at least get_workspace_size(args) bytes; required
   ///                   (non-null) only for serial split-K with more than one slice
   /// \param stream     stream on which the workspace is asynchronously zeroed
   ///
   /// \return kErrorWorkspaceNull if a required workspace is missing, kErrorInternal if
   ///         cudaMemsetAsync fails, kErrorInvalidProblem if split_k_slices > 1 without
   ///         kSplitKSerial, kSuccess otherwise.
   Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {

     // Determine grid shape
     ThreadblockSwizzle threadblock_swizzle;

     cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
       args.problem_size,
       {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
       args.split_k_slices);

     if (kSplitKSerial) {
       if (args.split_k_slices > 1) {
         if (!workspace) {
           return Status::kErrorWorkspaceNull;
         }

         // The workspace is handed to the kernel as its semaphore array and must
         // start out zeroed.
         size_t bytes = get_workspace_size(args);

         cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);

         if (result != cudaSuccess) {
           return Status::kErrorInternal;
         }
       }
     }
     else {

       if (args.split_k_slices > 1) {
         return Status::kErrorInvalidProblem;
       }
     }

     // Initialize the Params structure
     params_ = typename GemmKernel::Params{
       args.problem_size,
       grid_shape,
       args.ref_A.non_const_ref(),
       args.ref_B.non_const_ref(),
       args.ref_C.non_const_ref(),
       args.ref_D,
       args.epilogue,
       static_cast<int *>(workspace)
     };

     return Status::kSuccess;
   }

   /// Lightweight update given a subset of arguments.
   ///
   /// Only the data pointers of A, B, C, D and the semaphore (workspace) pointer are
   /// replaced; problem size, grid shape and epilogue parameters recorded by
   /// initialize() are left as they are. The workspace is not cleared here.
   Status update(Arguments const &args, void *workspace = nullptr) {

     if (kSplitKSerial && args.split_k_slices > 1) {
       if (!workspace) {
         return Status::kErrorWorkspaceNull;
       }
     }

     params_.ref_A.reset(args.ref_A.non_const_ref().data());
     params_.ref_B.reset(args.ref_B.non_const_ref().data());
     params_.ref_C.reset(args.ref_C.non_const_ref().data());
     params_.ref_D.reset(args.ref_D.data());
     params_.semaphore = static_cast<int *>(workspace);

     return Status::kSuccess;
   }

   /// Runs the kernel using initialized state.
   ///
   /// \param stream  stream on which the kernel is launched
   /// \return kSuccess if the launch reported no error, kErrorInternal otherwise
   Status run(cudaStream_t stream = nullptr) {

     ThreadblockSwizzle threadblock_swizzle;

     dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
     dim3 block(GemmKernel::kThreadCount, 1, 1);

     cudaError_t result;

     int smem_size = int(sizeof(typename GemmKernel::SharedStorage));

     // Dynamic shared memory of 48 KB or more has to be requested explicitly
     // through function attributes before the launch.
     if (smem_size >= (48 << 10)) {
       result = cudaFuncSetAttribute(Kernel<GemmKernel>,
                                     cudaFuncAttributeMaxDynamicSharedMemorySize,
                                     smem_size);

       if (result != cudaSuccess) {
         return Status::kErrorInternal;
       }

       result = cudaFuncSetAttribute(
           Kernel<GemmKernel>,
           cudaFuncAttributePreferredSharedMemoryCarveout, 100);

       if (result != cudaSuccess) {
         return Status::kErrorInternal;
       }
     }

     cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);

     result = cudaGetLastError();

     return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
   }

   /// Runs the kernel using initialized state.
   Status operator()(cudaStream_t stream = nullptr) {
     return run(stream);
   }

   /// Initializes from the given arguments and, on success, runs the kernel.
   Status operator()(
     Arguments const &args,
     void *workspace = nullptr,
     cudaStream_t stream = nullptr) {

     // Forward the stream so that zeroing the split-K workspace is ordered before the
     // kernel on the caller's stream rather than being issued on the default stream.
     Status status = initialize(args, workspace, stream);

     if (status == Status::kSuccess) {
       status = run(stream);
     }

     return status;
   }
 };


 /// Partial specialization of Gemm for column-major output.
 ///
 /// Instead of defining its own kernel, this specialization forwards to a row-major
 /// UnderlyingOperator in which the A and B operands are exchanged, their layouts are
 /// transposed, and the M and N extents of the problem are swapped. This computes the
 /// same result because a column-major D is the row-major representation of D^T, and
 /// (A * B)^T = B^T * A^T.
 template <
     /// Element type of the A operand
     typename ElementA_,
     /// Layout of the A operand
     typename LayoutA_,
     /// Element type of the B operand
     typename ElementB_,
     /// Layout of the B operand
     typename LayoutB_,
     /// Element type of the C (source) and D (destination) operands
     typename ElementC_,
     /// Element type used for accumulation
     typename ElementAccumulator_,
     /// Operator class tag
     typename OperatorClass_,
     /// Architecture tag
     typename ArchTag_,
     /// Threadblock-level tile shape
     typename ThreadblockShape_,
     /// Warp-level tile shape
     typename WarpShape_,
     /// Instruction-level tile shape
     typename InstructionShape_,
     /// Epilogue output operator
     typename EpilogueOutputOp_,
     /// Functor mapping the tiled problem shape to a launch grid
     typename ThreadblockSwizzle_,
     /// Number of stages
     int Stages,
     /// Alignment of the A operand
     int AlignmentA,
     /// Alignment of the B operand
     int AlignmentB,
     /// If true, the operator accepts split_k_slices > 1 and requires a workspace
     bool SplitKSerial,
     /// Operation tag forwarded to the underlying operator
     typename Operator_,
     /// Forwarded unchanged to the underlying operator
     bool IsBetaZero>
 class Gemm<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
            layout::ColumnMajor,  // partially specialized on LayoutC
            ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
            WarpShape_, InstructionShape_, EpilogueOutputOp_,
            ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial,
            Operator_, IsBetaZero> {
  public:

   using ElementA = ElementA_;
   using LayoutA = LayoutA_;
   using TensorRefA = TensorRef<ElementA const, LayoutA>;
   using ElementB = ElementB_;
   using LayoutB = LayoutB_;
   using TensorRefB = TensorRef<ElementB const, LayoutB>;
   using ElementC = ElementC_;
   using LayoutC = layout::ColumnMajor;
   using TensorRefC = TensorRef<ElementC const, LayoutC>;
   using TensorRefD = TensorRef<ElementC, LayoutC>;
   using ElementAccumulator = ElementAccumulator_;
   using OperatorClass = OperatorClass_;
   using ArchTag = ArchTag_;
   using ThreadblockShape = ThreadblockShape_;
   using WarpShape = WarpShape_;
   using InstructionShape = InstructionShape_;
   using EpilogueOutputOp = EpilogueOutputOp_;
   using ThreadblockSwizzle = ThreadblockSwizzle_;
   using Operator = Operator_;
   static int const kStages = Stages;
   static int const kAlignmentA = AlignmentA;
   static int const kAlignmentB = AlignmentB;
   static bool const kSplitKSerial = SplitKSerial;
   static bool const kIsBetaZero = IsBetaZero;

   /// Row-major operator that does the actual work: operands (and their alignments)
   /// are exchanged and their layouts transposed.
   using UnderlyingOperator = Gemm<
     ElementB,
     typename layout::LayoutTranspose<LayoutB>::type,
     ElementA,
     typename layout::LayoutTranspose<LayoutA>::type,
     ElementC,
     layout::RowMajor,
     ElementAccumulator,
     OperatorClass,
     ArchTag,
     ThreadblockShape,
     WarpShape,
     InstructionShape,
     EpilogueOutputOp,
     ThreadblockSwizzle,
     Stages,
     kAlignmentB,
     kAlignmentA,
     SplitKSerial,
     Operator,
     kIsBetaZero
   >;

   using UnderlyingArguments = typename UnderlyingOperator::Arguments;
   using GemmKernel = typename UnderlyingOperator::GemmKernel;
   static int const kAlignmentC = UnderlyingOperator::kAlignmentC;

   /// Argument structure
   struct Arguments {

     //
     // Data members
     //

     /// GEMM problem size
     GemmCoord problem_size;
     /// Reference to the (read-only) A operand
     TensorRef<ElementA const, LayoutA> ref_A;
     /// Reference to the (read-only) B operand
     TensorRef<ElementB const, LayoutB> ref_B;
     /// Reference to the (read-only) source operand C
     TensorRef<ElementC const, LayoutC> ref_C;
     /// Reference to the destination operand D
     TensorRef<ElementC, LayoutC> ref_D;
     /// Parameters of the epilogue output operator
     typename EpilogueOutputOp::Params epilogue;
     /// Number of slices along K; values > 1 are only accepted when kSplitKSerial is true
     int split_k_slices;

     //
     // Methods
     //

     /// Default ctor: zero-sized problem, a single K slice. The members are initialized
     /// explicitly so that a default-constructed object never carries an indeterminate
     /// split_k_slices, matching the primary template's Arguments.
     CUTLASS_HOST_DEVICE
     Arguments(): problem_size(0, 0, 0), split_k_slices(1) { }

     /// Constructs an Arguments structure
     CUTLASS_HOST_DEVICE
     Arguments(
       GemmCoord problem_size_,
       TensorRef<ElementA const, LayoutA> ref_A_,
       TensorRef<ElementB const, LayoutB> ref_B_,
       TensorRef<ElementC const, LayoutC> ref_C_,
       TensorRef<ElementC, LayoutC> ref_D_,
       typename EpilogueOutputOp::Params epilogue_ =
         typename EpilogueOutputOp::Params(),
       int split_k_slices = 1
     ):
       problem_size(problem_size_),
       ref_A(ref_A_),
       ref_B(ref_B_),
       ref_C(ref_C_),
       ref_D(ref_D_),
       epilogue(epilogue_),
       split_k_slices(split_k_slices) { }
   };

 private:

   /// Row-major operator to which every call is forwarded
   UnderlyingOperator underlying_operator_;

 public:

   /// Constructs the GEMM.
   Gemm() { }

   /// Helper to construct a transposed equivalent for the underlying GEMM operator.
   ///
   /// Swaps M and N of the problem size, passes B where the underlying operator expects
   /// A (and vice versa), and rebuilds every tensor reference from the same data pointer
   /// and stride(0) so it is interpreted in the underlying operator's layout.
   static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
     return UnderlyingArguments(
       {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
       {args.ref_B.data(), args.ref_B.stride(0)},
       {args.ref_A.data(), args.ref_A.stride(0)},
       {args.ref_C.data(), args.ref_C.stride(0)},
       {args.ref_D.data(), args.ref_D.stride(0)},
       args.epilogue,
       args.split_k_slices
     );
   }

   /// Determines whether the GEMM can execute the given problem.
   static Status can_implement(Arguments const &args) {

     return UnderlyingOperator::can_implement(to_underlying_arguments(args));
   }

   /// Gets the workspace size in bytes, as reported by the underlying operator.
   static size_t get_workspace_size(Arguments const &args) {

     return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
   }

   /// Initializes GEMM state from arguments.
   ///
   /// \param args       problem description (column-major C/D)
   /// \param workspace  device workspace, forwarded to the underlying operator
   /// \param stream     stream forwarded to the underlying operator's initialize()
   Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {

     // The stream must be forwarded; dropping it would make the underlying operator
     // fall back to its default (null) stream for any work done during initialization.
     return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
   }

   /// Lightweight update given a subset of arguments.
   Status update(Arguments const &args, void *workspace = nullptr) {

     return underlying_operator_.update(to_underlying_arguments(args), workspace);
   }

   /// Runs the kernel using initialized state.
   Status run(cudaStream_t stream = nullptr) {

     return underlying_operator_.run(stream);
   }

   /// Runs the kernel using initialized state.
   Status operator()(cudaStream_t stream = nullptr) {
     return run(stream);
   }

   /// Initializes from the given arguments and, on success, runs the kernel.
   Status operator()(
     Arguments const &args,
     void *workspace = nullptr,
     cudaStream_t stream = nullptr) {

     // Initialization and launch are issued on the same caller-provided stream.
     Status status = initialize(args, workspace, stream);

     if (status == Status::kSuccess) {
       status = run(stream);
     }

     return status;
   }
 };


 } // namespace device
 } // namespace gemm
 } // namespace cutlass

cutlass::gemm::kernel::DefaultGemm
Definition: default_gemm.h:116

cutlass::gemm::device::Gemm::kStages
static int const kStages
Definition: include/cutlass/gemm/device/gemm.h:238

cutlass::gemm::device::Gemm::Arguments::problem_size
GemmCoord problem_size
Definition: include/cutlass/gemm/device/gemm.h:276

cutlass
Definition: aligned_buffer.h:35

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::Arguments::split_k_slices
int split_k_slices
Definition: include/cutlass/gemm/device/gemm.h:606

cutlass::Status::kErrorInvalidProblem
Specified problem size is not supported by operator.

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::ElementA
ElementB ElementA
Definition: include/cutlass/gemm/device/gemm.h:219

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::Arguments::ref_A
TensorRef< ElementA const, LayoutA > ref_A
Definition: include/cutlass/gemm/device/gemm.h:601

cutlass::gemm::device::Gemm::get_workspace_size
static size_t get_workspace_size(Arguments const &args)
Gets the workspace size.
Definition: include/cutlass/gemm/device/gemm.h:350

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::ThreadblockSwizzle
ThreadblockSwizzle ThreadblockSwizzle
Definition: include/cutlass/gemm/device/gemm.h:236

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::Arguments::Arguments
CUTLASS_HOST_DEVICE Arguments()
Default ctor.
Definition: include/cutlass/gemm/device/gemm.h:614

cutlass::gemm::device::Gemm::can_implement
static Status can_implement(Arguments const &args)
Determines whether the GEMM can execute the given problem.
Definition: include/cutlass/gemm/device/gemm.h:328

cutlass::gemm::device::Gemm::Arguments::Arguments
CUTLASS_HOST_DEVICE Arguments()
Default ctor.
Definition: include/cutlass/gemm/device/gemm.h:290

cutlass::gemm::device::Gemm::Arguments::Arguments
CUTLASS_HOST_DEVICE Arguments(GemmCoord problem_size_, TensorRef< ElementA const, LayoutA > ref_A_, TensorRef< ElementB const, LayoutB > ref_B_, TensorRef< ElementC const, LayoutC > ref_C_, TensorRef< ElementC, LayoutC > ref_D_, typename EpilogueOutputOp::Params epilogue_=typename EpilogueOutputOp::Params(), int split_k_slices=1)
Constructs an Arguments structure.
Definition: include/cutlass/gemm/device/gemm.h:296

cutlass::gemm::GemmCoord
Definition: include/cutlass/gemm/gemm.h:94

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::get_workspace_size
static size_t get_workspace_size(Arguments const &args)
Gets the workspace size.
Definition: include/cutlass/gemm/device/gemm.h:666

cutlass::gemm::device::Gemm
Definition: include/cutlass/gemm/device/gemm.h:216

cutlass::gemm::GemmCoord::n
CUTLASS_HOST_DEVICE Index const & n() const
Returns the GEMM N coordinate.
Definition: include/cutlass/gemm/gemm.h:137

cutlass::gemm::device::Gemm::kSplitKSerial
static bool const kSplitKSerial
Definition: include/cutlass/gemm/device/gemm.h:242

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::update
Status update(Arguments const &args, void *workspace=nullptr)
Lightweight update given a subset of arguments.
Definition: include/cutlass/gemm/device/gemm.h:678

cutlass::gemm::device::Gemm::Arguments::epilogue
EpilogueOutputOp::Params epilogue
Definition: include/cutlass/gemm/device/gemm.h:281

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::Arguments::epilogue
EpilogueOutputOp::Params epilogue
Definition: include/cutlass/gemm/device/gemm.h:605

cutlass::gemm::device::Gemm::Arguments::ref_A
TensorRef< ElementA const, LayoutA > ref_A
Definition: include/cutlass/gemm/device/gemm.h:277

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::Gemm
Gemm()
Constructs the GEMM.
Definition: include/cutlass/gemm/device/gemm.h:644

cutlass::layout::ColumnMajor
Mapping function for column-major matrices.
Definition: layout/matrix.h:142

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::can_implement
static Status can_implement(Arguments const &args)
Determines whether the GEMM can execute the given problem.
Definition: include/cutlass/gemm/device/gemm.h:660

default_gemm.h
Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with the appropr...

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::operator()
Status operator()(cudaStream_t stream=nullptr)
Runs the kernel using initialized state.
Definition: include/cutlass/gemm/device/gemm.h:690

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::InstructionShape
InstructionShape InstructionShape
Definition: include/cutlass/gemm/device/gemm.h:234

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::Operator
Operator Operator
Definition: include/cutlass/gemm/device/gemm.h:237

cutlass::gemm::device::Gemm::update
Status update(Arguments const &args, void *workspace=nullptr)
Lightweight update given a subset of arguments.
Definition: include/cutlass/gemm/device/gemm.h:417

cutlass::gemm::device::Gemm::Arguments::split_k_slices
int split_k_slices
Definition: include/cutlass/gemm/device/gemm.h:282

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::ElementC
ElementC ElementC
Definition: include/cutlass/gemm/device/gemm.h:225

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::run
Status run(cudaStream_t stream=nullptr)
Runs the kernel using initialized state.
Definition: include/cutlass/gemm/device/gemm.h:684

cutlass::gemm::device::Gemm::operator()
Status operator()(cudaStream_t stream=nullptr)
Runs the kernel using initialized state.
Definition: include/cutlass/gemm/device/gemm.h:471

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::OperatorClass
OperatorClass OperatorClass
Definition: include/cutlass/gemm/device/gemm.h:230

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::UnderlyingArguments
typename UnderlyingOperator::Arguments UnderlyingArguments
Definition: include/cutlass/gemm/device/gemm.h:589

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::ElementAccumulator
ElementAccumulator ElementAccumulator
Definition: include/cutlass/gemm/device/gemm.h:229

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::GemmKernel
typename kernel::DefaultGemm< ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC, LayoutC, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, kStages, kSplitKSerial, Operator, kIsBetaZero >::GemmKernel GemmKernel
Define the kernel.
Definition: include/cutlass/gemm/device/gemm.h:267

cutlass::gemm::device::Gemm::kAlignmentB
static int const kAlignmentB
Definition: include/cutlass/gemm/device/gemm.h:240

cutlass::layout::LayoutTranspose
Defines transposes of matrix layouts.
Definition: layout/matrix.h:921

cutlass::gemm::device::Gemm::kAlignmentA
static int const kAlignmentA
Definition: include/cutlass/gemm/device/gemm.h:239

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::ThreadblockShape
ThreadblockShape ThreadblockShape
Definition: include/cutlass/gemm/device/gemm.h:232

cutlass::gemm::device::Gemm::Arguments::ref_C
TensorRef< ElementC const, LayoutC > ref_C
Definition: include/cutlass/gemm/device/gemm.h:279

cutlass::TensorRef< ElementA const, LayoutA >

cutlass::gemm::device::Gemm::Arguments::ref_B
TensorRef< ElementB const, LayoutB > ref_B
Definition: include/cutlass/gemm/device/gemm.h:278

cutlass::Status::kErrorInternal
An error within CUTLASS occurred.

device_kernel.h
Template for generic CUTLASS kernel.

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::Arguments::problem_size
GemmCoord problem_size
Definition: include/cutlass/gemm/device/gemm.h:600

cutlass::gemm::device::Gemm::operator()
Status operator()(Arguments const &args, void *workspace=nullptr, cudaStream_t stream=nullptr)
Runs the kernel using initialized state.
Definition: include/cutlass/gemm/device/gemm.h:476

cutlass::gemm::device::Gemm::Gemm
Gemm()
Constructs the GEMM.
Definition: include/cutlass/gemm/device/gemm.h:325

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::Arguments::Arguments
CUTLASS_HOST_DEVICE Arguments(GemmCoord problem_size_, TensorRef< ElementA const, LayoutA > ref_A_, TensorRef< ElementB const, LayoutB > ref_B_, TensorRef< ElementC const, LayoutC > ref_C_, TensorRef< ElementC, LayoutC > ref_D_, typename EpilogueOutputOp::Params epilogue_=typename EpilogueOutputOp::Params(), int split_k_slices=1)
Constructs an Arguments structure.
Definition: include/cutlass/gemm/device/gemm.h:618

cutlass::gemm::device::Gemm::LayoutC
LayoutC_ LayoutC
Definition: include/cutlass/gemm/device/gemm.h:226

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::LayoutB
typename layout::LayoutTranspose< LayoutA >::type LayoutB
Definition: include/cutlass/gemm/device/gemm.h:223

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::Arguments::ref_C
TensorRef< ElementC const, LayoutC > ref_C
Definition: include/cutlass/gemm/device/gemm.h:603

cutlass::gemm::device::Gemm::Arguments
Argument structure.
Definition: include/cutlass/gemm/device/gemm.h:270

default_gemm_configuration.h
Definitions for GEMM structures.

cutlass::layout::RowMajor
Mapping function for row-major matrices.
Definition: layout/matrix.h:50

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::initialize
Status initialize(Arguments const &args, void *workspace=nullptr, cudaStream_t stream=nullptr)
Initializes GEMM state from arguments.
Definition: include/cutlass/gemm/device/gemm.h:672

cutlass::gemm::device::Gemm::run
Status run(cudaStream_t stream=nullptr)
Runs the kernel using initialized state.
Definition: include/cutlass/gemm/device/gemm.h:435

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::EpilogueOutputOp
EpilogueOutputOp EpilogueOutputOp
Definition: include/cutlass/gemm/device/gemm.h:235

cutlass::gemm::device::Gemm::initialize
Status initialize(Arguments const &args, void *workspace=nullptr, cudaStream_t stream=nullptr)
Initializes GEMM state from arguments.
Definition: include/cutlass/gemm/device/gemm.h:369

cutlass::Status::kErrorWorkspaceNull
The given workspace is null when it is required to be non-null.

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::WarpShape
WarpShape WarpShape
Definition: include/cutlass/gemm/device/gemm.h:233

cutlass::Status::kSuccess
Operation was successful.

cutlass::gemm::device::Gemm::kAlignmentC
static int const kAlignmentC
Definition: include/cutlass/gemm/device/gemm.h:241

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::to_underlying_arguments
static UnderlyingArguments to_underlying_arguments(Arguments const &args)
Helper to construct a transposed equivalent for the underlying GEMM operator.
Definition: include/cutlass/gemm/device/gemm.h:647

cutlass::gemm::GemmCoord::m
CUTLASS_HOST_DEVICE Index const & m() const
Returns the GEMM M coordinate.
Definition: include/cutlass/gemm/gemm.h:129

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::ElementB
ElementA ElementB
Definition: include/cutlass/gemm/device/gemm.h:222

threadblock_swizzle.h
Implements several possible threadblock-swizzling functions mapping blockIdx to GEMM problems...

arch.h
Defines tags for architecture-specific configurations.

cutlass::gemm::device::Gemm::Arguments::ref_D
TensorRef< ElementC, LayoutC > ref_D
Definition: include/cutlass/gemm/device/gemm.h:280

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::Arguments::ref_D
TensorRef< ElementC, LayoutC > ref_D
Definition: include/cutlass/gemm/device/gemm.h:604

cutlass::gemm::device::Gemm::kIsBetaZero
static bool const kIsBetaZero
Definition: include/cutlass/gemm/device/gemm.h:243

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::GemmKernel
typename UnderlyingOperator::GemmKernel GemmKernel
Definition: include/cutlass/gemm/device/gemm.h:590

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::ArchTag
ArchTag ArchTag
Definition: include/cutlass/gemm/device/gemm.h:231

cutlass.h
Basic include for CUTLASS.

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::operator()
Status operator()(Arguments const &args, void *workspace=nullptr, cudaStream_t stream=nullptr)
Runs the kernel using initialized state.
Definition: include/cutlass/gemm/device/gemm.h:695

cutlass::Status
Status
Status code returned by CUTLASS operations.
Definition: cutlass.h:39

gemm.h
Template for a pipelined GEMM kernel. Does not compute batching or support split-K.

cutlass::gemm::device::Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >::Arguments::ref_B
TensorRef< ElementB const, LayoutB > ref_B
Definition: include/cutlass/gemm/device/gemm.h:602

cutlass::gemm::device::Gemm< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, kAlignmentB, kAlignmentA, SplitKSerial, Operator, kIsBetaZero >::LayoutA
typename layout::LayoutTranspose< LayoutB >::type LayoutA
Definition: include/cutlass/gemm/device/gemm.h:220