cutlass/include_2cutlass_2gemm_2device_2gemm__complex_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/arch/arch.h"
 #include "cutlass/device_kernel.h"

 #include "cutlass/gemm/threadblock/threadblock_swizzle.h"
 #include "cutlass/gemm/kernel/gemm.h"

 #include "cutlass/gemm/kernel/default_gemm_complex.h"
 #include "cutlass/gemm/device/default_gemm_configuration.h"


 namespace cutlass {
 namespace gemm {
 namespace device {


 /// Device-level complex-valued GEMM operator.
 ///
 /// Selects a kernel through kernel::DefaultGemmComplex and provides the host-side steps around
 /// it: argument validation, workspace sizing, construction of the kernel Params structure, and
 /// the kernel launch itself.
 template <
     /// Element type of the A operand
     typename ElementA_,
     /// Layout type of the A operand
     typename LayoutA_,
     /// Element type of the B operand
     typename ElementB_,
     /// Layout type of the B operand
     typename LayoutB_,
     /// Element type of the C (source) and D (destination) operands
     typename ElementC_,
     /// Layout type of the C and D operands
     typename LayoutC_,
     /// Element type used for accumulation; defaults to the C/D element type
     typename ElementAccumulator_ = ElementC_,
     /// Operator class tag
     typename OperatorClass_ = arch::OpClassSimt,
     /// Architecture tag
     typename ArchTag_ = arch::Sm70,
     /// Threadblock-level tile shape; defaults to the value from DefaultGemmConfiguration
     typename ThreadblockShape_ = typename DefaultGemmConfiguration<
         OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
         ElementAccumulator_>::ThreadblockShape,
     /// Warp-level tile shape; defaults to the value from DefaultGemmConfiguration
     typename WarpShape_ = typename DefaultGemmConfiguration<
         OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
         ElementAccumulator_>::WarpShape,
     /// Instruction-level tile shape; defaults to the value from DefaultGemmConfiguration
     typename InstructionShape_ = typename DefaultGemmConfiguration<
         OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
         ElementAccumulator_>::InstructionShape,
     /// Epilogue output operator; defaults to the value from DefaultGemmConfiguration
     typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
         OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
         ElementAccumulator_>::EpilogueOutputOp,
     /// Threadblock swizzling function
     typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle,
     /// Number of stages; defaults to the value from DefaultGemmConfiguration
     int Stages =
         DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
                                  ElementC_, ElementAccumulator_>::kStages,
     /// Complex transformation applied to the A operand
     ComplexTransform TransformA = ComplexTransform::kNone,
     /// Complex transformation applied to the B operand
     ComplexTransform TransformB = ComplexTransform::kNone,
     /// If true, the kernel supports serial split-K (split_k_slices > 1)
     bool SplitKSerial = false
 >
 class GemmComplex {
  public:

   using ElementA = ElementA_;
   using LayoutA = LayoutA_;
   using TensorRefA = TensorRef<ElementA const, LayoutA>;
   using ElementB = ElementB_;
   using LayoutB = LayoutB_;
   using TensorRefB = TensorRef<ElementB const, LayoutB>;
   using ElementC = ElementC_;
   using LayoutC = LayoutC_;
   using TensorRefC = TensorRef<ElementC const, LayoutC>;
   using TensorRefD = TensorRef<ElementC, LayoutC>;
   using ElementAccumulator = ElementAccumulator_;
   using OperatorClass = OperatorClass_;
   using ArchTag = ArchTag_;
   using ThreadblockShape = ThreadblockShape_;
   using WarpShape = WarpShape_;
   using InstructionShape = InstructionShape_;
   using EpilogueOutputOp = EpilogueOutputOp_;
   using ThreadblockSwizzle = ThreadblockSwizzle_;
   static int const kStages = Stages;
   static ComplexTransform const kTransformA = TransformA;
   static ComplexTransform const kTransformB = TransformB;
   static bool const kSplitKSerial = SplitKSerial;

   /// Kernel type launched by this operator
   using GemmKernel = typename kernel::DefaultGemmComplex<
     ElementA,
     LayoutA,
     ElementB,
     LayoutB,
     ElementC,
     LayoutC,
     ElementAccumulator,
     OperatorClass,
     ArchTag,
     ThreadblockShape,
     WarpShape,
     InstructionShape,
     EpilogueOutputOp,
     ThreadblockSwizzle,
     kStages,
     kTransformA,
     kTransformB,
     kSplitKSerial
   >::GemmKernel;

   /// Argument structure
   struct Arguments {

     //
     // Data members
     //

     GemmCoord problem_size;
     TensorRef<ElementA const, LayoutA> ref_A;
     TensorRef<ElementB const, LayoutB> ref_B;
     TensorRef<ElementC const, LayoutC> ref_C;
     TensorRef<ElementC, LayoutC> ref_D;
     typename EpilogueOutputOp::Params epilogue;
     int split_k_slices;

     //
     // Methods
     //

     /// Default ctor: zero-sized problem with a single k-slice
     CUTLASS_HOST_DEVICE
     Arguments(): problem_size(0, 0, 0), split_k_slices(1) {

     }

     /// Constructs an Arguments structure
     CUTLASS_HOST_DEVICE
     Arguments(
       GemmCoord problem_size_,
       TensorRef<ElementA const, LayoutA> ref_A_,
       TensorRef<ElementB const, LayoutB> ref_B_,
       TensorRef<ElementC const, LayoutC> ref_C_,
       TensorRef<ElementC, LayoutC> ref_D_,
       typename EpilogueOutputOp::Params epilogue_ =
         typename EpilogueOutputOp::Params(),
       int split_k_slices = 1
     ):
       problem_size(problem_size_),
       ref_A(ref_A_),
       ref_B(ref_B_),
       ref_C(ref_C_),
       ref_D(ref_D_),
       epilogue(epilogue_),
       split_k_slices(split_k_slices) {

     }
   };

 private:

   /// Kernel parameters object, populated by initialize() and patched by update()
   typename GemmKernel::Params params_;

 public:

   /// Constructs the GEMM.
   GemmComplex() { }

   /// Determines whether the GEMM can execute the given problem.
   ///
   /// Returns Status::kErrorInvalidProblem when more than one k-slice is requested but the
   /// operator was not instantiated with SplitKSerial; otherwise Status::kSuccess.
   static Status can_implement(Arguments const &args) {

     if (!kSplitKSerial && args.split_k_slices > 1) {
       return Status::kErrorInvalidProblem;
     }

     return Status::kSuccess;
   }

   /// Gets the workspace size in bytes.
   ///
   /// A workspace is only needed for serial split-K with more than one slice: one int per
   /// (m, n) tile of the tiled problem shape. Returns 0 in every other case.
   static size_t get_workspace_size(Arguments const &args) {

     if (kSplitKSerial && args.split_k_slices > 1) {

       // Determine grid shape
       ThreadblockSwizzle threadblock_swizzle;

       cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
         args.problem_size,
         {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
         args.split_k_slices);

       return sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
     }

     return 0;
   }

   /// Initializes GEMM state from arguments.
   ///
   /// When serial split-K is active, `workspace` must be non-null; it is zeroed asynchronously
   /// on `stream` and handed to the kernel as its semaphore array.
   ///
   /// Returns kErrorWorkspaceNull if a required workspace is missing, kErrorInternal if the
   /// memset fails, kErrorInvalidProblem if split-K is requested but unsupported, and kSuccess
   /// otherwise.
   Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {

     // Determine grid shape
     ThreadblockSwizzle threadblock_swizzle;

     cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
       args.problem_size,
       {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
       args.split_k_slices);

     if (kSplitKSerial) {
       if (args.split_k_slices > 1) {
         if (!workspace) {
           return Status::kErrorWorkspaceNull;
         }

         size_t bytes = get_workspace_size(args);

         // Clear the semaphore workspace on the caller's stream
         cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);

         if (result != cudaSuccess) {
           return Status::kErrorInternal;
         }
       }
     }
     else {

       if (args.split_k_slices > 1) {
         return Status::kErrorInvalidProblem;
       }
     }

     // Initialize the Params structure
     params_ = typename GemmKernel::Params{
       args.problem_size,
       grid_shape,
       args.ref_A.non_const_ref(),
       args.ref_B.non_const_ref(),
       args.ref_C.non_const_ref(),
       args.ref_D,
       args.epilogue,
       static_cast<int *>(workspace)
     };

     return Status::kSuccess;
   }

   /// Lightweight update given a subset of arguments.
   ///
   /// Only the data pointers of A, B, C and D and the semaphore pointer are replaced; the
   /// problem size, grid shape and epilogue parameters stored by initialize() are left as-is.
   /// The workspace is not cleared here.
   Status update(Arguments const &args, void *workspace = nullptr) {

     if (kSplitKSerial && args.split_k_slices > 1) {
       if (!workspace) {
         return Status::kErrorWorkspaceNull;
       }
     }

     params_.ref_A.reset(args.ref_A.non_const_ref().data());
     params_.ref_B.reset(args.ref_B.non_const_ref().data());
     params_.ref_C.reset(args.ref_C.non_const_ref().data());
     params_.ref_D.reset(args.ref_D.data());
     params_.semaphore = static_cast<int *>(workspace);

     return Status::kSuccess;
   }

   /// Runs the kernel using initialized state.
   ///
   /// Returns kErrorInternal if a CUDA function-attribute call fails or cudaGetLastError()
   /// reports an error after the launch; kSuccess otherwise.
   Status run(cudaStream_t stream = nullptr) {

     ThreadblockSwizzle threadblock_swizzle;

     dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
     dim3 block(GemmKernel::kThreadCount, 1, 1);

     cudaError_t result;

     int smem_size = int(sizeof(typename GemmKernel::SharedStorage));

     // At 48 KB or more of shared storage, set the kernel's dynamic shared memory attributes
     // before launching.
     if (smem_size >= (48 << 10)) {
       result = cudaFuncSetAttribute(Kernel<GemmKernel>,
                                     cudaFuncAttributeMaxDynamicSharedMemorySize,
                                     smem_size);

       if (result != cudaSuccess) {
         return Status::kErrorInternal;
       }

       result = cudaFuncSetAttribute(
           Kernel<GemmKernel>,
           cudaFuncAttributePreferredSharedMemoryCarveout, 100);

       if (result != cudaSuccess) {
         return Status::kErrorInternal;
       }
     }

     cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);

     result = cudaGetLastError();

     return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
   }

   /// Runs the kernel using initialized state.
   Status operator()(cudaStream_t stream = nullptr) {
     return run(stream);
   }

   /// Initializes from `args` and, on success, runs the kernel.
   ///
   /// `stream` is passed to both initialize() and run(), so the workspace clear is enqueued on
   /// the same stream as the kernel launch.
   Status operator()(
     Arguments const &args,
     void *workspace = nullptr,
     cudaStream_t stream = nullptr) {

     Status status = initialize(args, workspace, stream);

     if (status == Status::kSuccess) {
       status = run(stream);
     }

     return status;
   }
 };


 template <
   typename ElementA_,
   typename LayoutA_,
   typename ElementB_,
   typename LayoutB_,
   typename ElementC_,
   typename ElementAccumulator_,
   typename OperatorClass_,
   typename ArchTag_,
   typename ThreadblockShape_,
   typename WarpShape_,
   typename InstructionShape_,
   typename EpilogueOutputOp_,
   typename ThreadblockSwizzle_,
   int Stages,
   ComplexTransform TransformA,
   ComplexTransform TransformB,
   bool SplitKSerial
 >
 class GemmComplex<
   ElementA_,
   LayoutA_,
   ElementB_,
   LayoutB_,
   ElementC_,
   layout::ColumnMajor,    // partially specialized on LayoutC
   ElementAccumulator_,
   OperatorClass_,
   ArchTag_,
   ThreadblockShape_,
   WarpShape_,
   InstructionShape_,
   EpilogueOutputOp_,
   ThreadblockSwizzle_,
   Stages,
   TransformA,
   TransformB,
   SplitKSerial
 > {
 public:

   using ElementA = ElementA_;
   using LayoutA = LayoutA_;
   using TensorRefA = TensorRef<ElementA const, LayoutA>;
   using ElementB = ElementB_;
   using LayoutB = LayoutB_;
   using TensorRefB = TensorRef<ElementB const, LayoutB>;
   using ElementC = ElementC_;
   using LayoutC = layout::ColumnMajor;
   using TensorRefC = TensorRef<ElementC const, LayoutC>;
   using TensorRefD = TensorRef<ElementC, LayoutC>;
   using ElementAccumulator = ElementAccumulator_;
   using OperatorClass = OperatorClass_;
   using ArchTag = ArchTag_;
   using ThreadblockShape = ThreadblockShape_;
   using WarpShape = WarpShape_;
   using InstructionShape = InstructionShape_;
   using EpilogueOutputOp = EpilogueOutputOp_;
   using ThreadblockSwizzle = ThreadblockSwizzle_;
   static int const kStages = Stages;
   static bool const kSplitKSerial = SplitKSerial;

   using UnderlyingOperator = GemmComplex<
     ElementB,
     typename layout::LayoutTranspose<LayoutB>::type,
     ElementA,
     typename layout::LayoutTranspose<LayoutA>::type,
     ElementC,
     layout::RowMajor,
     ElementAccumulator,
     OperatorClass,
     ArchTag,
     ThreadblockShape,
     WarpShape,
     InstructionShape,
     EpilogueOutputOp,
     ThreadblockSwizzle,
     Stages,
     TransformA,
     TransformB,
     SplitKSerial
   >;

   using UnderlyingArguments = typename UnderlyingOperator::Arguments;
   using GemmKernel = typename UnderlyingOperator::GemmKernel;

   struct Arguments {

     //
     // Data members
     //

     GemmCoord problem_size;
     TensorRef<ElementA const, LayoutA> ref_A;
     TensorRef<ElementB const, LayoutB> ref_B;
     TensorRef<ElementC const, LayoutC> ref_C;
     TensorRef<ElementC, LayoutC> ref_D;
     typename EpilogueOutputOp::Params epilogue;
     int split_k_slices;

     //
     // Methods
     //

     CUTLASS_HOST_DEVICE
     Arguments() { }

     CUTLASS_HOST_DEVICE
     Arguments(
       GemmCoord problem_size_,
       TensorRef<ElementA const, LayoutA> ref_A_,
       TensorRef<ElementB const, LayoutB> ref_B_,
       TensorRef<ElementC const, LayoutC> ref_C_,
       TensorRef<ElementC, LayoutC> ref_D_,
       typename EpilogueOutputOp::Params epilogue_ =
         typename EpilogueOutputOp::Params(),
       int split_k_slices = 1
     ):
       problem_size(problem_size_),
       ref_A(ref_A_),
       ref_B(ref_B_),
       ref_C(ref_C_),
       ref_D(ref_D_),
       epilogue(epilogue_),
       split_k_slices(split_k_slices) { }
   };

 private:

   UnderlyingOperator underlying_operator_;

 public:

   GemmComplex() { }

   static UnderlyingArguments to_underlying_arguments(Arguments const &args) {
     return UnderlyingArguments(
       {args.problem_size.n(), args.problem_size.m(), args.problem_size.k()},
       {args.ref_B.data(), args.ref_B.stride(0)},
       {args.ref_A.data(), args.ref_A.stride(0)},
       {args.ref_C.data(), args.ref_C.stride(0)},
       {args.ref_D.data(), args.ref_D.stride(0)},
       args.epilogue,
       args.split_k_slices
     );
   }

   static Status can_implement(Arguments const &args) {

     return UnderlyingOperator::can_implement(to_underlying_arguments(args));
   }

   static size_t get_workspace_size(Arguments const &args) {

     return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
   }

   Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {

     return underlying_operator_.initialize(to_underlying_arguments(args), workspace);
   }

   Status update(Arguments const &args, void *workspace = nullptr) {

     return underlying_operator_.update(to_underlying_arguments(args), workspace);
   }

   Status run(cudaStream_t stream = nullptr) {

     return underlying_operator_.run(stream);
   }

   Status operator()(cudaStream_t stream = nullptr) {
     return run(stream);
   }

   Status operator()(
     Arguments const &args,
     void *workspace = nullptr,
     cudaStream_t stream = nullptr) {

     Status status = initialize(args, workspace);

     if (status == Status::kSuccess) {
       status = run(stream);
     }

     return status;
   }
 };


 } // namespace device
 } // namespace gemm
 } // namespace cutlass

cutlass::gemm::device::GemmComplex::operator()
Status operator()(Arguments const &args, void *workspace=nullptr, cudaStream_t stream=nullptr)
Runs the kernel using initialized state.
Definition: include/cutlass/gemm/device/gemm_complex.h:450

cutlass::gemm::device::GemmComplex::kTransformA
static ComplexTransform const kTransformA
Definition: include/cutlass/gemm/device/gemm_complex.h:229

cutlass::gemm::device::GemmComplex::Arguments::ref_A
TensorRef< ElementA const, LayoutA > ref_A
Definition: include/cutlass/gemm/device/gemm_complex.h:263

cutlass::gemm::device::GemmComplex
Definition: include/cutlass/gemm/device/gemm_complex.h:207

cutlass
Definition: aligned_buffer.h:35

cutlass::ComplexTransform
ComplexTransform
Enumerated type describing a transformation on a complex value.
Definition: complex.h:43

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::ElementB
ElementA ElementB
Definition: include/cutlass/gemm/device/gemm_complex.h:213

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::Arguments::ref_C
TensorRef< ElementC const, LayoutC > ref_C
Definition: include/cutlass/gemm/device/gemm_complex.h:581

cutlass::gemm::device::GemmComplex::run
Status run(cudaStream_t stream=nullptr)
Runs the kernel using initialized state.
Definition: include/cutlass/gemm/device/gemm_complex.h:409

cutlass::Status::kErrorInvalidProblem
Specified problem size is not supported by operator.

cutlass::gemm::device::GemmComplex::Arguments::problem_size
GemmCoord problem_size
Definition: include/cutlass/gemm/device/gemm_complex.h:262

cutlass::gemm::device::GemmComplex::can_implement
static Status can_implement(Arguments const &args)
Determines whether the GEMM can execute the given problem.
Definition: include/cutlass/gemm/device/gemm_complex.h:314

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::EpilogueOutputOp
EpilogueOutputOp EpilogueOutputOp
Definition: include/cutlass/gemm/device/gemm_complex.h:226

cutlass::gemm::device::GemmComplex::kStages
static int const kStages
Definition: include/cutlass/gemm/device/gemm_complex.h:228

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::ArchTag
ArchTag ArchTag
Definition: include/cutlass/gemm/device/gemm_complex.h:222

cutlass::gemm::device::GemmComplex::Arguments::ref_B
TensorRef< ElementB const, LayoutB > ref_B
Definition: include/cutlass/gemm/device/gemm_complex.h:264

cutlass::gemm::GemmCoord
Definition: include/cutlass/gemm/gemm.h:94

cutlass::gemm::device::GemmComplex::Arguments::epilogue
EpilogueOutputOp::Params epilogue
Definition: include/cutlass/gemm/device/gemm_complex.h:267

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::operator()
Status operator()(Arguments const &args, void *workspace=nullptr, cudaStream_t stream=nullptr)
Runs the kernel using initialized state.
Definition: include/cutlass/gemm/device/gemm_complex.h:673

cutlass::gemm::GemmCoord::n
CUTLASS_HOST_DEVICE Index const & n() const
Returns the GEMM N coordinate.
Definition: include/cutlass/gemm/gemm.h:137

cutlass::gemm::device::GemmComplex::initialize
Status initialize(Arguments const &args, void *workspace=nullptr, cudaStream_t stream=nullptr)
Initializes GEMM state from arguments.
Definition: include/cutlass/gemm/device/gemm_complex.h:343

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::Arguments::split_k_slices
int split_k_slices
Definition: include/cutlass/gemm/device/gemm_complex.h:584

cutlass::ComplexTransform::kNone

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::to_underlying_arguments
static UnderlyingArguments to_underlying_arguments(Arguments const &args)
Helper to construct a transposed equivalent for the underlying GEMM operator.
Definition: include/cutlass/gemm/device/gemm_complex.h:625

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::LayoutA
typename layout::LayoutTranspose< LayoutB >::type LayoutA
Definition: include/cutlass/gemm/device/gemm_complex.h:211

cutlass::layout::ColumnMajor
Mapping function for column-major matrices.
Definition: layout/matrix.h:142

cutlass::gemm::device::GemmComplex::Arguments
Argument structure.
Definition: include/cutlass/gemm/device/gemm_complex.h:256

cutlass::gemm::device::GemmComplex::get_workspace_size
static size_t get_workspace_size(Arguments const &args)
Gets the workspace size.
Definition: include/cutlass/gemm/device/gemm_complex.h:324

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::initialize
Status initialize(Arguments const &args, void *workspace=nullptr, cudaStream_t stream=nullptr)
Initializes GEMM state from arguments.
Definition: include/cutlass/gemm/device/gemm_complex.h:650

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::run
Status run(cudaStream_t stream=nullptr)
Runs the kernel using initialized state.
Definition: include/cutlass/gemm/device/gemm_complex.h:662

cutlass::gemm::device::GemmComplex::LayoutC
LayoutC_ LayoutC
Definition: include/cutlass/gemm/device/gemm_complex.h:217

cutlass::gemm::device::GemmComplex::Arguments::Arguments
CUTLASS_HOST_DEVICE Arguments()
Default ctor.
Definition: include/cutlass/gemm/device/gemm_complex.h:276

cutlass::layout::LayoutTranspose
Defines transposes of matrix layouts.
Definition: layout/matrix.h:921

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::ElementC
ElementC ElementC
Definition: include/cutlass/gemm/device/gemm_complex.h:216

cutlass::TensorRef< ElementA const, LayoutA >

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::ElementAccumulator
ElementAccumulator ElementAccumulator
Definition: include/cutlass/gemm/device/gemm_complex.h:220

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::get_workspace_size
static size_t get_workspace_size(Arguments const &args)
Gets the workspace size.
Definition: include/cutlass/gemm/device/gemm_complex.h:644

cutlass::Status::kErrorInternal
An error within CUTLASS occurred.

cutlass::gemm::device::GemmComplex::operator()
Status operator()(cudaStream_t stream=nullptr)
Runs the kernel using initialized state.
Definition: include/cutlass/gemm/device/gemm_complex.h:445

device_kernel.h
Template for generic CUTLASS kernel.

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::GemmComplex
GemmComplex()
Constructs the GEMM.
Definition: include/cutlass/gemm/device/gemm_complex.h:622

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::operator()
Status operator()(cudaStream_t stream=nullptr)
Runs the kernel using initialized state.
Definition: include/cutlass/gemm/device/gemm_complex.h:668

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::ThreadblockSwizzle
ThreadblockSwizzle ThreadblockSwizzle
Definition: include/cutlass/gemm/device/gemm_complex.h:227

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::gemm::device::GemmComplex::GemmComplex
GemmComplex()
Constructs the GEMM.
Definition: include/cutlass/gemm/device/gemm_complex.h:311

default_gemm_configuration.h
Definitions for GEMM structures.

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::GemmKernel
typename UnderlyingOperator::GemmKernel GemmKernel
Definition: include/cutlass/gemm/device/gemm_complex.h:569

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::Arguments::problem_size
GemmCoord problem_size
Definition: include/cutlass/gemm/device/gemm_complex.h:578

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::LayoutB
typename layout::LayoutTranspose< LayoutA >::type LayoutB
Definition: include/cutlass/gemm/device/gemm_complex.h:214

cutlass::gemm::device::GemmComplex::Arguments::ref_D
TensorRef< ElementC, LayoutC > ref_D
Definition: include/cutlass/gemm/device/gemm_complex.h:266

cutlass::layout::RowMajor
Mapping function for row-major matrices.
Definition: layout/matrix.h:50

cutlass::gemm::device::GemmComplex::Arguments::ref_C
TensorRef< ElementC const, LayoutC > ref_C
Definition: include/cutlass/gemm/device/gemm_complex.h:265

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::UnderlyingArguments
typename UnderlyingOperator::Arguments UnderlyingArguments
Definition: include/cutlass/gemm/device/gemm_complex.h:568

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::Arguments::ref_B
TensorRef< ElementB const, LayoutB > ref_B
Definition: include/cutlass/gemm/device/gemm_complex.h:580

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::WarpShape
WarpShape WarpShape
Definition: include/cutlass/gemm/device/gemm_complex.h:224

cutlass::gemm::device::GemmComplex::update
Status update(Arguments const &args, void *workspace=nullptr)
Lightweight update given a subset of arguments.
Definition: include/cutlass/gemm/device/gemm_complex.h:391

cutlass::gemm::device::GemmComplex::kSplitKSerial
static bool const kSplitKSerial
Definition: include/cutlass/gemm/device/gemm_complex.h:231

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::update
Status update(Arguments const &args, void *workspace=nullptr)
Lightweight update given a subset of arguments.
Definition: include/cutlass/gemm/device/gemm_complex.h:656

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::Arguments::Arguments
CUTLASS_HOST_DEVICE Arguments()
Default ctor.
Definition: include/cutlass/gemm/device/gemm_complex.h:592

cutlass::Status::kErrorWorkspaceNull
The given workspace is null when it is required to be non-null.

cutlass::Status::kSuccess
Operation was successful.

cutlass::gemm::GemmCoord::m
CUTLASS_HOST_DEVICE Index const & m() const
Returns the GEMM M coordinate.
Definition: include/cutlass/gemm/gemm.h:129

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::can_implement
static Status can_implement(Arguments const &args)
Determines whether the GEMM can execute the given problem.
Definition: include/cutlass/gemm/device/gemm_complex.h:638

threadblock_swizzle.h
Implements several possible threadblock-swizzling functions mapping blockIdx to GEMM problems...

arch.h
Defines tags for architecture-specific configurations.

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::Arguments::ref_D
TensorRef< ElementC, LayoutC > ref_D
Definition: include/cutlass/gemm/device/gemm_complex.h:582

cutlass::gemm::device::GemmComplex::Arguments::split_k_slices
int split_k_slices
Definition: include/cutlass/gemm/device/gemm_complex.h:268

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::InstructionShape
InstructionShape InstructionShape
Definition: include/cutlass/gemm/device/gemm_complex.h:225

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::OperatorClass
OperatorClass OperatorClass
Definition: include/cutlass/gemm/device/gemm_complex.h:221

cutlass::gemm::device::GemmComplex::Arguments::Arguments
CUTLASS_HOST_DEVICE Arguments(GemmCoord problem_size_, TensorRef< ElementA const, LayoutA > ref_A_, TensorRef< ElementB const, LayoutB > ref_B_, TensorRef< ElementC const, LayoutC > ref_C_, TensorRef< ElementC, LayoutC > ref_D_, typename EpilogueOutputOp::Params epilogue_=typename EpilogueOutputOp::Params(), int split_k_slices=1)
Constructs an Arguments structure.
Definition: include/cutlass/gemm/device/gemm_complex.h:282

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::GemmKernel
typename kernel::DefaultGemmComplex< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, kStages, kTransformA, kTransformB, kSplitKSerial >::GemmKernel GemmKernel
Define the kernel.
Definition: include/cutlass/gemm/device/gemm_complex.h:253

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::ThreadblockShape
ThreadblockShape ThreadblockShape
Definition: include/cutlass/gemm/device/gemm_complex.h:223

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::Arguments::epilogue
EpilogueOutputOp::Params epilogue
Definition: include/cutlass/gemm/device/gemm_complex.h:583

cutlass::gemm::device::GemmComplex< ElementB, typename layout::LayoutTranspose< LayoutB >::type, ElementA, typename layout::LayoutTranspose< LayoutA >::type, ElementC, layout::RowMajor, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, SplitKSerial >::ElementA
ElementB ElementA
Definition: include/cutlass/gemm/device/gemm_complex.h:210

cutlass::gemm::device::GemmComplex::kTransformB
static ComplexTransform const kTransformB
Definition: include/cutlass/gemm/device/gemm_complex.h:230

cutlass.h
Basic include for CUTLASS.

cutlass::Status
Status
Status code returned by CUTLASS operations.
Definition: cutlass.h:39

gemm.h
Template for a pipelined GEMM kernel. Does not perform batched computation or support split-K.

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::Arguments::Arguments
CUTLASS_HOST_DEVICE Arguments(GemmCoord problem_size_, TensorRef< ElementA const, LayoutA > ref_A_, TensorRef< ElementB const, LayoutB > ref_B_, TensorRef< ElementC const, LayoutC > ref_C_, TensorRef< ElementC, LayoutC > ref_D_, typename EpilogueOutputOp::Params epilogue_=typename EpilogueOutputOp::Params(), int split_k_slices=1)
Constructs an Arguments structure.
Definition: include/cutlass/gemm/device/gemm_complex.h:596

cutlass::gemm::device::GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >::Arguments::ref_A
TensorRef< ElementA const, LayoutA > ref_A
Definition: include/cutlass/gemm/device/gemm_complex.h:579