CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
include/cutlass/gemm/kernel/gemm.h
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 
30 #pragma once
31 
32 #include "cutlass/cutlass.h"
33 
34 #include "cutlass/gemm/gemm.h"
35 #include "cutlass/matrix_coord.h"
36 #include "cutlass/semaphore.h"
37 
39 
40 namespace cutlass {
41 namespace gemm {
42 namespace kernel {
43 
45 
46 template <
47  typename Mma_,
48  typename Epilogue_,
49  typename ThreadblockSwizzle_,
50  bool SplitKSerial
51 >
52 struct Gemm {
53 
54  using Mma = Mma_;
55  using Epilogue = Epilogue_;
56  using OutputOp = typename Epilogue::OutputOp;
57  using ThreadblockSwizzle = ThreadblockSwizzle_;
58  static bool const kSplitKSerial = SplitKSerial;
59 
60  /// Warp count (concept: GemmShape)
61  using WarpCount = typename Mma::WarpCount;
62  static int const kThreadCount = 32 * WarpCount::kCount;
63 
64  /// Parameters structure
65  struct Params {
66  cutlass::gemm::GemmCoord problem_size;
67  cutlass::gemm::GemmCoord grid_tiled_shape;
68  typename Mma::IteratorA::Params params_A;
69  typename Mma::IteratorA::TensorRef ref_A;
70  typename Mma::IteratorB::Params params_B;
71  typename Mma::IteratorB::TensorRef ref_B;
72  typename Epilogue::OutputTileIterator::Params params_C;
73  typename Epilogue::OutputTileIterator::TensorRef ref_C;
74  typename Epilogue::OutputTileIterator::Params params_D;
75  typename Epilogue::OutputTileIterator::TensorRef ref_D;
76  typename OutputOp::Params output_op;
77  int *semaphore;
78  int gemm_k_iterations;
79  int gemm_k_size;
80 
81  //
82  // Methods
83  //
84 
85  CUTLASS_HOST_DEVICE
86  Params() { }
87 
88  CUTLASS_HOST_DEVICE
89  Params(
90  cutlass::gemm::GemmCoord const & problem_size,
91  cutlass::gemm::GemmCoord const & grid_tiled_shape,
92  typename Mma::IteratorA::TensorRef ref_A,
93  typename Mma::IteratorB::TensorRef ref_B,
94  typename Epilogue::OutputTileIterator::TensorRef ref_C,
95  typename Epilogue::OutputTileIterator::TensorRef ref_D,
96  typename OutputOp::Params output_op = typename OutputOp::Params(),
97  int *semaphore = nullptr
98  ):
99  problem_size(problem_size),
100  grid_tiled_shape(grid_tiled_shape),
101  params_A(ref_A.layout()),
102  ref_A(ref_A),
103  params_B(ref_B.layout()),
104  ref_B(ref_B),
105  params_C(ref_C.layout()),
106  ref_C(ref_C),
107  params_D(ref_D.layout()),
108  ref_D(ref_D),
109  output_op(output_op),
110  semaphore(semaphore) {
111 
112  int total_gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
113  int gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
114 
115  gemm_k_size = gemm_k_iterations * Mma::Shape::kK;
116  }
117  };
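// Worked example of the split-K partitioning above, using illustrative values
// (not taken from this file): suppose problem_size.k() == 4096, Mma::Shape::kK == 32,
// and grid_tiled_shape.k() == 3 split-K partitions. Then
//   total_gemm_k_iterations = (4096 + 32 - 1) / 32 = 128
//   gemm_k_iterations       = (128  +  3 - 1) /  3 =  43
//   gemm_k_size             = 43 * 32              = 1376
// so partitions 0 and 1 each cover 1376 elements of K, and partition 2 covers the
// remaining 4096 - 2 * 1376 = 1344, because operator() clamps its K extent with
// min(problem_size.k(), (k + 1) * gemm_k_size).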
118 
119  /// Shared memory storage structure
120  union SharedStorage {
121  typename Mma::SharedStorage main_loop;
122  typename Epilogue::SharedStorage epilogue;
123  };
124 
125  //
126  // Methods
127  //
128 
129  CUTLASS_HOST_DEVICE
130  Gemm() { }
131 
132  /// Determines whether kernel satisfies alignment
133  static Status can_implement(
134  cutlass::gemm::GemmCoord const & problem_size,
135  typename Mma::IteratorA::TensorRef ref_A,
136  typename Mma::IteratorB::TensorRef ref_B,
137  typename Epilogue::OutputTileIterator::TensorRef ref_C,
138  typename Epilogue::OutputTileIterator::TensorRef ref_D) {
139 
140  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
141  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
142  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
143 
144  if (!TensorRef_aligned(ref_A, kAlignmentA)) {
145  return Status::kErrorMisalignedOperand;
146  }
147 
148  if (!TensorRef_aligned(ref_B, kAlignmentB)) {
149  return Status::kErrorMisalignedOperand;
150  }
151 
152  if (!TensorRef_aligned(ref_C, kAlignmentC)) {
153  return Status::kErrorMisalignedOperand;
154  }
155 
156  if (!TensorRef_aligned(ref_D, kAlignmentC)) {
157  return Status::kErrorMisalignedOperand;
158  }
159 
160  if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
161  (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
162  (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
163 
164  return Status::kErrorMisalignedOperand;
165  }
166 
167  return Status::kSuccess;
168  }
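// Illustration of the checks above, assuming a kernel configured for 128-bit accesses on
// half-precision operands, i.e. kAlignmentA == kAlignmentB == kAlignmentC == 8 elements:
// a 4096x4096x4096 problem satisfies every modulus test, while m == 4095 fails
// (problem_size.m() % kAlignmentA) and (problem_size.m() % kAlignmentC), so
// can_implement returns Status::kErrorMisalignedOperand.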
169 
170  /// Executes one GEMM
171  CUTLASS_DEVICE
172  void operator()(Params const &params, SharedStorage &shared_storage) {
173 
174  // Compute threadblock location
175  ThreadblockSwizzle threadblock_swizzle;
176 
177  cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset();
178 
179  // Early exit if CTA is out of range
180  if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
181  params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
182 
183  return;
184  }
185 
186  // Compute initial location in logical coordinates
187  cutlass::MatrixCoord tb_offset_A{
188  threadblock_tile_offset.m() * Mma::Shape::kM,
189  threadblock_tile_offset.k() * params.gemm_k_size,
190  };
191 
192  cutlass::MatrixCoord tb_offset_B{
193  threadblock_tile_offset.k() * params.gemm_k_size,
194  threadblock_tile_offset.n() * Mma::Shape::kN
195  };
196 
197  // Problem size is a function of threadblock index in the K dimension
198  int problem_size_k = min(
199  params.problem_size.k(),
200  (threadblock_tile_offset.k() + 1) * params.gemm_k_size);
201 
202  // Compute threadblock-scoped matrix multiply-add
203  int gemm_k_iterations = (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK;
204 
205  // Compute position within threadblock
206  int thread_idx = threadIdx.x;
207 
208  // Construct iterators to A and B operands
209  typename Mma::IteratorA iterator_A(
210  params.params_A,
211  params.ref_A.data(),
212  {params.problem_size.m(), problem_size_k},
213  thread_idx,
214  tb_offset_A);
215 
216  typename Mma::IteratorB iterator_B(
217  params.params_B,
218  params.ref_B.data(),
219  {problem_size_k, params.problem_size.n()},
220  thread_idx,
221  tb_offset_B);
222 
223  int warp_idx = threadIdx.x / 32;
224  int lane_idx = threadIdx.x % 32;
225 
226  //
227  // Main loop
228  //
229 
230  // Construct thread-scoped matrix multiply
231  Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
232 
233  typename Mma::FragmentC accumulators;
234 
235  accumulators.clear();
236 
237  if (!kSplitKSerial || gemm_k_iterations > 0) {
238  // Compute threadblock-scoped matrix multiply-add
239  mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
240  }
241 
242  //
243  // Epilogue
244  //
245 
246  OutputOp output_op(params.output_op);
247 
248  //
249  // Masked tile iterators constructed from members
250  //
251 
252  threadblock_tile_offset = threadblock_swizzle.get_tile_offset();
253 
254  //assume identity swizzle
255  MatrixCoord threadblock_offset(
256  threadblock_tile_offset.m() * Mma::Shape::kM,
257  threadblock_tile_offset.n() * Mma::Shape::kN
258  );
259 
260  int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
261 
262  // Construct the semaphore.
263  Semaphore semaphore(params.semaphore + block_idx, thread_idx);
264 
265  // If performing a reduction via split-K, fetch the initial synchronization
266  if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
267 
268  // Fetch the synchronization lock initially but do not block.
269  semaphore.fetch();
270 
271  // Indicate which position in a serial reduction the output operator is currently updating
272  output_op.set_k_partition(threadblock_tile_offset.k());
273  }
274 
275  // Tile iterator loading from source tensor.
276  typename Epilogue::OutputTileIterator iterator_C(
277  params.params_C,
278  params.ref_C.data(),
279  params.problem_size.mn(),
280  thread_idx,
281  threadblock_offset
282  );
283 
284  // Tile iterator writing to destination tensor.
285  typename Epilogue::OutputTileIterator iterator_D(
286  params.params_D,
287  params.ref_D.data(),
288  params.problem_size.mn(),
289  thread_idx,
290  threadblock_offset
291  );
292 
293  Epilogue epilogue(
294  shared_storage.epilogue,
295  thread_idx,
296  warp_idx,
297  lane_idx);
298 
299  // Wait on the semaphore - this latency may have been covered by iterator construction
300  if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
301 
302  // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
303  if (threadblock_tile_offset.k()) {
304  iterator_C = iterator_D;
305  }
306 
307  semaphore.wait(threadblock_tile_offset.k());
308 
309  __threadfence();
310  }
311 
312  // Execute the epilogue operator to update the destination tensor.
313  epilogue(output_op, iterator_D, accumulators, iterator_C);
314 
315  //
316  // Release the semaphore
317  //
318 
319  if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
320 
321  int lock = 0;
322  if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
323 
324  // The final threadblock resets the semaphore for subsequent grids.
325  lock = 0;
326  }
327  else {
328  // Otherwise, the semaphore is incremented
329  lock = threadblock_tile_offset.k() + 1;
330  }
331 
332  __threadfence();
333  semaphore.release(lock);
334  }
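// Recap of the serial split-K protocol implemented above: partition k == 0 proceeds as soon
// as it observes the zero-initialized semaphore; each partition k > 0 waits for the value k
// and reads the partial sums back through iterator_C, which was redirected to the 'D' tensor;
// after the epilogue, every partition except the last releases k + 1, and the final partition
// resets the semaphore to 0 so the workspace can be reused.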
335  }
336 };
337 
339 
340 } // namespace kernel
341 } // namespace gemm
342 } // namespace cutlass
343 
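For context, the sketch below shows one way a host program might drive this kernel-level Gemm: computing the tiled shape, sizing and zeroing the per-tile semaphore workspace used by serial split-K, filling Params, and launching through a small __global__ wrapper. The wrapper gemm_kernel_entry, the launch_gemm helper, and the GemmKernel template parameter are illustrative assumptions rather than APIs defined in this header, and the grid mapping assumes an identity threadblock swizzle, matching the "assume identity swizzle" comment in operator().

#include <cstdint>
#include <cuda_runtime.h>

#include "cutlass/cutlass.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/kernel/gemm.h"

// Hypothetical wrapper: dynamic shared memory backs GemmKernel::SharedStorage.
template <typename GemmKernel>
__global__ void gemm_kernel_entry(typename GemmKernel::Params params) {
  extern __shared__ uint8_t smem[];
  typename GemmKernel::SharedStorage &shared_storage =
      *reinterpret_cast<typename GemmKernel::SharedStorage *>(smem);
  GemmKernel op;
  op(params, shared_storage);
}

// Hypothetical host-side launcher for some instantiation such as
//   using GemmKernel = cutlass::gemm::kernel::Gemm<Mma, Epilogue, Swizzle, true>;
template <typename GemmKernel>
cutlass::Status launch_gemm(
    cutlass::gemm::GemmCoord problem_size,
    int split_k_slices,
    typename GemmKernel::Mma::IteratorA::TensorRef ref_A,
    typename GemmKernel::Mma::IteratorB::TensorRef ref_B,
    typename GemmKernel::Epilogue::OutputTileIterator::TensorRef ref_C,
    typename GemmKernel::Epilogue::OutputTileIterator::TensorRef ref_D) {

  // Reject misaligned operands up front.
  cutlass::Status status =
      GemmKernel::can_implement(problem_size, ref_A, ref_B, ref_C, ref_D);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  // One threadblock tile per (M, N) block, split_k_slices partitions along K.
  cutlass::gemm::GemmCoord grid_tiled_shape(
      (problem_size.m() + GemmKernel::Mma::Shape::kM - 1) / GemmKernel::Mma::Shape::kM,
      (problem_size.n() + GemmKernel::Mma::Shape::kN - 1) / GemmKernel::Mma::Shape::kN,
      split_k_slices);

  // Serial split-K uses one int per output tile, indexed as m + n * grid_m.
  // It must start zeroed so that partition 0 passes its initial semaphore.wait(0).
  int *workspace = nullptr;
  size_t workspace_bytes = sizeof(int) * grid_tiled_shape.m() * grid_tiled_shape.n();
  cudaMalloc(&workspace, workspace_bytes);
  cudaMemset(workspace, 0, workspace_bytes);

  typename GemmKernel::Params params(
      problem_size, grid_tiled_shape, ref_A, ref_B, ref_C, ref_D,
      typename GemmKernel::OutputOp::Params(), workspace);

  // Identity swizzle: blockIdx.{x, y, z} maps directly to tile {m, n, k}.
  dim3 grid(grid_tiled_shape.m(), grid_tiled_shape.n(), grid_tiled_shape.k());
  dim3 block(GemmKernel::kThreadCount, 1, 1);
  int smem_bytes = int(sizeof(typename GemmKernel::SharedStorage));

  // Large tile shapes may need cudaFuncSetAttribute(...,
  // cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes) before this launch.
  gemm_kernel_entry<GemmKernel><<<grid, block, smem_bytes>>>(params);

  cudaError_t err = cudaDeviceSynchronize();
  cudaFree(workspace);
  return (err == cudaSuccess) ? cutlass::Status::kSuccess : cutlass::Status::kErrorInternal;
}

In practice an application would normally go through the device-level wrapper (cutlass::gemm::device::Gemm), which performs this plumbing - workspace sizing, threadblock swizzling, and launch configuration - on the caller's behalf.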