cutlass/mma__singlestage_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/array.h"
 #include "cutlass/aligned_buffer.h"

 #include "cutlass/numeric_types.h"
 #include "cutlass/matrix_shape.h"

 #include "cutlass/gemm/gemm.h"
 #include "cutlass/gemm/threadblock/mma_base.h"


 namespace cutlass {
 namespace gemm {
 namespace threadblock {


 /// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
 ///
 /// Single-stage variant of the threadblock-scoped mainloop: it derives from
 /// MmaBase<Shape_, Policy_, 1> and statically asserts that Base::kStages == 1. Each mainloop
 /// iteration stores the current global-memory fragments through the shared-memory iterators,
 /// synchronizes the threadblock, runs Base::kWarpGemmIterations warp-level MMAs, synchronizes
 /// again, and only then fetches the next pair of global-memory fragments.
 template <
   /// Size of the Gemm problem - concept: gemm::GemmShape<>
   typename Shape_,
   /// Iterates over tiles of A operand in global memory
   //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
   typename IteratorA_,
   /// Type of the iterator that writes the threadblock-scoped tile of A operand to shared memory
   typename SmemIteratorA_,
   /// Iterates over tiles of B operand in global memory
   //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
   typename IteratorB_,
   /// Type of the iterator that writes the threadblock-scoped tile of B operand to shared memory
   typename SmemIteratorB_,
   /// Data type of accumulator matrix
   typename ElementC_,
   /// Layout of accumulator matrix
   typename LayoutC_,
   /// Policy describing tuning details
   typename Policy_,
   /// Not referenced anywhere in this definition.
   /// NOTE(review): presumably a hook for partial specialization - confirm against users.
   typename Enable = bool
 >
 class MmaSingleStage : public MmaBase<Shape_, Policy_, 1> {
 public:

   /// Base class (MmaBase instantiated with a stage count of 1)
   using Base = MmaBase<Shape_, Policy_, 1>;

   /// Size of the Gemm problem - concept: gemm::GemmShape<>
   using Shape = Shape_;
   /// Iterates over tiles of A operand in global memory
   using IteratorA = IteratorA_;
   /// Iterates over tiles of B operand in global memory
   using IteratorB = IteratorB_;
   /// Data type of accumulator matrix
   using ElementC = ElementC_;
   /// Layout of accumulator matrix
   using LayoutC = LayoutC_;
   /// Policy describing tuning details
   using Policy = Policy_;

   /// Type of the iterator held in smem_iterator_A_
   using SmemIteratorA = SmemIteratorA_;
   /// Type of the iterator held in smem_iterator_B_
   using SmemIteratorB = SmemIteratorB_;

   //
   // Dependent types
   //

   /// Fragment of operand A loaded from global memory
   using FragmentA = typename IteratorA::Fragment;

   /// Fragment of operand B loaded from global memory
   using FragmentB = typename IteratorB::Fragment;

   /// Fragment of accumulator tile
   using FragmentC = typename Policy::Operator::FragmentC;

   /// Warp-level Mma
   using Operator = typename Policy::Operator;

   // Statically assert kStages for MmaSingleStage is 1 (single stage mma pipeline)
   static_assert((Base::kStages==1), "MmaSingleStage requires kStages set to value 1");
 private:

   /// Warp-level fragment of operand A consumed by Operator
   using WarpFragmentA = typename Operator::FragmentA;
   /// Warp-level fragment of operand B consumed by Operator
   using WarpFragmentB = typename Operator::FragmentB;

 protected:

   /// Iterator to write threadblock-scoped tile of A operand to shared memory
   SmemIteratorA smem_iterator_A_;

   /// Iterator to write threadblock-scoped tile of B operand to shared memory
   SmemIteratorB smem_iterator_B_;

 public:

   /// Construct from tensor references
   ///
   /// Forwards all arguments to the MmaBase constructor, builds both shared-memory store
   /// iterators from shared_storage.operand_A_ref() / operand_B_ref(), and then offsets the
   /// inherited warp tile iterators to this warp's position.
   CUTLASS_DEVICE
   MmaSingleStage(
     /// Shared storage; supplies operand_A_ref() and operand_B_ref() and is passed to the base
     typename Base::SharedStorage &shared_storage,
     /// Thread index, passed to the base class and to both shared-memory store iterators
     int thread_idx,
     /// Warp index, decomposed below into (m, n, k) warp coordinates
     int warp_idx,
     /// Lane index, only forwarded to the base class here
     int lane_idx
   ):
     Base(shared_storage, thread_idx, warp_idx, lane_idx),
     smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
     smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {

     // Compute warp location within threadblock tile by mapping the warp_id to
     // three coordinates:
     //   _m: the warp's position within the threadblock along the M dimension
     //   _n: the warp's position within the threadblock along the N dimension
     //   _k: the warp's position within the threadblock along the K dimension

     // warp_idx is treated as a linear index with M varying fastest, then N, then K:
     //   warp_idx = warp_idx_m + kM * (warp_idx_n + kN * warp_idx_k)
     int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
     int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);

     int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
     int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;

     // Add per-warp offsets in units of warp-level tiles
     // (A is offset as {m, k}; B is offset as {k, n}; the k term is scaled by the number of
     // warp-level GEMM iterations)
     this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
     this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});

   }

   /// Perform a threadblock-scoped matrix multiply-accumulate
   ///
   /// accum is first overwritten with src_accum and then updated in place by every warp-level
   /// MMA. iterator_A and iterator_B are taken by value, so the caller's iterators are not
   /// advanced. The body contains __syncthreads() barriers inside the mainloop.
   CUTLASS_DEVICE
   void operator()(
     /// Number of mainloop iterations; the loop runs while this counter is > 0
     int gemm_k_iterations,
     /// Destination accumulator tile (assigned from src_accum, then accumulated into)
     FragmentC &accum,
     /// Iterator over A operand in global memory (local copy, advanced inside this call)
     IteratorA iterator_A,
     /// Iterator over B operand in global memory (local copy, advanced inside this call)
     IteratorB iterator_B,
     /// Source accumulator tile used to initialize accum
     FragmentC const &src_accum) {

     //
     // Prologue
     //

     // Perform accumulation in the 'd' output operand
     accum = src_accum;


     // Fragments that carry one tile of A / B from the global-memory iterators to the
     // shared-memory store iterators.
     FragmentA tb_frag_A;
     FragmentB tb_frag_B;

     tb_frag_A.clear();
     tb_frag_B.clear();

     // The last kblock is loaded in the prolog
     iterator_A.load(tb_frag_A);
     iterator_B.load(tb_frag_B);

     ++iterator_A;
     ++iterator_B;

     // Pair of fragments used to overlap shared memory loads and math instructions
     // NOTE(review): in this single-stage loop each iteration loads into and immediately
     // consumes the same slot (warp_mma_k % 2), so no prefetch of the next slot is expressed.
     WarpFragmentA warp_frag_A[2];
     WarpFragmentB warp_frag_B[2];
     Operator warp_mma;

     // Avoid reading out of bounds
     // With at most one iteration, the load issued at the end of the first loop pass would be
     // for a tile that is never consumed.
     // NOTE(review): clear_mask() presumably predicates off all further global loads - confirm.
     if (gemm_k_iterations <= 1) {
       iterator_A.clear_mask();
       iterator_B.clear_mask();
     }


     //
     // Mainloop
     //

     CUTLASS_GEMM_LOOP
     for (; gemm_k_iterations > 0; --gemm_k_iterations) {
       // Publish the fragments fetched by the previous global load (or by the prologue).
       this->smem_iterator_A_.store(tb_frag_A);
       this->smem_iterator_B_.store(tb_frag_B);


       // Barrier between the shared-memory stores above and the warp-level loads below.
       __syncthreads();

       //
       // Loop over GEMM K dimension
       //

       CUTLASS_PRAGMA_UNROLL
       for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {

         // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
         // as the case may be.
         // (warp_mma_k is always < Base::kWarpGemmIterations here, so the modulo leaves it unchanged.)

         this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k) % Base::kWarpGemmIterations);
         this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k) % Base::kWarpGemmIterations);

         this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k) % 2]);
         this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k) % 2]);

         ++this->warp_tile_iterator_A_;
         ++this->warp_tile_iterator_B_;

         // accum = A * B + accum, using the fragments loaded just above.
         warp_mma(accum, warp_frag_A[warp_mma_k % 2], warp_frag_B[warp_mma_k % 2], accum);
       }

       // Add negative offsets to return smem load iterators to the 'start' of the shared memory
       this->warp_tile_iterator_A_.add_tile_offset({0, -Policy::kPartitionsK * Base::kWarpGemmIterations});
       this->warp_tile_iterator_B_.add_tile_offset({-Policy::kPartitionsK * Base::kWarpGemmIterations, 0});

       // Barrier between the warp-level loads above and the shared-memory stores at the top of
       // the next iteration.
       __syncthreads();

       // Fetch the fragments that the next iteration will store.
       iterator_A.load(tb_frag_A);
       iterator_B.load(tb_frag_B);

       ++iterator_A;
       ++iterator_B;

       // Avoid reading out of bounds if this was the last loop iteration
       // The counter has not been decremented yet: when it is 2, the load just issued feeds
       // the final iteration, so the load at the end of that final iteration must be disabled.
       if (gemm_k_iterations <= 2) {
         iterator_A.clear_mask();
         iterator_B.clear_mask();
       }
     }

   }
 };


 } // namespace threadblock
 } // namespace gemm
 } // namespace cutlass
cutlass::gemm::GemmShape::kM
static int const kM
Definition: include/cutlass/gemm/gemm.h:58

cutlass::gemm::threadblock::MmaSingleStage::IteratorB
IteratorB_ IteratorB
Iterates over tiles of B operand in global memory.
Definition: mma_singlestage.h:84

cutlass
Definition: aligned_buffer.h:35

cutlass::gemm::threadblock::MmaSingleStage::ElementC
ElementC_ ElementC
Data type of accumulator matrix.
Definition: mma_singlestage.h:85

cutlass::gemm::threadblock::MmaBase< Shape_, Policy_, 1 >::warp_tile_iterator_B_
Operator::IteratorB warp_tile_iterator_B_
Iterator to load a warp-scoped tile of B operand from shared memory.
Definition: mma_base.h:193

cutlass::gemm::threadblock::MmaSingleStage::smem_iterator_B_
SmemIteratorB smem_iterator_B_
Iterator to write threadblock-scoped tile of B operand to shared memory.
Definition: mma_singlestage.h:121

cutlass::gemm::threadblock::MmaSingleStage
Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
Definition: mma_singlestage.h:76

gemm.h
Defines common types used for all GEMM-like operators.

cutlass::gemm::threadblock::MmaSingleStage::operator()
CUTLASS_DEVICE void operator()(int gemm_k_iterations, FragmentC &accum, IteratorA iterator_A, IteratorB iterator_B, FragmentC const &src_accum)
Perform a threadblock-scoped matrix multiply-accumulate.
Definition: mma_singlestage.h:157

cutlass::gemm::threadblock::MmaSingleStage::Policy
Policy_ Policy
Policy describing tuning details.
Definition: mma_singlestage.h:87

cutlass::gemm::threadblock::MmaSingleStage::FragmentA
typename IteratorA::Fragment FragmentA
Fragment of operand A loaded from global memory.
Definition: mma_singlestage.h:97

cutlass::gemm::threadblock::MmaSingleStage::LayoutC
LayoutC_ LayoutC
Layout of accumulator matrix.
Definition: mma_singlestage.h:86

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

cutlass::gemm::threadblock::MmaSingleStage::Shape
Shape_ Shape
Size of the Gemm problem - concept: gemm::GemmShape<>
Definition: mma_singlestage.h:82

cutlass::gemm::threadblock::MmaSingleStage::MmaSingleStage
CUTLASS_DEVICE MmaSingleStage(typename Base::SharedStorage &shared_storage, int thread_idx, int warp_idx, int lane_idx)
Construct from tensor references.
Definition: mma_singlestage.h:127

matrix_shape.h
Defines a Shape template for matrix tiles.

cutlass::gemm::threadblock::MmaSingleStage::smem_iterator_A_
SmemIteratorA smem_iterator_A_
Iterator to write threadblock-scoped tile of A operand to shared memory.
Definition: mma_singlestage.h:118

cutlass::gemm::threadblock::MmaBase< Shape_, Policy_, 1 >::kWarpGemmIterations
static int const kWarpGemmIterations
Number of warp-level GEMM operations.
Definition: mma_base.h:108

mma_base.h
Template for a double-buffered threadblock-scoped GEMM kernel.

aligned_buffer.h
AlignedBuffer is a container for trivially copyable elements suitable for use in unions and shared me...

cutlass::gemm::threadblock::MmaBase< Shape_, Policy_, 1 >::kStages
static int const kStages
Number of stages.
Definition: mma_base.h:112

numeric_types.h
Top-level include for all CUTLASS numeric types.

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::gemm::threadblock::MmaSingleStage::SmemIteratorB
SmemIteratorB_ SmemIteratorB
Definition: mma_singlestage.h:90

cutlass::gemm::threadblock::MmaSingleStage::FragmentC
typename Policy::Operator::FragmentC FragmentC
Fragment of accumulator tile.
Definition: mma_singlestage.h:103

cutlass::gemm::threadblock::MmaBase
Definition: mma_base.h:83

cutlass::gemm::threadblock::MmaSingleStage::Operator
typename Policy::Operator Operator
Warp-level Mma.
Definition: mma_singlestage.h:106

cutlass::gemm::threadblock::MmaBase< Shape_, Policy_, 1 >::warp_tile_iterator_A_
Operator::IteratorA warp_tile_iterator_A_
Iterator to load a warp-scoped tile of A operand from shared memory.
Definition: mma_base.h:190

CUTLASS_GEMM_LOOP
#define CUTLASS_GEMM_LOOP
Definition: cutlass.h:112

cutlass::gemm::threadblock::MmaSingleStage::SmemIteratorA
SmemIteratorA_ SmemIteratorA
Definition: mma_singlestage.h:89

cutlass::gemm::threadblock::MmaSingleStage::FragmentB
typename IteratorB::Fragment FragmentB
Fragment of operand B loaded from global memory.
Definition: mma_singlestage.h:100

cutlass::gemm::threadblock::MmaSingleStage::IteratorA
IteratorA_ IteratorA
Iterates over tiles of A operand in global memory.
Definition: mma_singlestage.h:83

cutlass.h
Basic include for CUTLASS.

cutlass::gemm::GemmShape::kN
static int const kN
Definition: include/cutlass/gemm/gemm.h:59