cutlass/mma__pipelined_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/array.h"
 #include "cutlass/aligned_buffer.h"
 #include "cutlass/numeric_conversion.h"

 #include "cutlass/numeric_types.h"
 #include "cutlass/matrix_shape.h"

 #include "cutlass/gemm/gemm.h"
 #include "cutlass/gemm/threadblock/mma_base.h"


 namespace cutlass {
 namespace gemm {
 namespace threadblock {


 /// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
 ///
 /// Threadblock-scoped, double-buffered (kStages == 2) multiply-accumulate. Each mainloop
 /// iteration fetches the next operand tiles through IteratorA/IteratorB into fragments, runs
 /// the warp-level operator on tiles read back through the warp tile iterators inherited from
 /// MmaBase, and then stores the fetched fragments through the shared-memory iterators.
 template <
   /// Size of the Gemm problem - (concept: gemm::GemmShape<>)
   typename Shape_,
   /// Iterates over tiles of A operand in global memory
   //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
   typename IteratorA_,
   /// Iterator used to write the threadblock-scoped tile of A operand to shared memory
   typename SmemIteratorA_,
   /// Iterates over tiles of B operand in global memory
   //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
   typename IteratorB_,
   /// Iterator used to write the threadblock-scoped tile of B operand to shared memory
   typename SmemIteratorB_,
   /// Data type of accumulator matrix
   typename ElementC_,
   /// Layout of accumulator matrix
   typename LayoutC_,
   /// Policy describing tuning details (must expose Operator and kPartitionsK, both used below)
   typename Policy_,
   /// Functor applied to every A fragment loaded through IteratorA_ before it is stored through
   /// SmemIteratorA_. Defaults to a NumericArrayConverter parameterized on both element types.
   typename TransformA_ = NumericArrayConverter<
     typename SmemIteratorA_::Element,
     typename IteratorA_::Element,
     IteratorA_::Fragment::kElements>,
   /// Functor applied to every B fragment loaded through IteratorB_ before it is stored through
   /// SmemIteratorB_. Defaults to a NumericArrayConverter parameterized on both element types.
   typename TransformB_ = NumericArrayConverter<
     typename SmemIteratorB_::Element,
     typename IteratorB_::Element,
     IteratorB_::Fragment::kElements>,
   /// Not referenced anywhere in this class body.
   /// NOTE(review): presumably reserved as a hook for partial specialization - confirm.
   typename Enable = bool
 >
 class MmaPipelined : public MmaBase<Shape_, Policy_, 2> {
 public:

   /// Base class, instantiated with exactly two stages
   using Base = MmaBase<Shape_, Policy_, 2>;

   /// Size of the Gemm problem - concept: gemm::GemmShape<>
   using Shape = Shape_;
   /// Iterates over tiles of A operand in global memory
   using IteratorA = IteratorA_;
   /// Iterates over tiles of B operand in global memory
   using IteratorB = IteratorB_;
   /// Data type of accumulator matrix
   using ElementC = ElementC_;
   /// Layout of accumulator matrix
   using LayoutC = LayoutC_;
   /// Policy describing tuning details
   using Policy = Policy_;

   /// Iterator type used to write the A operand tile to shared memory
   using SmemIteratorA = SmemIteratorA_;
   /// Iterator type used to write the B operand tile to shared memory
   using SmemIteratorB = SmemIteratorB_;

   /// Functor applied to A fragments between the global load and the shared-memory store
   using TransformA = TransformA_;
   /// Functor applied to B fragments between the global load and the shared-memory store
   using TransformB = TransformB_;

   //
   // Dependent types
   //

   /// Fragment of operand A loaded from global memory
   using FragmentA = typename IteratorA::Fragment;

   /// Fragment of operand B loaded from global memory
   using FragmentB = typename IteratorB::Fragment;

   /// Fragment of accumulator tile
   using FragmentC = typename Policy::Operator::FragmentC;

   /// Warp-level Mma
   using Operator = typename Policy::Operator;

   // Statically assert kStages for MmaPipelined is two (double-buffered pipeline)
   static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");

 private:

   /// A operand fragment type consumed by the warp-level operator
   using WarpFragmentA = typename Operator::FragmentA;
   /// B operand fragment type consumed by the warp-level operator
   using WarpFragmentB = typename Operator::FragmentB;

 protected:

   /// Iterator to write threadblock-scoped tile of A operand to shared memory
   SmemIteratorA smem_iterator_A_;

   /// Iterator to write threadblock-scoped tile of B operand to shared memory
   SmemIteratorB smem_iterator_B_;

 public:

   /// Construct from tensor references
   ///
   /// \param shared_storage  storage object forwarded to MmaBase; its operand_A_ref() and
   ///                        operand_B_ref() also seed the two shared-memory write iterators
   /// \param thread_idx      forwarded to MmaBase and to both shared-memory write iterators
   /// \param warp_idx        decomposed below into (m, n, k) warp coordinates
   /// \param lane_idx        forwarded to MmaBase only
   CUTLASS_DEVICE
   MmaPipelined(
     typename Base::SharedStorage &shared_storage,
     int thread_idx,
     int warp_idx,
     int lane_idx
   ):
     Base(shared_storage, thread_idx, warp_idx, lane_idx),
     smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
     smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {

     // Compute warp location within threadblock tile by mapping the warp_id to
     // three coordinates:
     //   _m: the warp's position within the threadblock along the M dimension
     //   _n: the warp's position within the threadblock along the N dimension
     //   _k: the warp's position within the threadblock along the K dimension
     //
     // The decomposition below makes M the fastest-varying coordinate, then N, then K.

     int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
     int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);

     int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
     int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;

     // Add per-warp offsets in units of warp-level tiles. The K offset is scaled by
     // kWarpGemmIterations, so each K partition starts that many warp tiles apart.
     this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
     this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
   }

   /// Perform a threadblock-scoped matrix multiply-accumulate
   ///
   /// \param gemm_k_iterations  number of mainloop iterations; the loop counts this down to zero
   /// \param accum              destination accumulator tile; overwritten with src_accum first,
   ///                           then updated in place by every warp-level multiply-add
   /// \param iterator_A         iterator over A operand; taken by value, so the caller's copy
   ///                           is not advanced
   /// \param iterator_B         iterator over B operand; taken by value, so the caller's copy
   ///                           is not advanced
   /// \param src_accum          source accumulator tile copied into accum
   /// \param transform_A        transformation applied to each A fragment before the store
   /// \param transform_B        transformation applied to each B fragment before the store
   CUTLASS_DEVICE
   void operator()(
     int gemm_k_iterations,
     FragmentC &accum,
     IteratorA iterator_A,
     IteratorB iterator_B,
     FragmentC const &src_accum,
     TransformA transform_A = TransformA(),
     TransformB transform_B = TransformB()) {

     //
     // Prologue
     //

     // Perform accumulation in the 'd' output operand
     accum = src_accum;

     // Staging fragments holding one tile of each operand between the load through
     // iterator_A / iterator_B and the store through the shared-memory iterators.
     FragmentA tb_frag_A;
     FragmentB tb_frag_B;

     tb_frag_A.clear();
     tb_frag_B.clear();

     // The last kblock is loaded in the prolog
     iterator_A.load(tb_frag_A);
     iterator_B.load(tb_frag_B);

     ++iterator_A;
     ++iterator_B;

     // Store the first pair of tiles, then advance the write iterators to the next stage.
     this->smem_iterator_A_.store(transform_A(tb_frag_A));
     this->smem_iterator_B_.store(transform_B(tb_frag_B));

     ++this->smem_iterator_A_;
     ++this->smem_iterator_B_;

     // Barrier: the stores above must complete before any warp reads the tiles back below.
     __syncthreads();

     // Pair of fragments used to overlap shared memory loads and math instructions.
     // Slot [warp_mma_k % 2] is consumed by the math while slot [(warp_mma_k + 1) % 2] is filled.
     WarpFragmentA warp_frag_A[2];
     WarpFragmentB warp_frag_B[2];

     this->warp_tile_iterator_A_.set_kgroup_index(0);
     this->warp_tile_iterator_B_.set_kgroup_index(0);

     // Pre-load the warp tiles for the first multiply-add of the mainloop.
     this->warp_tile_iterator_A_.load(warp_frag_A[0]);
     this->warp_tile_iterator_B_.load(warp_frag_B[0]);

     ++this->warp_tile_iterator_A_;
     ++this->warp_tile_iterator_B_;

     Operator warp_mma;

     // Toggles between 1 and 0 once per mainloop iteration; it selects which pair of
     // iterators (shared-memory write vs. warp tile read) is rewound at the end of an iteration.
     int smem_write_stage_idx = 1;

     // Avoid reading out of bounds
     // (with at most one iteration, the load issued inside the mainloop has no further tile
     // to fetch, so the masks are cleared up front)
     if (gemm_k_iterations <= 1) {
       iterator_A.clear_mask();
       iterator_B.clear_mask();
     }

     // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
     // shared memory loads (which have the tightest latency requirement).

     //
     // Mainloop
     //

     // Note: The main loop does not support Base::kWarpGemmIterations == 2.
     // NOTE(review): no static_assert in this class enforces that restriction - confirm whether
     // it is checked elsewhere.
     CUTLASS_GEMM_LOOP
     for (; gemm_k_iterations > 0; --gemm_k_iterations) {
       //
       // Loop over GEMM K dimension
       //

       CUTLASS_PRAGMA_UNROLL
       for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {

         // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
         // as the case may be.

         if (warp_mma_k == Base::kWarpGemmIterations - 1) {

           // Write fragments to shared memory
           // (tb_frag_A / tb_frag_B were refilled at warp_mma_k == 0 of this same iteration)
           this->smem_iterator_A_.store(transform_A(tb_frag_A));

           this->smem_iterator_B_.store(transform_B(tb_frag_B));

           // Barrier: the stores above must complete before the warp tile loads further down
           // in this step read the next tile.
           __syncthreads();

           ++this->smem_iterator_B_;
           ++this->smem_iterator_A_;

           // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
           // When the index is 1 the shared-memory write iterators are rewound by kStages tiles;
           // otherwise the warp tile read iterators are rewound by
           // kStages * kPartitionsK * kWarpGemmIterations warp tiles.
           if (smem_write_stage_idx == 1) {
             this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
             this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
           }
           else {
             this->warp_tile_iterator_A_.add_tile_offset(
                 {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
             this->warp_tile_iterator_B_.add_tile_offset(
                 {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
                  0});
           }

           smem_write_stage_idx ^= 1;
         }

         // Fetch the warp tiles for the NEXT k-group into the fragment slot that the
         // multiply-add at the bottom of this step does not read.
         this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
         this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);

         this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
         this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);

         ++this->warp_tile_iterator_A_;
         ++this->warp_tile_iterator_B_;

         if (warp_mma_k == 0) {

           // Fetch the next operand tiles at the first k-group, so the remaining
           // multiply-adds of this iteration are issued before the fragments are stored.
           iterator_A.load(tb_frag_A);
           iterator_B.load(tb_frag_B);

           ++iterator_A;
           ++iterator_B;

           // Avoid reading out of bounds if this was the last loop iteration
           // (gemm_k_iterations still includes the current iteration, hence the threshold of 2)
           if (gemm_k_iterations <= 2) {
             iterator_A.clear_mask();
             iterator_B.clear_mask();
           }
         }

         // Multiply-add on the fragments loaded during the previous step (or the prologue),
         // accumulating in place.
         warp_mma(accum, warp_frag_A[warp_mma_k % 2], warp_frag_B[warp_mma_k % 2], accum);
       }
     }

   }
 };


 } // namespace threadblock
 } // namespace gemm
 } // namespace cutlass
cutlass::gemm::GemmShape::kM
static int const kM
Definition: include/cutlass/gemm/gemm.h:58

cutlass::gemm::threadblock::MmaPipelined::LayoutC
LayoutC_ LayoutC
Layout of accumulator matrix.
Definition: mma_pipelined.h:96

cutlass::gemm::threadblock::MmaPipelined::TransformB
TransformB_ TransformB
Definition: mma_pipelined.h:103

cutlass
Definition: aligned_buffer.h:35

cutlass::gemm::threadblock::MmaPipelined::Policy
Policy_ Policy
Policy describing tuning details.
Definition: mma_pipelined.h:97

cutlass::gemm::threadblock::MmaBase< Shape_, Policy_, 2 >::warp_tile_iterator_B_
Operator::IteratorB warp_tile_iterator_B_
Iterator to load a warp-scoped tile of B operand from shared memory.
Definition: mma_base.h:193

cutlass::gemm::threadblock::MmaPipelined
Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
Definition: mma_pipelined.h:86

cutlass::gemm::threadblock::MmaPipelined::IteratorB
IteratorB_ IteratorB
Iterates over tiles of B operand in global memory.
Definition: mma_pipelined.h:94

gemm.h
Defines common types used for all GEMM-like operators.

cutlass::gemm::threadblock::MmaPipelined::operator()
CUTLASS_DEVICE void operator()(int gemm_k_iterations, FragmentC &accum, IteratorA iterator_A, IteratorB iterator_B, FragmentC const &src_accum, TransformA transform_A=TransformA(), TransformB transform_B=TransformB())
Perform a threadblock-scoped matrix multiply-accumulate.
Definition: mma_pipelined.h:170

cutlass::gemm::threadblock::MmaPipelined::IteratorA
IteratorA_ IteratorA
Iterates over tiles of A operand in global memory.
Definition: mma_pipelined.h:93

cutlass::gemm::threadblock::MmaPipelined::FragmentB
typename IteratorB::Fragment FragmentB
Fragment of operand B loaded from global memory.
Definition: mma_pipelined.h:113

cutlass::gemm::threadblock::MmaPipelined::SmemIteratorA
SmemIteratorA_ SmemIteratorA
Definition: mma_pipelined.h:99

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

numeric_conversion.h
Boost-like numeric conversion operator for CUTLASS numeric types.

matrix_shape.h
Defines a Shape template for matrix tiles.

cutlass::gemm::threadblock::MmaBase< Shape_, Policy_, 2 >::kWarpGemmIterations
static int const kWarpGemmIterations
Number of warp-level GEMM operations.
Definition: mma_base.h:108

mma_base.h
Template for a double-buffered threadblock-scoped GEMM kernel.

aligned_buffer.h
AlignedBuffer is a container for trivially copyable elements suitable for use in unions and shared me...

cutlass::gemm::threadblock::MmaPipelined::Shape
Shape_ Shape
Size of the Gemm problem - concept: gemm::GemmShape<>
Definition: mma_pipelined.h:92

cutlass::gemm::threadblock::MmaBase< Shape_, Policy_, 2 >::kStages
static int const kStages
Number of stages.
Definition: mma_base.h:112

cutlass::gemm::threadblock::MmaPipelined::FragmentA
typename IteratorA::Fragment FragmentA
Fragment of operand A loaded from global memory.
Definition: mma_pipelined.h:110

numeric_types.h
Top-level include for all CUTLASS numeric types.

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::gemm::threadblock::MmaBase
Definition: mma_base.h:83

cutlass::gemm::threadblock::MmaPipelined::FragmentC
typename Policy::Operator::FragmentC FragmentC
Fragment of accumulator tile.
Definition: mma_pipelined.h:116

cutlass::gemm::threadblock::MmaBase< Shape_, Policy_, 2 >::warp_tile_iterator_A_
Operator::IteratorA warp_tile_iterator_A_
Iterator to load a warp-scoped tile of A operand from shared memory.
Definition: mma_base.h:190

CUTLASS_GEMM_LOOP
#define CUTLASS_GEMM_LOOP
Definition: cutlass.h:112

cutlass::gemm::threadblock::MmaPipelined::ElementC
ElementC_ ElementC
Data type of accumulator matrix.
Definition: mma_pipelined.h:95

cutlass::gemm::threadblock::MmaPipelined::smem_iterator_A_
SmemIteratorA smem_iterator_A_
Iterator to write threadblock-scoped tile of A operand to shared memory.
Definition: mma_pipelined.h:132

cutlass::gemm::threadblock::MmaPipelined::SmemIteratorB
SmemIteratorB_ SmemIteratorB
Definition: mma_pipelined.h:100

cutlass::gemm::threadblock::MmaPipelined::smem_iterator_B_
SmemIteratorB smem_iterator_B_
Iterator to write threadblock-scoped tile of B operand to shared memory.
Definition: mma_pipelined.h:135

cutlass::gemm::threadblock::MmaPipelined::MmaPipelined
CUTLASS_DEVICE MmaPipelined(typename Base::SharedStorage &shared_storage, int thread_idx, int warp_idx, int lane_idx)
Construct from tensor references.
Definition: mma_pipelined.h:141

cutlass.h
Basic include for CUTLASS.

cutlass::gemm::threadblock::MmaPipelined::TransformA
TransformA_ TransformA
Definition: mma_pipelined.h:102

cutlass::gemm::threadblock::MmaPipelined::Operator
typename Policy::Operator Operator
Warp-level Mma.
Definition: mma_pipelined.h:119

cutlass::gemm::GemmShape::kN
static int const kN
Definition: include/cutlass/gemm/gemm.h:59