cutlass/epilogue__base_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include <assert.h>

 #include "cutlass/cutlass.h"
 #include "cutlass/matrix_shape.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/array.h"
 #include "cutlass/layout/vector.h"
 #include "cutlass/layout/tensor.h"
 #include "cutlass/tensor_coord.h"
 #include "cutlass/aligned_buffer.h"

 #include "cutlass/gemm/gemm.h"

 #include "cutlass/transform/pitch_linear_thread_map.h"


 namespace cutlass {
 namespace epilogue {
 namespace threadblock {


 /// Base class for threadblock-scoped epilogues.
 ///
 /// Owns the description of the shared memory staging buffer (SharedStorage) and a warp-scoped
 /// iterator positioned at the calling warp's tile within that buffer.
 ///
 /// Template parameters:
 ///   Shape_                        - threadblock tile shape; must expose kM and kN
 ///   WarpMmaOperator_              - warp-level operator; must expose Shape::kM and Shape::kN
 ///   PartitionsK                   - number of partitions along K (becomes WarpCount::kK)
 ///   AccumulatorFragmentIterator_  - must expose AccumulatorTile
 ///   WarpTileIterator_             - must expose Element, TensorRef, Layout, Shape and
 ///                                   add_tile_offset()
 ///   Padding_                      - extra rows/columns (kRow, kColumn) added to the allocation
 template <
   typename Shape_,
   typename WarpMmaOperator_,
   int PartitionsK,
   typename AccumulatorFragmentIterator_,
   typename WarpTileIterator_,
   typename Padding_
 >
 class EpilogueBase {
 public:

   //
   // Template arguments re-exported for derived classes and clients
   //

   using Shape = Shape_;
   using WarpMmaOperator = WarpMmaOperator_;
   static int const kPartitionsK = PartitionsK;
   using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
   using WarpTileIterator = WarpTileIterator_;
   using Padding = Padding_;

   /// Layout exposed by the epilogue (fixed to row-major)
   using Layout = layout::RowMajor;

   /// The complete warp-level accumulator tile
   using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;

   /// Accumulator element
   using ElementAccumulator = typename AccumulatorTile::Element;

   /// Warp arrangement: threadblock tile extent divided by warp tile extent along M and N,
   /// and the number of K partitions along K
   using WarpCount = gemm::GemmShape<
     Shape::kM / WarpMmaOperator::Shape::kM,
     Shape::kN / WarpMmaOperator::Shape::kN,
     kPartitionsK
   >;

   /// Shared storage allocation needed by the epilogue
   struct SharedStorage {

     //
     // Type definitions
     //

     /// Element type of shared memory
     using Element = typename WarpTileIterator::Element;

     /// Tensor reference to shared memory allocation
     using TensorRef = typename WarpTileIterator::TensorRef;

     /// Layout of shared memory allocation
     using Layout = typename WarpTileIterator::Layout;

     /// Logical extent covered by all warps: the K partitions are stacked along the row
     /// dimension, so rows scale with WarpCount::kM * WarpCount::kK
     using Shape = MatrixShape<
       WarpCount::kM * WarpTileIterator::Shape::kRow * WarpCount::kK,
       WarpCount::kN * WarpTileIterator::Shape::kColumn
     >;

     /// Extent actually allocated: the logical extent grown by Padding in each dimension
     using StorageShape = MatrixShape<
       Shape::kRow + Padding::kRow,
       Shape::kColumn + Padding::kColumn
     >;

     //
     // Data members
     //

     /// Backing buffer holding StorageShape::kCount elements
     AlignedBuffer<Element, StorageShape::kCount> storage;

     //
     // Methods
     //

     /// Returns a pointer to the shared memory buffer
     CUTLASS_DEVICE
     Element *data() {
       return storage.data();
     }

     /// Returns a tensor reference to the shared memory buffer, using a packed layout over
     /// the padded (StorageShape) extent
     CUTLASS_DEVICE
     TensorRef reference() {
       return TensorRef(
         storage.data(),
         Layout::packed({StorageShape::kRow, StorageShape::kColumn}));
     }

     /// Debugging aid: prints the logical tile as integers, one row per output line.
     ///
     /// Printing is done by threads whose threadIdx.x is zero. The trailing barrier is outside
     /// the branch, so every thread of the threadblock must call this function.
     CUTLASS_DEVICE
     void debug_print() {

       if (threadIdx.x == 0) {

         Element *smem = storage.data();

         #pragma unroll 1
         for (int row = 0; row < Shape::kRow; ++row) {

           // Rows are strided by the padded allocation width, not the logical width.
           int const row_offset = row * StorageShape::kColumn;

           #pragma unroll 1
           for (int column = 0; column < Shape::kColumn; ++column) {
             printf("%d  ", int(smem[row_offset + column]));
           }

           printf("\n");
         }
       }

       __syncthreads();
     }
   };

 protected:

   //
   // Data members
   //

   /// Reference to the shared storage object supplied at construction
   SharedStorage &shared_storage_;

   /// Stores a warp's fragment of accumulators to SMEM
   WarpTileIterator warp_tile_iterator_;

 public:

   /// Constructor: binds the shared storage and advances the warp tile iterator to the tile
   /// owned by the calling warp.
   ///
   ///   shared_storage - shared storage object
   ///   thread_idx     - not used by this constructor; presumably the thread's ID within the
   ///                    threadblock (confirm against callers)
   ///   warp_idx       - index of the warp, decomposed below into (m, n, k) positions
   ///   lane_idx       - forwarded to the warp tile iterator
   CUTLASS_DEVICE
   EpilogueBase(
     SharedStorage &shared_storage,
     int thread_idx,
     int warp_idx,
     int lane_idx
   ):
     shared_storage_(shared_storage),
     warp_tile_iterator_(shared_storage.reference(), lane_idx) {

     // Number of warps tiling the M-by-N plane of a single K partition.
     int const kWarpsPerPartition = WarpCount::kM * WarpCount::kN;

     // Split the linear warp index: the quotient selects the K partition, the remainder is
     // the warp's linear position within that partition's M-by-N plane.
     int const partition_idx = warp_idx / kWarpsPerPartition;
     int const plane_idx = warp_idx % kWarpsPerPartition;

     // Within the plane, M varies fastest.
     int const m_idx = plane_idx % WarpCount::kM;
     int const n_idx = plane_idx / WarpCount::kM;

     // K partitions are stacked along the row dimension, each occupying WarpCount::kM tile rows.
     MatrixCoord tile_offset{partition_idx * WarpCount::kM + m_idx, n_idx};

     warp_tile_iterator_.add_tile_offset(tile_offset);
   }
 };


 } // namespace threadblock
 } // namespace epilogue
 } // namespace cutlass

cutlass::gemm::GemmShape::kM
static int const kM
Definition: include/cutlass/gemm/gemm.h:58

cutlass::MatrixShape
Describes the size of a matrix tile.
Definition: matrix_shape.h:42

cutlass
Definition: aligned_buffer.h:35

cutlass::epilogue::threadblock::EpilogueBase::SharedStorage::debug_print
CUTLASS_DEVICE void debug_print()
Definition: epilogue_base.h:149

cutlass::MatrixShape::kColumn
static int const kColumn
columns of a matrix
Definition: matrix_shape.h:44

cutlass::epilogue::threadblock::EpilogueBase::warp_tile_iterator_
WarpTileIterator warp_tile_iterator_
Stores a warp's fragment of accumulators to SMEM.
Definition: epilogue_base.h:176

cutlass::epilogue::threadblock::EpilogueBase::shared_storage_
SharedStorage & shared_storage_
Definition: epilogue_base.h:173

pitch_linear_thread_map.h
Templates implementing how threads are mapped to a given tile.

cutlass::epilogue::threadblock::EpilogueBase::WarpMmaOperator
WarpMmaOperator_ WarpMmaOperator
Definition: epilogue_base.h:71

cutlass::epilogue::threadblock::EpilogueBase::SharedStorage
Shared storage allocation needed by the epilogue.
Definition: epilogue_base.h:97

cutlass::epilogue::threadblock::EpilogueBase::SharedStorage::data
CUTLASS_DEVICE Element * data()
Returns a pointer to the shared memory buffer.
Definition: epilogue_base.h:136

gemm.h
Defines common types used for all GEMM-like operators.

cutlass::epilogue::threadblock::EpilogueBase::ElementAccumulator
typename AccumulatorTile::Element ElementAccumulator
Accumulator element.
Definition: epilogue_base.h:84

cutlass::epilogue::threadblock::EpilogueBase::SharedStorage::TensorRef
typename WarpTileIterator::TensorRef TensorRef
Tensor reference to shared memory allocation.
Definition: epilogue_base.h:107

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

cutlass::gemm::GemmShape::kK
static int const kK
Definition: include/cutlass/gemm/gemm.h:60

tensor.h
Defines layout functions used by TensorRef and derived classes for common 4-D and 5-D tensor formats...

matrix_shape.h
Defines a Shape template for matrix tiles.

cutlass::epilogue::threadblock::EpilogueBase::kPartitionsK
static int const kPartitionsK
Definition: epilogue_base.h:72

cutlass::epilogue::threadblock::EpilogueBase::SharedStorage::Element
typename WarpTileIterator::Element Element
Element type of shared memory.
Definition: epilogue_base.h:104

tensor_coord.h
Defines a canonical coordinate for rank=4 tensors offering named indices.

cutlass::epilogue::threadblock::EpilogueBase::SharedStorage::storage
AlignedBuffer< Element, StorageShape::kCount > storage
Definition: epilogue_base.h:128

cutlass::MatrixShape::kRow
static int const kRow
rows of a matrix
Definition: matrix_shape.h:43

aligned_buffer.h
AlignedBuffer is a container for trivially copyable elements suitable for use in unions and shared me...

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::AlignedBuffer
Modifies semantics of cutlass::Array<> to provide guaranteed alignment.
Definition: aligned_buffer.h:45

cutlass::gemm::GemmShape
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57

cutlass::AlignedBuffer::data
CUTLASS_HOST_DEVICE pointer data()
Definition: aligned_buffer.h:84

cutlass::epilogue::threadblock::EpilogueBase::SharedStorage::Layout
typename WarpTileIterator::Layout Layout
Layout of shared memory allocation.
Definition: epilogue_base.h:110

cutlass::epilogue::threadblock::EpilogueBase::AccumulatorFragmentIterator
AccumulatorFragmentIterator_ AccumulatorFragmentIterator
Definition: epilogue_base.h:73

cutlass::layout::RowMajor
Mapping function for row-major matrices.
Definition: layout/matrix.h:50

vector.h
Defines layout functions used for rank=1 vectors.

cutlass::epilogue::threadblock::EpilogueBase::Shape
Shape_ Shape
Definition: epilogue_base.h:70

cutlass::epilogue::threadblock::EpilogueBase
Base class for epilogues defining warp-level.
Definition: epilogue_base.h:67

cutlass::layout::RowMajor::packed
static CUTLASS_HOST_DEVICE RowMajor packed(MatrixCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: layout/matrix.h:93

cutlass::epilogue::threadblock::EpilogueBase::Padding
Padding_ Padding
Definition: epilogue_base.h:75

cutlass::epilogue::threadblock::EpilogueBase::EpilogueBase
CUTLASS_DEVICE EpilogueBase(SharedStorage &shared_storage, int thread_idx, int warp_idx, int lane_idx)
Constructor.
Definition: epilogue_base.h:182

cutlass::epilogue::threadblock::EpilogueBase::WarpTileIterator
WarpTileIterator_ WarpTileIterator
Definition: epilogue_base.h:74

cutlass::epilogue::threadblock::EpilogueBase::AccumulatorTile
typename AccumulatorFragmentIterator::AccumulatorTile AccumulatorTile
The complete warp-level accumulator tile.
Definition: epilogue_base.h:81

cutlass.h
Basic include for CUTLASS.

cutlass::MatrixCoord
Definition: matrix_coord.h:39

cutlass::epilogue::threadblock::EpilogueBase::SharedStorage::reference
CUTLASS_DEVICE TensorRef reference()
Returns a tensor reference to the shared memory buffer.
Definition: epilogue_base.h:142

cutlass::gemm::GemmShape::kN
static int const kN
Definition: include/cutlass/gemm/gemm.h:59