cutlass/epilogue_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include <assert.h>

 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/array.h"
 #include "cutlass/layout/vector.h"
 #include "cutlass/layout/tensor.h"
 #include "cutlass/tensor_coord.h"
 #include "cutlass/aligned_buffer.h"
 #include "cutlass/functional.h"

 #include "cutlass/gemm/gemm.h"

 #include "cutlass/transform/pitch_linear_thread_map.h"
 #include "cutlass/transform/threadblock/regular_tile_iterator.h"

 #include "cutlass/epilogue/threadblock/epilogue_base.h"
 #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"


 namespace cutlass {
 namespace epilogue {
 namespace threadblock {


 /// Epilogue operator without splitk.
 ///
 /// Derives from EpilogueBase and, per output-tile iteration, stages one warp-level
 /// accumulator fragment through shared memory, reloads it with SharedLoadIterator,
 /// optionally reduces across k-slices, applies the output operator together with a
 /// source fragment, and stores the result through the destination iterator.
 ///
 /// \tparam Shape_                        Tile shape; forwarded unchanged to EpilogueBase.
 /// \tparam WarpMmaOperator_              Warp-level MMA operator type; forwarded to EpilogueBase.
 /// \tparam PartitionsK                   Number of k-slices whose partial results are summed
 ///                                       when it is greater than one.
 /// \tparam OutputTileIterator_           Iterator type used for both the source and the
 ///                                       destination tiles.
 /// \tparam AccumulatorFragmentIterator_  Iterator over the warp-level accumulator fragments.
 /// \tparam WarpTileIterator_             Iterator type of the inherited warp_tile_iterator_,
 ///                                       which stores accumulator fragments to shared memory.
 /// \tparam SharedLoadIterator_           Iterator that loads fragments back from shared memory.
 /// \tparam OutputOp_                     Output operator; must provide is_source_needed() and be
 ///                                       callable as output_op(accumulators, source).
 /// \tparam Padding_                      Forwarded unchanged to EpilogueBase; not otherwise used
 ///                                       in this class.
 template <
   typename Shape_,
   typename WarpMmaOperator_,
   int PartitionsK,
   typename OutputTileIterator_,
   typename AccumulatorFragmentIterator_,
   typename WarpTileIterator_,
   typename SharedLoadIterator_,
   typename OutputOp_,
   typename Padding_
 >
 class Epilogue :
   public EpilogueBase<
     Shape_,
     WarpMmaOperator_,
     PartitionsK,
     AccumulatorFragmentIterator_,
     WarpTileIterator_,
     Padding_> {

 public:

   /// Base class providing the shared storage type, the accumulator tile type,
   /// the warp count and the warp_tile_iterator_ member used below.
   using Base = EpilogueBase<
     Shape_,
     WarpMmaOperator_,
     PartitionsK,
     AccumulatorFragmentIterator_,
     WarpTileIterator_,
     Padding_>;

   // Re-export the template arguments under their public names.
   using Shape = Shape_;
   using WarpMmaOperator = WarpMmaOperator_;
   static int const kPartitionsK = PartitionsK;
   using OutputTileIterator = OutputTileIterator_;
   using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
   using WarpTileIterator = WarpTileIterator_;
   using SharedLoadIterator = SharedLoadIterator_;
   using OutputOp = OutputOp_;
   using Padding = Padding_;

   /// Layout type (row-major); used in this class only to obtain LongIndex.
   using Layout = layout::RowMajor;
   /// Long index type used for offsets.
   using LongIndex = typename Layout::LongIndex;

   /// The complete warp-level accumulator tile.
   using AccumulatorTile = typename Base::AccumulatorTile;

   /// Accumulator element.
   using ElementAccumulator = typename WarpTileIterator::Element;


   /// Output element.
   using ElementOutput = typename OutputTileIterator::Element;

   /// Output access size.
   static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;

   /// Tensor reference to destination tensor.
   using TensorRef = typename OutputTileIterator::TensorRef;

   /// Tensor reference to sync tensor.
   using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;

   /// Const tensor reference to source tensor.
   using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;

   /// Array type used to output.
   using OutputAccessType = Array<
     typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;

   /// Array type used by output functor.
   using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;

   /// Number of warps.
   using WarpCount = typename Base::WarpCount;

 public:


   // The fragment loaded back from shared memory and the output fragment are
   // reinterpreted side by side in apply_output_operator_(), so they must hold
   // the same number of elements.
   static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
     "Mismatch between shared load iterator and output tile iterator.");

   // kElementsPerAccess is used as a divisor below and must be non-zero.
   static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");

   // The output fragment must split into a whole number of access-sized vectors.
   static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
     "Divisibility");

 private:

   /// Loads fragments from shared memory.
   SharedLoadIterator shared_load_iterator_;

 public:

   /// Constructor.
   ///
   /// Forwards all arguments to the EpilogueBase constructor and builds the
   /// shared load iterator from shared_storage.reference() and thread_idx.
   ///
   /// \param shared_storage  Shared storage allocation needed by the epilogue.
   /// \param thread_idx      Thread index; presumably the thread's index within the
   ///                        threadblock - confirm against callers.
   /// \param warp_idx        Warp index; forwarded to the base class only.
   /// \param lane_idx        Lane index; forwarded to the base class only.
   CUTLASS_DEVICE
   Epilogue(
     typename Base::SharedStorage &shared_storage,
     int thread_idx,
     int warp_idx,
     int lane_idx
   ):
     Base(shared_storage, thread_idx, warp_idx, lane_idx),
     shared_load_iterator_(shared_storage.reference(), thread_idx) { }

   /// Streams the result to global memory.
   ///
   /// Runs OutputTileIterator::kIterations iterations. Each iteration loads one
   /// source fragment, stages one accumulator fragment through shared memory,
   /// applies the output operator and stores one output fragment.
   ///
   /// \param output_op             Output operator applied to each access-sized vector of
   ///                              accumulator and source elements.
   /// \param destination_iterator  Iterator the output fragments are stored through. Taken
   ///                              by value, so the caller's iterator is not advanced.
   /// \param accumulators          The complete warp-level accumulator tile.
   /// \param source_iterator       Iterator the source fragments are loaded through. Taken
   ///                              by value; its mask is cleared when
   ///                              output_op.is_source_needed() returns false.
   CUTLASS_DEVICE
   void operator()(
     OutputOp const &output_op,
     OutputTileIterator destination_iterator,
     AccumulatorTile const &accumulators,
     OutputTileIterator source_iterator) {


     typename OutputTileIterator::Fragment source_fragment;

     // When the output operator does not need the source, clear the iterator's
     // mask. NOTE(review): presumably this turns the loads below into no-ops -
     // confirm against OutputTileIterator::clear_mask().
     if (!output_op.is_source_needed()) {
       source_iterator.clear_mask();
     }

     // Give the source fragment defined contents before the first load.
     source_fragment.clear();

     //
     // Iterator over warp-level accumulator fragment
     //

     AccumulatorFragmentIterator accum_fragment_iterator(accumulators);

     //
     // Iterate over accumulator tile
     //

     CUTLASS_PRAGMA_UNROLL
     for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {

       //
       // Load the source
       //

       source_iterator.load(source_fragment);
       ++source_iterator;

       //
       // Convert and store fragment
       //

       // Barrier before writing shared memory; on iterations after the first this
       // separates the previous iteration's shared-memory loads from the store below.
       __syncthreads();

       typename AccumulatorFragmentIterator::Fragment accum_fragment;

       accum_fragment_iterator.load(accum_fragment);
       ++accum_fragment_iterator;

       // Store this warp's accumulator fragment to shared memory.
       this->warp_tile_iterator_.store(accum_fragment);

       // Barrier between the shared-memory store above and the loads below.
       __syncthreads();

       //
       // Load fragments from shared memory
       //

       // One fragment per k-slice; element 0 doubles as the running sum.
       typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];

       shared_load_iterator_.load(aligned_accum_fragment[0]);

       // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
       if (kPartitionsK > 1)
       {
         plus <typename SharedLoadIterator::Fragment> add_fragments;
         // Offset between consecutive k-slices: the storage row count split evenly
         // across PartitionsK. NOTE(review): assumes the first coordinate passed to
         // add_tile_offset() is the row offset - confirm against SharedLoadIterator.
         const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;

         CUTLASS_PRAGMA_UNROLL
         for ( int i = 1; i < kPartitionsK; ++i) {
           // Step to the next k-slice, load it, and accumulate it into fragment 0.
           shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
           shared_load_iterator_.load(aligned_accum_fragment[i]);
           aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
         }

         // Undo the (kPartitionsK - 1) steps taken above so the iterator is back at
         // its starting position for the next outer iteration.
         shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
       }

       //
       // Compute the output result
       //

       typename OutputTileIterator::Fragment output_fragment;

       apply_output_operator_(output_fragment, output_op, aligned_accum_fragment[0], source_fragment);


       //
       // Store the final result
       //

       destination_iterator.store(output_fragment);
       ++destination_iterator;

     }
   }

 private:

   /// Applies the output operator to one fragment, one access-sized vector at a time.
   ///
   /// \param output_fragment         Receives the results of output_op.
   /// \param output_op               Functor called as output_op(accumulators, source).
   /// \param aligned_accum_fragment  Accumulators loaded from shared memory (already
   ///                                reduced across k-slices by the caller when
   ///                                kPartitionsK > 1).
   /// \param source_fragment         Source elements passed as the second argument of
   ///                                output_op.
   CUTLASS_DEVICE
   void apply_output_operator_(
     typename OutputTileIterator::Fragment &output_fragment,
     OutputOp const &output_op,
     typename SharedLoadIterator::Fragment const &aligned_accum_fragment,
     typename OutputTileIterator::Fragment const &source_fragment) {

     // View each fragment as an array of kElementsPerAccess-wide vectors.
     OutputAccessType *output_frag_ptr =
       reinterpret_cast<OutputAccessType *>(&output_fragment);

     AccumulatorAccessType const *compute_frag_ptr =
       reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);

     OutputAccessType const *source_frag_ptr =
       reinterpret_cast<OutputAccessType const *>(&source_fragment);

     // Number of vectors per fragment; the class-level static_assert guarantees
     // this division is exact.
     int const kOutputOpIterations =
       OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < kOutputOpIterations; ++i) {

       // Call the output operator
       output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]);
     }
   }
 };


 } // namespace threadblock
 } // namespace epilogue
 } // namespace cutlass

cutlass::layout::RowMajor::LongIndex
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:62

cutlass::epilogue::threadblock::Epilogue::LongIndex
typename Layout::LongIndex LongIndex
Definition: epilogue.h:105

cutlass::epilogue::threadblock::Epilogue::WarpCount
typename Base::WarpCount WarpCount
Number of warps.
Definition: epilogue.h:137

cutlass::epilogue::threadblock::SharedLoadIterator::Fragment
Array< Element, ThreadMap::Iterations::kColumn *ThreadMap::Iterations::kRow *ThreadMap::Iterations::kGroup *ThreadMap::Iterations::kCluster *ThreadMap::kElementsPerAccess > Fragment
Fragment object.
Definition: shared_load_iterator.h:91

cutlass
Definition: aligned_buffer.h:35

cutlass::epilogue::threadblock::EpilogueBase::warp_tile_iterator_
WarpTileIterator warp_tile_iterator_
Stores a warp's fragment of accumulators to SMEM.
Definition: epilogue_base.h:176

pitch_linear_thread_map.h
Templates implementing how threads are mapped to a given tile.

cutlass::epilogue::threadblock::EpilogueBase::SharedStorage
Shared storage allocation needed by the epilogue.
Definition: epilogue_base.h:97

cutlass::epilogue::threadblock::Epilogue::operator()
CUTLASS_DEVICE void operator()(OutputOp const &output_op, OutputTileIterator destination_iterator, AccumulatorTile const &accumulators, OutputTileIterator source_iterator)
Streams the result to global memory.
Definition: epilogue.h:170

cutlass::epilogue::threadblock::Epilogue::OutputTileIterator
OutputTileIterator_ OutputTileIterator
Definition: epilogue.h:96

predicated_tile_iterator.h
Epilogue for threadblock scoped GEMMs using Tensor Ops.

gemm.h
Defines common types used for all GEMM-like operators.

cutlass::epilogue::threadblock::Epilogue::Epilogue
CUTLASS_DEVICE Epilogue(typename Base::SharedStorage &shared_storage, int thread_idx, int warp_idx, int lane_idx)
Constructor.
Definition: epilogue.h:159

cutlass::epilogue::threadblock::Epilogue::Shape
Shape_ Shape
Definition: epilogue.h:93

cutlass::epilogue::threadblock::Epilogue::TensorRef
typename OutputTileIterator::TensorRef TensorRef
Tensor reference to destination tensor.
Definition: epilogue.h:121

cutlass::epilogue::threadblock::EpilogueBase::WarpCount
gemm::GemmShape< Shape::kM/WarpMmaOperator::Shape::kM, Shape::kN/WarpMmaOperator::Shape::kN, kPartitionsK > WarpCount
Number of warps.
Definition: epilogue_base.h:92

cutlass::plus
Definition: functional.h:46

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

tensor.h
Defines layout functions used by TensorRef and derived classes for common 4-D and 5-D tensor formats...

cutlass::epilogue::threadblock::Epilogue::kPartitionsK
static int const kPartitionsK
Definition: epilogue.h:95

cutlass::epilogue::threadblock::Epilogue::OutputOp
OutputOp_ OutputOp
Definition: epilogue.h:100

cutlass::TensorRef
Definition: tensor_ref.h:146

cutlass::epilogue::threadblock::Epilogue::Padding
Padding_ Padding
Definition: epilogue.h:101

tensor_coord.h
Defines a canonical coordinate for rank=4 tensors offering named indices.

aligned_buffer.h
AlignedBuffer is a container for trivially copyable elements suitable for use in unions and shared me...

cutlass::epilogue::threadblock::Epilogue::AccumulatorFragmentIterator
AccumulatorFragmentIterator_ AccumulatorFragmentIterator
Definition: epilogue.h:97

numeric_types.h
Top-level include for all CUTLASS numeric types.

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::epilogue::threadblock::Epilogue::ConstTensorRef
typename OutputTileIterator::ConstTensorRef ConstTensorRef
Const tensor reference to source tensor.
Definition: epilogue.h:127

cutlass::epilogue::threadblock::Epilogue::WarpTileIterator
WarpTileIterator_ WarpTileIterator
Definition: epilogue.h:98

cutlass::epilogue::threadblock::Epilogue::SharedLoadIterator
SharedLoadIterator_ SharedLoadIterator
Definition: epilogue.h:99

cutlass::layout::RowMajor
Mapping function for row-major matrices.
Definition: layout/matrix.h:50

cutlass::epilogue::threadblock::Epilogue
Epilogue operator without splitk.
Definition: epilogue.h:74

cutlass::epilogue::threadblock::Epilogue::ElementAccumulator
typename WarpTileIterator::Element ElementAccumulator
Accumulator element.
Definition: epilogue.h:111

vector.h
Defines layout functions used for rank=1 vectors.

regular_tile_iterator.h
Templates implementing storing of tiles from pitch-linear rank=2 tensors.

epilogue_base.h
Epilogue for threadblock scoped GEMMs using Tensor Ops.

cutlass::epilogue::threadblock::EpilogueBase
Base class for epilogues defining warp-level.
Definition: epilogue_base.h:67

cutlass::epilogue::threadblock::Epilogue::WarpMmaOperator
WarpMmaOperator_ WarpMmaOperator
Definition: epilogue.h:94

cutlass::epilogue::threadblock::Epilogue::AccumulatorTile
typename Base::AccumulatorTile AccumulatorTile
The complete warp-level accumulator tile.
Definition: epilogue.h:108

cutlass::epilogue::threadblock::Epilogue::OutputAccessType
Array< typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess > OutputAccessType
Array type used to output.
Definition: epilogue.h:131

cutlass::epilogue::threadblock::Epilogue::kElementsPerAccess
static int const kElementsPerAccess
Output access size.
Definition: epilogue.h:118

cutlass::epilogue::threadblock::EpilogueBase::AccumulatorTile
typename AccumulatorFragmentIterator::AccumulatorTile AccumulatorTile
The complete warp-level accumulator tile.
Definition: epilogue_base.h:81

cutlass::epilogue::threadblock::Epilogue::ElementOutput
typename OutputTileIterator::Element ElementOutput
Output element.
Definition: epilogue.h:115

cutlass.h
Basic include for CUTLASS.

cutlass::epilogue::threadblock::Epilogue::SyncTensorRef
typename cutlass::TensorRef< int, cutlass::layout::PackedVectorLayout > SyncTensorRef
Tensor reference to sync tensor.
Definition: epilogue.h:124

functional.h
Define basic numeric operators with specializations for Array<T, N>. SIMD-ize where possible...

cutlass::epilogue::threadblock::Epilogue::AccumulatorAccessType
Array< typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess > AccumulatorAccessType
Array type used by output functor.
Definition: epilogue.h:134