cutlass/interleaved__epilogue_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include <assert.h>

 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/array.h"
 #include "cutlass/layout/vector.h"
 #include "cutlass/layout/tensor.h"
 #include "cutlass/tensor_coord.h"
 #include "cutlass/aligned_buffer.h"

 #include "cutlass/gemm/gemm.h"

 #include "cutlass/transform/pitch_linear_thread_map.h"
 #include "cutlass/transform/threadblock/regular_tile_iterator.h"

 #include "cutlass/epilogue/threadblock/epilogue_base.h"
 #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"


 namespace cutlass {
 namespace epilogue {
 namespace threadblock {


 /// Epilogue operator without splitk.
 ///
 /// Walks the warp-level accumulator tile fragment by fragment, optionally
 /// loads the matching source fragment, combines both through the output
 /// operator, and stores each result through the destination iterator.
 template <
     /// Threadblock-level tile shape (exposes kM / kN)
     typename Shape_,
     /// Warp-level MMA operator (exposes Shape::kM / Shape::kN)
     typename WarpMmaOperator_,
     /// Number of partitions of the K dimension
     int PartitionsK,
     /// Tile iterator used both to read the source and to write the destination
     typename OutputTileIterator_,
     /// Iterator yielding fragments of the accumulator tile
     typename AccumulatorFragmentIterator_,
     /// Output operator applied to (accumulator, source) vector pairs
     typename OutputOp_,
     /// Interleaving factor passed to layout::ColumnMajorInterleaved
     int InterleavedK,
     /// When true, the source tensor is never loaded
     bool IsBetaZero = false>
 class InterleavedEpilogue {
  public:
   //
   // Template arguments re-exported under their public names
   //

   using Shape = Shape_;
   using WarpMmaOperator = WarpMmaOperator_;
   static int const kPartitionsK = PartitionsK;
   using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
   using OutputTileIterator = OutputTileIterator_;
   using OutputOp = OutputOp_;

   /// Interleaved column-major layout selected by InterleavedK
   using Layout = layout::ColumnMajorInterleaved<InterleavedK>;

   /// The complete warp-level accumulator tile
   using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;

   /// Accumulator element
   using ElementAccumulator = typename AccumulatorTile::Element;

   /// Output element
   using ElementOutput = typename OutputTileIterator::Element;

   /// Output access size
   static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;

   /// Tensor reference to destination tensor
   using TensorRef = typename OutputTileIterator::TensorRef;

   /// Tensor reference to sync tensor
   using SyncTensorRef =
       typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;

   /// Const tensor reference to source tensor
   using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;

   /// Array type used to output
   using OutputAccessType = Array<ElementOutput, kElementsPerAccess>;

   /// Array type used by output functor
   using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;

   /// Threadblock tile extent divided by warp tile extent along M and N, with
   /// kPartitionsK as the third extent
   using WarpCount =
       gemm::GemmShape<Shape::kM / WarpMmaOperator::Shape::kM,
                       Shape::kN / WarpMmaOperator::Shape::kN,
                       kPartitionsK>;

  public:
   static_assert(kElementsPerAccess != 0, "This must not be zero.");

   // A fragment must split into a whole number of vector accesses
   static_assert((OutputTileIterator::Fragment::kElements % kElementsPerAccess) == 0,
                 "Divisibility");

   /// Shared storage allocation needed by the epilogue (declares no members)
   struct SharedStorage {};


  public:
   /// Constructor. None of the arguments are used by this epilogue.
   CUTLASS_DEVICE
   InterleavedEpilogue(
       SharedStorage &shared_storage,
       int thread_idx,
       int warp_idx,
       int lane_idx
     ) {}

   /// Streams the result to global memory.
   ///
   /// \param output_op             operator combining accumulator and source vectors
   /// \param destination_iterator  iterator the computed fragments are stored through
   /// \param accumulators          complete warp-level accumulator tile
   /// \param source_iterator       iterator the source fragments are loaded through
   CUTLASS_DEVICE
   void operator()(
     OutputOp const &output_op,
     OutputTileIterator destination_iterator,
     AccumulatorTile const &accumulators,
     OutputTileIterator source_iterator) {

     // With IsBetaZero the source is never loaded, so an output operator that
     // still asks for it indicates a misconfiguration.
     if (IsBetaZero && output_op.is_source_needed()) {
       assert(0);
     }

     // The source is loadable but not consumed by the operator: clear the
     // iterator's mask (presumably predicating off the loads issued below).
     if (!IsBetaZero && !output_op.is_source_needed()) {
       source_iterator.clear_mask();
     }

     // Cleared once up front; it stays cleared whenever nothing is loaded into it.
     typename OutputTileIterator::Fragment source_fragment;
     source_fragment.clear();

     // Fragment-wise view over the accumulator tile
     AccumulatorFragmentIterator accum_fragment_iterator(accumulators);

     CUTLASS_PRAGMA_UNROLL
     for (int step = 0; step < OutputTileIterator::kIterations; ++step) {

       // 1. Fetch the source fragment for this step (skipped when beta is zero)
       if (!IsBetaZero) {
         source_iterator.set_iteration_index(step);
         source_iterator.load(source_fragment);
         ++source_iterator;
       }

       // 2. Fetch the accumulator fragment for this step
       typename AccumulatorFragmentIterator::Fragment accum_fragment;
       accum_fragment_iterator.load(accum_fragment);
       ++accum_fragment_iterator;

       // 3. Combine accumulator and source into the output fragment
       typename OutputTileIterator::Fragment output_fragment;
       apply_output_operator_(output_op, output_fragment, accum_fragment, source_fragment);

       // 4. Write the output fragment through the destination iterator
       destination_iterator.set_iteration_index(step);
       destination_iterator.store(output_fragment);
       ++destination_iterator;
     }
   }

  private:
   /// Applies the output operator to one fragment, one vector access at a time.
   ///
   /// All three fragments are reinterpreted as arrays of kElementsPerAccess-wide
   /// vectors; the operator is invoked once per vector of the output fragment.
   CUTLASS_DEVICE
   void apply_output_operator_(
     OutputOp const &output_op,
       typename OutputTileIterator::Fragment &output_fragment,
       typename AccumulatorFragmentIterator::Fragment const
           &aligned_accum_fragment,
       typename OutputTileIterator::Fragment const &source_fragment) {

     // Number of vector accesses that make up one output fragment
     int const kVectorCount =
         OutputTileIterator::Fragment::kElements / kElementsPerAccess;

     OutputAccessType *out_vectors =
         reinterpret_cast<OutputAccessType *>(&output_fragment);

     AccumulatorAccessType const *accum_vectors =
         reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);

     OutputAccessType const *source_vectors =
         reinterpret_cast<OutputAccessType const *>(&source_fragment);

     CUTLASS_PRAGMA_UNROLL
     for (int v = 0; v < kVectorCount; ++v) {
       out_vectors[v] = output_op(accum_vectors[v], source_vectors[v]);
     }
   }
 };


 } // namespace threadblock
 } // namespace epilogue
 } // namespace cutlass

cutlass::epilogue::threadblock::InterleavedEpilogue::Shape
Shape_ Shape
Definition: interleaved_epilogue.h:81

cutlass
Definition: aligned_buffer.h:35

cutlass::epilogue::threadblock::InterleavedEpilogue::ElementAccumulator
typename AccumulatorTile::Element ElementAccumulator
Accumulator element.
Definition: interleaved_epilogue.h:95

pitch_linear_thread_map.h
Templates implementing how threads are mapped to a given tile.

cutlass::epilogue::threadblock::InterleavedEpilogue::InterleavedEpilogue
CUTLASS_DEVICE InterleavedEpilogue(SharedStorage &shared_storage, int thread_idx, int warp_idx, int lane_idx)
Constructor.
Definition: interleaved_epilogue.h:141

cutlass::epilogue::threadblock::InterleavedEpilogue::AccumulatorTile
typename AccumulatorFragmentIterator::AccumulatorTile AccumulatorTile
The complete warp-level accumulator tile.
Definition: interleaved_epilogue.h:92

predicated_tile_iterator.h
Epilogue for threadblock scoped GEMMs using Tensor Ops.

cutlass::epilogue::threadblock::InterleavedEpilogue::AccumulatorAccessType
Array< ElementAccumulator, OutputTileIterator::kElementsPerAccess > AccumulatorAccessType
Array type used by output functor.
Definition: interleaved_epilogue.h:119

cutlass::epilogue::threadblock::InterleavedEpilogue
Epilogue operator without splitk.
Definition: interleaved_epilogue.h:79

cutlass::epilogue::threadblock::InterleavedEpilogue::OutputAccessType
Array< typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess > OutputAccessType
Array type used to output.
Definition: interleaved_epilogue.h:115

cutlass::epilogue::threadblock::InterleavedEpilogue::OutputOp
OutputOp_ OutputOp
Definition: interleaved_epilogue.h:86

gemm.h
Defines common types used for all GEMM-like operators.

cutlass::epilogue::threadblock::InterleavedEpilogue::ConstTensorRef
typename OutputTileIterator::ConstTensorRef ConstTensorRef
Const tensor reference to source tensor.
Definition: interleaved_epilogue.h:111

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe to use in a union.

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

tensor.h
Defines layout functions used by TensorRef and derived classes for common 4-D and 5-D tensor formats.

cutlass::epilogue::threadblock::InterleavedEpilogue::TensorRef
typename OutputTileIterator::TensorRef TensorRef
Tensor reference to destination tensor.
Definition: interleaved_epilogue.h:104

cutlass::epilogue::threadblock::InterleavedEpilogue::SharedStorage
Shared storage allocation needed by the epilogue.
Definition: interleaved_epilogue.h:135

cutlass::epilogue::threadblock::InterleavedEpilogue::WarpMmaOperator
WarpMmaOperator_ WarpMmaOperator
Definition: interleaved_epilogue.h:82

cutlass::epilogue::threadblock::InterleavedEpilogue::SyncTensorRef
typename cutlass::TensorRef< int, cutlass::layout::PackedVectorLayout > SyncTensorRef
Tensor reference to sync tensor.
Definition: interleaved_epilogue.h:108

cutlass::TensorRef
Definition: tensor_ref.h:146

tensor_coord.h
Defines a canonical coordinate for rank=4 tensors offering named indices.

aligned_buffer.h
AlignedBuffer is a container for trivially copyable elements suitable for use in unions and shared memory.

cutlass::epilogue::threadblock::InterleavedEpilogue::OutputTileIterator
OutputTileIterator_ OutputTileIterator
Definition: interleaved_epilogue.h:85

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::gemm::GemmShape
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::epilogue::threadblock::InterleavedEpilogue::operator()
CUTLASS_DEVICE void operator()(OutputOp const &output_op, OutputTileIterator destination_iterator, AccumulatorTile const &accumulators, OutputTileIterator source_iterator)
Streams the result to global memory.
Definition: interleaved_epilogue.h:150

cutlass::epilogue::threadblock::InterleavedEpilogue::kElementsPerAccess
static int const kElementsPerAccess
Output access size.
Definition: interleaved_epilogue.h:101

vector.h
Defines layout functions used for rank=1 vectors.

regular_tile_iterator.h
Templates implementing storing of tiles from pitch-linear rank=2 tensors.

epilogue_base.h
Epilogue for threadblock scoped GEMMs using Tensor Ops.

cutlass::layout::ColumnMajorInterleaved
Definition: layout/matrix.h:343

cutlass::epilogue::threadblock::InterleavedEpilogue::kPartitionsK
static int const kPartitionsK
Definition: interleaved_epilogue.h:83

cutlass::epilogue::threadblock::InterleavedEpilogue::AccumulatorFragmentIterator
AccumulatorFragmentIterator_ AccumulatorFragmentIterator
Definition: interleaved_epilogue.h:84

cutlass::epilogue::threadblock::InterleavedEpilogue::ElementOutput
typename OutputTileIterator::Element ElementOutput
Output element.
Definition: interleaved_epilogue.h:98

cutlass.h
Basic include for CUTLASS.