cutlass/default__thread__map__tensor__op_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "predicated_tile_iterator.h"
 #include "cutlass/gemm/gemm.h"
 #include "cutlass/layout/pitch_linear.h"


 namespace cutlass {
 namespace epilogue {
 namespace threadblock {


 /// Defines the optimal thread map for TensorOp accumulator layouts.
 ///
 /// Template parameters:
 ///   ThreadblockShape_  - shape type; its kM and kN members are read here
 ///   WarpShape_         - shape type; its kM and kN members are read here
 ///   PartitionsK        - used as the third extent of Detail::WarpCount
 ///   Element_           - element type; sizeof_bits<Element_>::value is forwarded to the thread map
 ///   ElementsPerAccess  - forwarded unchanged to the thread map
 ///
 /// The resulting thread map is exposed as the nested alias `Type`.
 template <
   typename ThreadblockShape_,
   typename WarpShape_,
   int PartitionsK,
   typename Element_,
   int ElementsPerAccess
 >
 struct DefaultThreadMapTensorOp {

   using ThreadblockShape = ThreadblockShape_;
   using WarpShape = WarpShape_;
   static int const kPartitionsK = PartitionsK;
   using Element = Element_;
   static int const kElementsPerAccess = ElementsPerAccess;

   //
   // Definitions
   //

   /// Compile-time quantities derived from the template arguments.
   struct Detail {

     /// Tensor Operations fundamentally perform operations on 8 rows
     static int const kTensorOpRows = 8;

     /// Multiplier applied to the warp count to obtain kThreads
     static int const kWarpSize = 32;

     // WarpCount below uses integer division along both M and N, so both
     // extents must divide evenly. The second condition checks kN; testing kM
     // twice would let a non-divisible N extent be silently truncated.
     static_assert(
       !(ThreadblockShape::kM % WarpShape::kM) &&
       !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");

     /// Number of warps along M and N, with kPartitionsK as the third extent
     using WarpCount = gemm::GemmShape<
       ThreadblockShape::kM / WarpShape::kM,
       ThreadblockShape::kN / WarpShape::kN,
       kPartitionsK
     >;

     /// Number of participating threads
     static int const kThreads = WarpCount::kCount * kWarpSize;
   };

   //
   // ThreadMap
   //

   /// The thread map: OutputTileOptimalThreadMap instantiated with two
   /// OutputTileShape descriptors, the thread count, the access width in
   /// elements, and the element size in bits.
   using Type = OutputTileOptimalThreadMap <
     OutputTileShape<ThreadblockShape::kN, Detail::kTensorOpRows, Detail::WarpCount::kM, 1, 1>,
     OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>,
     Detail::kThreads,
     kElementsPerAccess,
     sizeof_bits<Element>::value
   >;
 };


 /// Defines the optimal thread map for TensorOp accumulator layouts
 /// (interleaved variant, built on InterleavedOutputTileThreadMap).
 ///
 /// Template parameters:
 ///   ThreadblockShape_  - shape type; its kM and kN members are read here
 ///   WarpShape_         - shape type; its kM and kN members are read here
 ///   PartitionsK        - used as the third extent of Detail::WarpCount
 ///   Element_           - element type; sizeof_bits<Element_>::value is forwarded to the thread map
 ///   ElementsPerAccess  - forwarded unchanged to the thread map
 ///   InterleavedK       - divisor applied to WarpShape::kN in the second shape argument
 ///
 /// The resulting thread map is exposed as the nested alias `Type`.
 template <typename ThreadblockShape_, typename WarpShape_, int PartitionsK,
           typename Element_, int ElementsPerAccess, int InterleavedK>
 struct DefaultInterleavedThreadMapTensorOp {
   using ThreadblockShape = ThreadblockShape_;
   using WarpShape = WarpShape_;
   static int const kPartitionsK = PartitionsK;
   using Element = Element_;
   static int const kElementsPerAccess = ElementsPerAccess;
   static int const kInterleavedK = InterleavedK;

   //
   // Definitions
   //

   /// Compile-time quantities derived from the template arguments.
   struct Detail {
     /// Tensor Operations fundamentally perform operations on 8 rows
     static int const kTensorOpRows = 8;

     /// Multiplier applied to the warp count to obtain kThreads
     static int const kWarpSize = 32;

     // WarpCount below uses integer division along both M and N, so both
     // extents must divide evenly. The second condition checks kN; testing kM
     // twice would let a non-divisible N extent be silently truncated.
     static_assert(!(ThreadblockShape::kM % WarpShape::kM) &&
                       !(ThreadblockShape::kN % WarpShape::kN),
                   "Divisibility");

     /// Number of warps along M and N, with kPartitionsK as the third extent
     using WarpCount =
         gemm::GemmShape<ThreadblockShape::kM / WarpShape::kM,
                         ThreadblockShape::kN / WarpShape::kN, kPartitionsK>;

     /// Number of participating threads
     static int const kThreads = WarpCount::kCount * kWarpSize;
   };

   //
   // ThreadMap
   //

   /// The thread map: InterleavedOutputTileThreadMap instantiated with the
   /// warp counts along M and N, a per-warp shape
   /// (WarpShape::kM / kTensorOpRows by WarpShape::kN / kInterleavedK), the
   /// thread count, the access width in elements, and the element size in bits.
   using Type = InterleavedOutputTileThreadMap<
       layout::PitchLinearShape<Detail::WarpCount::kM, Detail::WarpCount::kN>,
       layout::PitchLinearShape<WarpShape::kM / Detail::kTensorOpRows,
                                WarpShape::kN / kInterleavedK>,
       Detail::kThreads, kElementsPerAccess, sizeof_bits<Element>::value>;
 };


 } // namespace threadblock
 } // namespace epilogue
 } // namespace cutlass

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Detail
Definition: default_thread_map_tensor_op.h:64

cutlass::epilogue::threadblock::OutputTileOptimalThreadMap
Definition: output_tile_thread_map.h:228

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Detail::kWarpSize
static int const kWarpSize
Definition: default_thread_map_tensor_op.h:68

cutlass
Definition: aligned_buffer.h:35

cutlass::epilogue::threadblock::OutputTileShape
Tuple defining point in output tile.
Definition: output_tile_thread_map.h:57

predicated_tile_iterator.h
Epilogue for threadblock scoped GEMMs using Tensor Ops.

cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp::Detail
Definition: default_thread_map_tensor_op.h:116

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::kPartitionsK
static int const kPartitionsK
Definition: default_thread_map_tensor_op.h:56

gemm.h
Defines common types used for all GEMM-like operators.

cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp::Element
Element_ Element
Definition: default_thread_map_tensor_op.h:108

cutlass::gemm::GemmShape::kCount
static int const kCount
Definition: include/cutlass/gemm/gemm.h:67

cutlass::epilogue::threadblock::InterleavedOutputTileThreadMap
Definition: output_tile_thread_map.h:442

cutlass::layout::PitchLinearShape
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Detail::kThreads
static int const kThreads
Number of participating threads.
Definition: default_thread_map_tensor_op.h:82

cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp
Defines the optimal thread map for TensorOp accumulator layouts.
Definition: default_thread_map_tensor_op.h:104

cutlass::sizeof_bits
Defines the size of an element in bits.
Definition: numeric_types.h:42

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp
Defines the optimal thread map for TensorOp accumulator layouts.
Definition: default_thread_map_tensor_op.h:52

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Detail::kTensorOpRows
static int const kTensorOpRows
Tensor Operations fundamentally perform operations on 8 rows.
Definition: default_thread_map_tensor_op.h:67

cutlass::gemm::GemmShape
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57

cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp::ThreadblockShape
ThreadblockShape_ ThreadblockShape
Definition: default_thread_map_tensor_op.h:105

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Element
Element_ Element
Definition: default_thread_map_tensor_op.h:57

cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp::WarpShape
WarpShape_ WarpShape
Definition: default_thread_map_tensor_op.h:106

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::WarpShape
WarpShape_ WarpShape
Definition: default_thread_map_tensor_op.h:55

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::ThreadblockShape
ThreadblockShape_ ThreadblockShape
Definition: default_thread_map_tensor_op.h:54

pitch_linear.h
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::kElementsPerAccess
static int const kElementsPerAccess
Definition: default_thread_map_tensor_op.h:58