CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
default_thread_map_wmma_tensor_op.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include "cutlass/numeric_types.h"
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/gemm/gemm.h"

#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"

////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {

////////////////////////////////////////////////////////////////////////////////

/// Defines the optimal thread map for Wmma TensorOp accumulator layouts
template <
  typename ThreadblockShape_,
  typename WarpShape_,
  typename InstructionShape_,
  int PartitionsK,
  typename Element_,
  int ElementsPerAccess
>
struct DefaultThreadMapWmmaTensorOp {

  using ThreadblockShape = ThreadblockShape_;
  using WarpShape = WarpShape_;
  using InstructionShape = InstructionShape_;
  static int const kPartitionsK = PartitionsK;
  using Element = Element_;
  static int const kElementsPerAccess = ElementsPerAccess;

  //
  // Definitions
  //

  struct Detail {

    /// Wmma Tensor Operations fundamentally perform operations on InstructionShape::kM rows
    static int const kTensorOpRows = InstructionShape::kM;
    static int const kWarpSize = 32;

    static_assert(
      !(ThreadblockShape::kM % WarpShape::kM) &&
      !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");

    /// Number of warps
    using WarpCount = gemm::GemmShape<
      ThreadblockShape::kM / WarpShape::kM,
      ThreadblockShape::kN / WarpShape::kN,
      kPartitionsK
    >;

    /// Number of participating threads
    static int const kThreads = WarpCount::kCount * kWarpSize;
  };
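
  // Example (hypothetical shapes, not taken from this header): with
  // ThreadblockShape = GemmShape<64, 64, 32>, WarpShape = GemmShape<32, 32, 32>,
  // and kPartitionsK = 1, Detail::WarpCount is GemmShape<2, 2, 1>, so
  // Detail::kThreads = 4 * 32 = 128 threads cooperate in the epilogue.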

  //
  // ThreadMap
  //

  /// ThreadMap satisfying the OutputTileThreadMap concept
  using Type = OutputTileOptimalThreadMap <
    OutputTileShape<ThreadblockShape::kN, Detail::kTensorOpRows, Detail::WarpCount::kM, 1, 1>,
    OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>,
    Detail::kThreads,
    kElementsPerAccess,
    sizeof_bits<Element>::value
  >;
};

////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

////////////////////////////////////////////////////////////////////////////////
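
As a quick orientation, the sketch below shows one way this trait might be instantiated. The tile shapes, element type, access width, and the include path are illustrative assumptions inferred from the namespaces and the CUTLASS GemmShape convention; they are not taken from this file.

#include "cutlass/gemm/gemm.h"
#include "cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h"  // assumed path

// Hypothetical tile sizes, chosen only to make the arithmetic concrete.
using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape        = cutlass::gemm::GemmShape<64, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 16, 16>;   // one WMMA operation

using DefaultMap = cutlass::epilogue::threadblock::DefaultThreadMapWmmaTensorOp<
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    1,        // PartitionsK (assumed)
    float,    // Element written by the epilogue (assumed)
    4         // ElementsPerAccess (assumed)
>;

// WarpCount is (128/64) x (128/64) x 1 = 2 x 2 x 1, so 4 warps of 32 threads
// (128 threads total) cooperate when storing the accumulator tile.
static_assert(DefaultMap::Detail::kThreads == 128, "expected 128 participating threads");

// DefaultMap::Type is the OutputTileOptimalThreadMap consumed by the epilogue's
// output tile iterator.
using ThreadMap = DefaultMap::Type;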