cutlass/default__thread__map__simt_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "predicated_tile_iterator.h"
 #include "cutlass/gemm/gemm.h"


 namespace cutlass {
 namespace epilogue {
 namespace threadblock {


 /// Defines the optimal thread map for SIMT accumulator layouts.
 ///
 /// Template parameters (requirements listed are those this struct itself relies on):
 ///   ThreadblockShape_  - threadblock tile shape; must expose integral kM and kN
 ///   WarpShape_         - warp tile shape; must expose integral kM and kN
 ///   MmaSimtPolicy_     - policy exposing WarpShape::kRow and LaneMmaShape::kM
 ///   PartitionsK        - used as the third extent of Detail::WarpCount
 ///   Element_           - element type; its width in bits (sizeof_bits) is forwarded to the thread map
 ///   ElementsPerAccess  - forwarded unchanged to OutputTileOptimalThreadMap
 template <
   typename ThreadblockShape_,
   typename WarpShape_,
   typename MmaSimtPolicy_,
   int PartitionsK,
   typename Element_,
   int ElementsPerAccess
 >
 struct DefaultThreadMapSimt {

   using ThreadblockShape = ThreadblockShape_;
   using WarpShape = WarpShape_;
   using MmaSimtPolicy = MmaSimtPolicy_;
   static int const kPartitionsK = PartitionsK;
   using Element = Element_;
   static int const kElementsPerAccess = ElementsPerAccess;

   //
   // Definitions
   //

   /// Compile-time quantities derived from the template arguments.
   struct Detail {

     static int const kWarpSize = 32;

     // WarpCount below divides the threadblock tile by the warp tile in both M and N,
     // so both extents must divide evenly; otherwise the integer division would
     // silently truncate the warp count.
     static_assert(
       !(ThreadblockShape::kM % WarpShape::kM) &&
       !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");

     /// Number of warps along M and N, with kPartitionsK as the third extent.
     using WarpCount = gemm::GemmShape<
       ThreadblockShape::kM / WarpShape::kM,
       ThreadblockShape::kN / WarpShape::kN,
       kPartitionsK
     >;

     /// Computes number of thread-level matrix multiplies needed to span a warp.
     static int const kGroupCount =
       WarpShape::kM / (MmaSimtPolicy::WarpShape::kRow * MmaSimtPolicy::LaneMmaShape::kM);

     /// Number of participating threads.
     static int const kThreads = WarpCount::kCount * kWarpSize;

     /// Number of iterations.
     static int const kIterations = MmaSimtPolicy::LaneMmaShape::kM * kGroupCount;
   };

   //
   // ThreadMap
   //

   /// Resulting thread map type, built from the shape/count tuples and Detail constants.
   using Type = OutputTileOptimalThreadMap<
     OutputTileShape<                          // Shape
       ThreadblockShape::kN,
       1,
       MmaSimtPolicy::WarpShape::kRow,
       Detail::WarpCount::kM,
       1>,
     OutputTileShape<                          // Count
       1,
       MmaSimtPolicy::LaneMmaShape::kM,
       Detail::kGroupCount,
       1,
       Detail::kIterations>,
     Detail::kThreads,
     kElementsPerAccess,
     sizeof_bits<Element>::value
   >;
 };


 } // namespace threadblock
 } // namespace epilogue
 } // namespace cutlass

cutlass::gemm::GemmShape::kM
static int const kM
Definition: include/cutlass/gemm/gemm.h:58

cutlass::epilogue::threadblock::OutputTileOptimalThreadMap
Definition: output_tile_thread_map.h:228

cutlass
Definition: aligned_buffer.h:35

cutlass::epilogue::threadblock::DefaultThreadMapSimt::ThreadblockShape
ThreadblockShape_ ThreadblockShape
Definition: default_thread_map_simt.h:54

cutlass::epilogue::threadblock::DefaultThreadMapSimt::MmaSimtPolicy
MmaSimtPolicy_ MmaSimtPolicy
Definition: default_thread_map_simt.h:56

cutlass::epilogue::threadblock::OutputTileShape
Tuple defining point in output tile.
Definition: output_tile_thread_map.h:57

cutlass::epilogue::threadblock::DefaultThreadMapSimt::Detail::kThreads
static int const kThreads
Number of participating threads.
Definition: default_thread_map_simt.h:85

predicated_tile_iterator.h
Epilogue for threadblock scoped GEMMs using Tensor Ops.

gemm.h
Defines common types used for all GEMM-like operators.

cutlass::gemm::GemmShape::kCount
static int const kCount
Definition: include/cutlass/gemm/gemm.h:67

cutlass::epilogue::threadblock::DefaultThreadMapSimt
Defines the optimal thread map for SIMT accumulator layouts.
Definition: default_thread_map_simt.h:52

cutlass::sizeof_bits
Defines the size of an element in bits.
Definition: numeric_types.h:42

cutlass::epilogue::threadblock::DefaultThreadMapSimt::Element
Element_ Element
Definition: default_thread_map_simt.h:58

cutlass::epilogue::threadblock::DefaultThreadMapSimt::kElementsPerAccess
static int const kElementsPerAccess
Definition: default_thread_map_simt.h:59

cutlass::epilogue::threadblock::DefaultThreadMapSimt::Detail::kIterations
static int const kIterations
Number of iterations.
Definition: default_thread_map_simt.h:88

cutlass::epilogue::threadblock::DefaultThreadMapSimt::Detail::kWarpSize
static int const kWarpSize
Definition: default_thread_map_simt.h:67

cutlass::epilogue::threadblock::DefaultThreadMapSimt::kPartitionsK
static int const kPartitionsK
Definition: default_thread_map_simt.h:57

cutlass::gemm::GemmShape
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::epilogue::threadblock::DefaultThreadMapSimt::Detail
Definition: default_thread_map_simt.h:65

cutlass::epilogue::threadblock::DefaultThreadMapSimt::WarpShape
WarpShape_ WarpShape
Definition: default_thread_map_simt.h:55

cutlass::epilogue::threadblock::DefaultThreadMapSimt::Detail::kGroupCount
static int const kGroupCount
Computes the number of thread-level matrix multiplies needed to span a warp.
Definition: default_thread_map_simt.h:81