CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
default_thread_map_volta_tensor_op.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
30 #pragma once
31 
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"

36 
37 namespace cutlass {
38 namespace epilogue {
39 namespace threadblock {
40 
42 
/// Defines the optimal thread map for Volta Tensor Op accumulator layouts.
/// Specialized on ElementAccumulator (half_t and float specializations below);
/// the primary template is declaration-only.
template <
  typename ThreadblockShape,     ///< threadblock-scoped GEMM tile shape
  typename WarpShape,            ///< warp-scoped GEMM tile shape
  int PartitionsK,               ///< number of K-dimension partitions (split-K slices)
  typename ElementOutput,        ///< data type of the output element
  int ElementsPerAccess,         ///< vector width of each memory access
  typename ElementAccumulator    ///< accumulator data type (selects the specialization)
>
struct DefaultThreadMapVoltaTensorOp;
55 
57 template <
58  typename ThreadblockShape_,
59  typename WarpShape_,
60  int PartitionsK,
61  typename ElementOutput_,
62  int ElementsPerAccess
63 >
65  ThreadblockShape_,
66  WarpShape_,
67  PartitionsK,
68  ElementOutput_,
69  ElementsPerAccess,
70  half_t> {
71 
72  using ThreadblockShape = ThreadblockShape_;
73  using WarpShape = WarpShape_;
74  static int const kPartitionsK = PartitionsK;
75  using ElementOutput = ElementOutput_;
76  static int const kElementsPerAccess = ElementsPerAccess;
78 
79  //
80  // Definitions
81  //
82 
83  struct Detail {
84 
85  static int const kTensorOpRows = 16;
86  static int const kWarpSize = 32;
87  static int const kInterleavedTilesM = WarpShape::kM / 32;
88 
90  !(ThreadblockShape::kM % WarpShape::kM) &&
91  !(ThreadblockShape::kM % WarpShape::kM), "Divisibility");
92 
94  using WarpCount = gemm::GemmShape<
95  ThreadblockShape::kM / WarpShape::kM,
96  ThreadblockShape::kN / WarpShape::kN,
97  kPartitionsK
98  >;
99 
101  static int const kThreads = WarpCount::kCount * kWarpSize;
102 
104  ThreadblockShape::kN, // column
105  4, // row
106  4, // group
107  WarpCount::kM, // cluster
108  1 // tile
109  >;
110 
113  1, // column
114  2, // row
115  kInterleavedTilesM, // group
116  1, // cluster
117  WarpShape::kM / kTensorOpRows // iterations
118  >;
119  };
120 
121  //
122  // ThreadMap
123  //
124 
127  typename Detail::Shape,
128  typename Detail::Count,
129  Detail::kThreads,
130  kElementsPerAccess,
132  >;
133 };
134 
136 
138 template <
139  typename ThreadblockShape_,
140  typename WarpShape_,
141  int PartitionsK,
142  typename ElementOutput_,
143  int ElementsPerAccess
144 >
146  ThreadblockShape_,
147  WarpShape_,
148  PartitionsK,
149  ElementOutput_,
150  ElementsPerAccess,
151  float> {
152 
153  using ThreadblockShape = ThreadblockShape_;
154  using WarpShape = WarpShape_;
155  static int const kPartitionsK = PartitionsK;
156  using ElementOutput = ElementOutput_;
157  static int const kElementsPerAccess = ElementsPerAccess;
158  using ElementAccumulator = float;
159 
160  //
161  // Definitions
162  //
163 
164  struct Detail {
165 
166  static int const kTensorOpRows = 16;
167  static int const kWarpSize = 32;
168  static int const kInterleavedTilesM = WarpShape::kM / 32;
169 
171  !(ThreadblockShape::kM % WarpShape::kM) &&
172  !(ThreadblockShape::kM % WarpShape::kM), "Divisibility");
173 
175  using WarpCount = gemm::GemmShape<
176  ThreadblockShape::kM / WarpShape::kM,
177  ThreadblockShape::kN / WarpShape::kN,
178  kPartitionsK
179  >;
180 
182  static int const kThreads = WarpCount::kCount * kWarpSize;
183 
185  ThreadblockShape::kN, // column
186  4, // row
187  4, // group
188  WarpCount::kM, // cluster
189  1 // tile
190  >;
191 
194  1, // column
195  2, // row
196  kInterleavedTilesM, // group
197  1, // cluster
198  WarpShape::kM / kTensorOpRows // iterations
199  >;
200  };
201 
202  //
203  // ThreadMap
204  //
205 
208  typename Detail::Shape,
209  typename Detail::Count,
210  Detail::kThreads,
211  kElementsPerAccess,
213  >;
214 };
215 
217 
218 } // namespace threadblock
219 } // namespace epilogue
220 } // namespace cutlass
221 
Definition: output_tile_thread_map.h:228
Definition: aligned_buffer.h:35
Tuple defining point in output tile.
Definition: output_tile_thread_map.h:57
Epilogue for threadblock scoped GEMMs using Tensor Ops.
IEEE half-precision floating-point type.
Definition: half.h:126
Defines common types used for all GEMM-like operators.
Defines the size of an element in bits.
Definition: numeric_types.h:42
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57
#define static_assert(__e, __m)
Definition: platform.h:153
Defines the optimal thread map for TensorOp accumulator layouts.
Definition: default_thread_map_volta_tensor_op.h:52