45 namespace threadblock {
59 static int const kRow = Row;
64 static int const kCount = kColumn * kRow * kGroup * kCluster *
kTile;
82 static int const kThreads = ThreadMap::kThreads;
85 static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
107 Index cluster = coord.
strided() / (Shape::kGroup * Shape::kRow);
108 Index cluster_residual = coord.
strided() % (Shape::kGroup * Shape::kRow);
110 Index group = cluster_residual / (Shape::kRow);
111 Index row = cluster_residual % (Shape::kRow);
114 row + group * Shape::kRow * Count::kRow
115 + cluster * Shape::kGroup * Count::kGroup * Shape::kRow * Count::kRow,
129 int ElementsPerAccess,
139 int ElementsPerAccess,
142 struct RowArrangement<Shape, WarpsRemaining, ElementsPerAccess, ElementSize, false> {
143 static int const kWarpSize = 32;
144 static int const kElementsPerAccess = ElementsPerAccess;
145 static int const kElementSize = ElementSize;
147 static int const kIterationsRow = 1;
148 static int const kDeltaRow = 1;
149 static int const kIterationsColumn = Shape::kColumn / kElementsPerAccess / kWarpSize;
150 static int const kDeltaColumn = kWarpSize * kElementsPerAccess;
152 static int const kAccessWidth = kWarpSize;
153 static int const kAccessRows = 1;
154 static int const kWarpPartitionsRow = 1;
155 static int const kWarpPartitionsColumn = WarpsRemaining;
162 int ElementsPerAccess,
165 struct RowArrangement<Shape, WarpsRemaining, ElementsPerAccess, ElementSize, true> {
167 static int const kMemoryAccessSize = 128;
168 static int const kWarpSize = 32;
170 static int const kElementsPerAccess = ElementsPerAccess;
171 static int const kElementSize = ElementSize;
174 static int const kShapeRow = Shape::kRow / WarpsRemaining;
175 static int const kShapeWidth = Shape::kColumn / kElementsPerAccess;
177 static int const kTargetMemoryAccessWidth =
178 kMemoryAccessSize / (kElementsPerAccess * kElementSize / 8);
180 static int const kTargetAccessRows = kWarpSize / kTargetMemoryAccessWidth;
183 static int const kAccessWidth =
184 (Detail::kTargetAccessRows > Detail::kShapeRow ?
185 kWarpSize / Detail::kShapeRow
188 const_min(kWarpSize, kMemoryAccessSize / (kElementsPerAccess * kElementSize / 8))
191 static int const kAccessRows =
192 (Detail::kTargetAccessRows > Detail::kShapeRow ?
194 :
const_min(Shape::kRow, kWarpSize / kAccessWidth));
196 static int const kIterationsRow = Detail::kShapeRow / kAccessRows;
197 static int const kDeltaRow = kAccessRows;
199 static int const kIterationsColumn = Detail::kShapeWidth / kAccessWidth;
200 static int const kDeltaColumn = kAccessWidth * kElementsPerAccess;
202 static_assert( kAccessWidth * kElementsPerAccess <= Shape::kColumn,
"Accessing too many elements per access");
203 static_assert( kIterationsColumn > 0,
"Iteration Count Column must be > 0" );
204 static_assert( kIterationsRow > 0,
"Iteration Count Row must be > 0" );
206 static int const kWarpPartitionsRow = 1;
207 static int const kWarpPartitionsColumn = 1;
225 int ElementsPerAccess,
233 static int const kWarpSize = 32;
234 static int const kThreads = Threads;
235 static int const kWarpCount = kThreads / kWarpSize;
237 static int const kElementsPerAccess = ElementsPerAccess;
238 static int const kElementSize = ElementSize;
247 static int const kIterationsCluster =
248 ((Shape::kCluster > kWarpCount) ?
249 Shape::kCluster / kWarpCount
252 static int const kDeltaCluster =
253 ((Shape::kCluster > kWarpCount) ?
254 Shape::kRow * Count::kRow * Shape::kGroup * Count::kGroup * Shape::kCluster / kIterationsCluster
257 static int const kCompactedDeltaCluster =
258 ((Shape::kCluster > kWarpCount) ?
259 Shape::kRow * Shape::kGroup * Shape::kCluster / kIterationsCluster
262 static int const kWarpPartitionsCluster =
263 ((Shape::kCluster > kWarpCount) ?
265 : kWarpCount / Shape::kCluster);
267 static int const kWarpsRemainingForGroups =
268 ((Shape::kCluster > kWarpCount) ? 1 : kWarpCount / Shape::kCluster);
271 static int const kIterationsGroup =
272 ((Shape::kGroup > kWarpsRemainingForGroups) ?
273 Shape::kGroup / kWarpsRemainingForGroups
276 static int const kDeltaGroup =
277 ((Shape::kGroup > kWarpsRemainingForGroups) ?
278 Shape::kRow * Count::kRow * Shape::kGroup / kIterationsGroup
281 static int const kCompactedDeltaGroup =
282 ((Shape::kGroup > kWarpsRemainingForGroups) ?
283 Shape::kRow * Shape::kGroup / kIterationsGroup
286 static int const kWarpPartitionsGroup =
287 ((Shape::kGroup > kWarpsRemainingForGroups) ?
289 : kWarpsRemainingForGroups / Shape::kGroup);
291 static int const kWarpsRemainingForRows =
292 ((Shape::kGroup > kWarpsRemainingForGroups) ?
294 : kWarpsRemainingForGroups / Shape::kGroup);
299 kWarpsRemainingForRows,
302 (Shape::kRow > kWarpsRemainingForRows)
307 RowArrangement::kWarpPartitionsColumn,
308 RowArrangement::kWarpPartitionsRow,
309 kWarpPartitionsGroup,
310 kWarpPartitionsCluster,
313 static int const kAccessWidth = RowArrangement::kAccessWidth;
314 static int const kAccessRows = RowArrangement::kAccessRows;
322 Detail::RowArrangement::kIterationsColumn,
323 Detail::RowArrangement::kIterationsRow,
324 Detail::kIterationsGroup,
325 Detail::kIterationsCluster,
329 Detail::RowArrangement::kDeltaColumn,
330 Detail::RowArrangement::kDeltaRow,
332 Detail::kDeltaCluster,
339 int warp_idx = thread_idx / kWarpSize;
340 int lane_idx = thread_idx % kWarpSize;
343 int cluster_idx = warp_idx / Detail::WarpPartitions::kCluster;
344 int residual_cluster = warp_idx % Detail::WarpPartitions::kCluster;
346 int group_idx = residual_cluster / Detail::WarpPartitions::kGroup;
347 int residual_group = residual_cluster % Detail::WarpPartitions::kGroup;
349 int row_idx = residual_group / Detail::WarpPartitions::kRow;
350 int col_idx = residual_group % Detail::WarpPartitions::kRow;
353 int lane_row_offset = lane_idx / Detail::kAccessWidth;
354 int lane_col_offset = lane_idx % Detail::kAccessWidth;
357 int cluster_offset = cluster_idx * Shape::kRow * Count::kRow * Shape::kGroup * Count::kGroup;
358 int group_offset = group_idx * Shape::kRow * Count::kRow;
359 int row_offset = row_idx * Iterations::kRow * Detail::kAccessRows;
360 int column_offset = col_idx * Iterations::kColumn * Detail::kAccessWidth * kElementsPerAccess;
363 cluster_offset + group_offset + row_offset + lane_row_offset,
364 (column_offset + lane_col_offset) * kElementsPerAccess
375 Detail::RowArrangement::kIterationsColumn,
376 Detail::RowArrangement::kIterationsRow,
377 Detail::kIterationsGroup,
378 Detail::kIterationsCluster,
382 Detail::RowArrangement::kDeltaColumn,
383 Detail::RowArrangement::kDeltaRow,
384 Detail::kCompactedDeltaGroup,
385 Detail::kCompactedDeltaCluster,
389 static int const kElementsPerAccess = ElementsPerAccess;
392 static int const kThreads = Threads;
398 int warp_idx = thread_idx / kWarpSize;
399 int lane_idx = thread_idx % kWarpSize;
402 int cluster_idx = warp_idx / Detail::WarpPartitions::kCluster;
403 int residual_cluster = warp_idx % Detail::WarpPartitions::kCluster;
405 int group_idx = residual_cluster / Detail::WarpPartitions::kGroup;
406 int residual_group = residual_cluster % Detail::WarpPartitions::kGroup;
408 int row_idx = residual_group / Detail::WarpPartitions::kRow;
409 int col_idx = residual_group % Detail::WarpPartitions::kRow;
412 int lane_row_offset = lane_idx / Detail::kAccessWidth;
413 int lane_col_offset = lane_idx % Detail::kAccessWidth;
416 int cluster_offset = cluster_idx * Shape::kRow * Shape::kGroup;
417 int group_offset = group_idx * Shape::kRow;
418 int row_offset = row_idx * Iterations::kRow * Detail::kAccessRows;
419 int column_offset = col_idx * Iterations::kColumn * Detail::kAccessWidth * kElementsPerAccess;
422 cluster_offset + group_offset + row_offset + lane_row_offset,
423 (column_offset + lane_col_offset) * kElementsPerAccess
440 template <
typename WarpCount_,
typename MmaCount_,
int Threads,
441 int ElementsPerAccess,
int ElementSize>
446 static int const kWarpSize = 32;
447 static int const kThreads = Threads;
448 static int const kWarpCount = kThreads / kWarpSize;
450 static int const kElementsPerAccess = ElementsPerAccess;
451 static int const kElementSize = ElementSize;
470 int warp_idx = thread_idx / kWarpSize;
471 int lane_idx = thread_idx % kWarpSize;
475 Delta::kContiguous * Iterations::kContiguous,
476 Delta::kStrided * Iterations::kStrided};
479 warp_idx / WarpCount::kContiguous};
483 lane_idx * kElementsPerAccess, 0};
486 warp_footprint * warp_offset + thread_offset_in_warp;
488 return thread_offset_in_threadblock_tile;
int Index
Integer-valued index.
Definition: pitch_linear.h:56
ThreadMap_ ThreadMap
Conventional thread map (concept: ThreadMap)
Definition: output_tile_thread_map.h:79
Definition: output_tile_thread_map.h:228
Definition: aligned_buffer.h:35
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52
Defines a structure containing strides, bounds, and a pointer to tensor data.
Count_ Count
Definition: output_tile_thread_map.h:231
static int const kGroup
Definition: output_tile_thread_map.h:60
Tuple defining point in output tile.
Definition: output_tile_thread_map.h:57
WarpCount_ WarpCount
Definition: output_tile_thread_map.h:443
Iterations_ Iterations
Iterations performed by each thread.
Definition: output_tile_thread_map.h:91
static int const kColumn
Definition: output_tile_thread_map.h:58
RowArrangement determines how one or more warps cover a region of consecutive rows.
Definition: output_tile_thread_map.h:133
Definition: output_tile_thread_map.h:442
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe to use in a union.
Compacted thread map in which the 4D region is contiguous.
Definition: output_tile_thread_map.h:369
Count_ Count
Number of iterator iterations.
Definition: output_tile_thread_map.h:97
static CUTLASS_HOST_DEVICE MatrixCoord initial_offset(int thread_idx)
Function to compute each thread's initial offset.
Definition: output_tile_thread_map.h:396
Defines a Shape template for matrix tiles.
Shape_ Shape
Definition: output_tile_thread_map.h:230
static CUTLASS_HOST_DEVICE layout::PitchLinearCoord initial_offset(int thread_idx)
Initial offset function.
Definition: output_tile_thread_map.h:469
detail::RowArrangement< Shape, kWarpsRemainingForRows, kElementsPerAccess, kElementSize,(Shape::kRow > kWarpsRemainingForRows) > RowArrangement
Definition: output_tile_thread_map.h:303
MmaCount Iterations
Definition: output_tile_thread_map.h:463
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
Top-level include for all CUTLASS numeric types.
CUTLASS_HOST_DEVICE Index const & contiguous() const
Returns the contiguous dimension.
Definition: pitch_linear.h:89
Shape_ Shape
Definition: output_tile_thread_map.h:372
Delta_ Delta
Delta between accesses.
Definition: output_tile_thread_map.h:94
static CUTLASS_HOST_DEVICE MatrixCoord initial_offset(int thread_idx)
Initial offset function.
Definition: output_tile_thread_map.h:101
Definition: output_tile_thread_map.h:457
static int const kRow
Definition: output_tile_thread_map.h:59
Defines layout functions used by TensorRef and derived classes.
Definition: output_tile_thread_map.h:76
Shape_ Shape
Shape of the tile.
Definition: output_tile_thread_map.h:88
static int const kTile
Definition: output_tile_thread_map.h:62
static int const kCount
Definition: output_tile_thread_map.h:64
MmaCount_ MmaCount
Definition: output_tile_thread_map.h:444
static CUTLASS_HOST_DEVICE MatrixCoord initial_offset(int thread_idx)
Initial offset function.
Definition: output_tile_thread_map.h:337
CUTLASS_HOST_DEVICE constexpr int const_min(int a, int b)
Definition: fast_math.h:219
Basic include for CUTLASS.
Definition: matrix_coord.h:39
Definition: output_tile_thread_map.h:244
static int const kCluster
Definition: output_tile_thread_map.h:61
CUTLASS_HOST_DEVICE Index const & strided() const
Returns the strided dimension of the coordinate.
Definition: pitch_linear.h:97