cutlass/gemm_2threadblock_2threadblock__swizzle_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"

 #include "cutlass/gemm/gemm.h"


 namespace cutlass {
 namespace gemm {
 namespace threadblock {


 /// Helper to rematerialize the x component of the thread index. Intended to reduce register
 /// liveness: callers re-query the value through this helper where they need it.
 ///
 /// \return threadIdx.x converted to int.
 CUTLASS_DEVICE
 int RematerializeThreadIdxX() {
   // Spell out the conversion to the declared return type.
   return static_cast<int>(threadIdx.x);
 }

 /// Helper to rematerialize the y component of the thread index. Intended to reduce register
 /// liveness: callers re-query the value through this helper where they need it.
 ///
 /// \return threadIdx.y converted to int.
 CUTLASS_DEVICE
 int RematerializeThreadIdxY() {
   // Spell out the conversion to the declared return type.
   return static_cast<int>(threadIdx.y);
 }

 /// Helper to rematerialize the z component of the thread index. Intended to reduce register
 /// liveness: callers re-query the value through this helper where they need it.
 ///
 /// \return threadIdx.z converted to int.
 CUTLASS_DEVICE
 int RematerializeThreadIdxZ() {
   // Spell out the conversion to the declared return type.
   return static_cast<int>(threadIdx.z);
 }

 /// Helper to rematerialize the x component of the block index. Intended to reduce register
 /// liveness: callers re-query the value through this helper where they need it.
 ///
 /// \return blockIdx.x converted to int.
 CUTLASS_DEVICE
 int RematerializeBlockIdxX() {
   // Spell out the conversion to the declared return type.
   return static_cast<int>(blockIdx.x);
 }

 /// Helper to rematerialize the y component of the block index. Intended to reduce register
 /// liveness: callers re-query the value through this helper where they need it.
 ///
 /// \return blockIdx.y converted to int.
 CUTLASS_DEVICE
 int RematerializeBlockIdxY() {
   // Spell out the conversion to the declared return type.
   return static_cast<int>(blockIdx.y);
 }

 /// Helper to rematerialize the z component of the block index. Intended to reduce register
 /// liveness: callers re-query the value through this helper where they need it.
 ///
 /// \return blockIdx.z converted to int.
 CUTLASS_DEVICE
 int RematerializeBlockIdxZ() {
   // Spell out the conversion to the declared return type.
   return static_cast<int>(blockIdx.z);
 }

 /// Helper to rematerialize the x component of the block dimensions. Intended to reduce register
 /// liveness: callers re-query the value through this helper where they need it.
 ///
 /// \return blockDim.x converted to int.
 CUTLASS_DEVICE
 int RematerializeBlockDimX() {
   // Spell out the conversion to the declared return type.
   return static_cast<int>(blockDim.x);
 }

 /// Helper to rematerialize the y component of the block dimensions. Intended to reduce register
 /// liveness: callers re-query the value through this helper where they need it.
 ///
 /// \return blockDim.y converted to int.
 CUTLASS_DEVICE
 int RematerializeBlockDimY() {
   // Spell out the conversion to the declared return type.
   return static_cast<int>(blockDim.y);
 }

 /// Helper to rematerialize the z component of the block dimensions. Intended to reduce register
 /// liveness: callers re-query the value through this helper where they need it.
 ///
 /// \return blockDim.z converted to int.
 CUTLASS_DEVICE
 int RematerializeBlockDimZ() {
   // Spell out the conversion to the declared return type.
   return static_cast<int>(blockDim.z);
 }


 /// Threadblock swizzling function for GEMMs.
 ///
 /// Maps the CUDA grid onto the logical tile space using a swizzle factor `kTile`:
 /// grid x covers `tiles_m * kTile` blocks and grid y covers `ceil(tiles_n / kTile)` blocks.
 /// With `kTile == 1` grid x indexes tiles along M and grid y indexes tiles along N directly.
 struct GemmIdentityThreadblockSwizzle {

   /// Swizzle factor shared by get_grid_shape() and get_tile_offset().
   ///
   /// Declared static: it is a compile-time constant, so it should neither occupy storage in
   /// every instance nor (as a non-static const member would) delete the implicit copy-assignment
   /// operator. Both `GemmIdentityThreadblockSwizzle::kTile` and `instance.kTile` remain valid.
   static int const kTile = 1;

   CUTLASS_HOST_DEVICE
   GemmIdentityThreadblockSwizzle() { }

   /// Returns the shape of the problem in units of logical tiles.
   ///
   /// \param problem_size    extent of the problem; only m() and n() are read
   /// \param tile_size       extent of one threadblock tile; only m() and n() are read
   /// \param split_k_slices  forwarded unchanged as the k extent of the result
   CUTLASS_HOST_DEVICE
   GemmCoord get_tiled_shape(
     GemmCoord problem_size,
     GemmCoord tile_size,
     int split_k_slices) const {

     // Ceiling division so that a partial tile at the edge still counts as one tile.
     return GemmCoord(
       (problem_size.m() + tile_size.m() - 1) / tile_size.m(),
       (problem_size.n() + tile_size.n() - 1) / tile_size.n(),
       split_k_slices);
   }

   /// Computes CUDA grid dimensions given a size in units of logical tiles.
   ///
   /// \param tiled_shape  tile counts, e.g. as produced by get_tiled_shape()
   CUTLASS_HOST_DEVICE
   dim3 get_grid_shape(GemmCoord tiled_shape) const {
     // kTile consecutive x-blocks are folded into one row of M tiles; y shrinks by the same
     // factor (rounded up).
     return dim3(tiled_shape.m() * kTile, (tiled_shape.n() + kTile - 1) / kTile, tiled_shape.k());
   }

   /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
   CUTLASS_DEVICE
   GemmCoord get_tile_offset() const {

     int block_idx_x = RematerializeBlockIdxX();
     int block_idx_y = RematerializeBlockIdxY();

     // Inverse of the mapping applied in get_grid_shape().
     return GemmCoord{
       (block_idx_x / kTile),
       (block_idx_y * kTile) + (block_idx_x % kTile),
       RematerializeBlockIdxZ()
     };
   }
 };


 /// Threadblock swizzling function for GEMMs.
 ///
 /// Grid x indexes tiles along N and grid y indexes tiles along M.
 struct GemmHorizontalThreadblockSwizzle {

   CUTLASS_HOST_DEVICE
   GemmHorizontalThreadblockSwizzle() { }

   /// Returns the shape of the problem in units of logical tiles.
   ///
   /// \param problem_size    extent of the problem; only m() and n() are read
   /// \param tile_size       extent of one threadblock tile; only m() and n() are read
   /// \param split_k_slices  forwarded unchanged as the k extent of the result
   CUTLASS_HOST_DEVICE
   GemmCoord get_tiled_shape(
     GemmCoord problem_size,
     GemmCoord tile_size,
     int split_k_slices) const {

     // Ceiling division so that a partial tile at the edge still counts as one tile.
     auto const tiles_m = (problem_size.m() + tile_size.m() - 1) / tile_size.m();
     auto const tiles_n = (problem_size.n() + tile_size.n() - 1) / tile_size.n();

     return GemmCoord(tiles_m, tiles_n, split_k_slices);
   }

   /// Computes CUDA grid dimensions given a size in units of logical tiles.
   CUTLASS_HOST_DEVICE
   dim3 get_grid_shape(GemmCoord tiled_shape) const {
     // x <- N tiles, y <- M tiles, z <- k extent.
     dim3 grid(tiled_shape.n(), tiled_shape.m(), tiled_shape.k());
     return grid;
   }

   /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
   CUTLASS_DEVICE
   GemmCoord get_tile_offset() const {
     // Mirror of get_grid_shape(): block y is the M tile, block x is the N tile.
     int const tile_m = RematerializeBlockIdxY();
     int const tile_n = RematerializeBlockIdxX();
     int const tile_k = RematerializeBlockIdxZ();

     return GemmCoord{tile_m, tile_n, tile_k};
   }
 };


 /// Threadblock swizzling function for batched GEMMs.
 ///
 /// Grid x indexes tiles along M, grid y indexes tiles along N and grid z indexes batches.
 struct GemmBatchedIdentityThreadblockSwizzle {

   /// Returns the shape of the problem in units of logical tiles.
   ///
   /// The k extent of the result is the number of threadblocks along the batch dimension; it is
   /// what get_grid_shape() places in grid z.
   ///
   /// \param problem_size  extent of the problem; only m() and n() are read
   /// \param batch_count   number of batches
   /// \param tile_size     extent of one threadblock tile; only m() and n() are read
   CUTLASS_HOST_DEVICE
   GemmCoord get_tiled_shape(
     GemmCoord problem_size,
     int batch_count,
     GemmCoord tile_size) const {

     // Largest extent used for the batch (grid z) dimension; CUDA limits gridDim.z to 65535.
     int const kMaxGridZ = (1 << 16) - 1;

     // Clamp instead of wrapping: `batch_count % (1 << 16)` evaluates to 0 for every multiple of
     // 65536, which get_grid_shape() would turn into a zero-sized (unlaunchable) grid.
     // NOTE(review): assumes the kernel strides over batches by gridDim.z whenever batch_count
     // exceeds the grid extent -- confirm against the batched GEMM kernel.
     int const grid_batches = (batch_count < kMaxGridZ) ? batch_count : kMaxGridZ;

     // Ceiling division so that a partial tile at the edge still counts as one tile.
     return GemmCoord(
       (problem_size.m() + tile_size.m() - 1) / tile_size.m(),
       (problem_size.n() + tile_size.n() - 1) / tile_size.n(),
       grid_batches);
   }

   /// Computes CUDA grid dimensions given a size in units of logical tiles.
   CUTLASS_HOST_DEVICE
   dim3 get_grid_shape(GemmCoord tiled_shape) const {
     return dim3(tiled_shape.m(), tiled_shape.n(), tiled_shape.k());
   }

   /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
   CUTLASS_DEVICE
   GemmCoord get_tile_offset() const {
     // Grid z carries the batch index (see get_batch_idx()), so the k offset is always 0.
     return GemmCoord{
       RematerializeBlockIdxX(),
       RematerializeBlockIdxY(),
       0
     };
   }

   /// Gets the batch index.
   CUTLASS_DEVICE
   int get_batch_idx() const {
     return RematerializeBlockIdxZ();
   }
 };


 /// Threadblock swizzling function for split-K GEMMs.
 ///
 /// Grid x indexes tiles along M, grid y indexes tiles along N and grid z indexes partitions.
 struct GemmSplitKIdentityThreadblockSwizzle {

   /// Returns the shape of the problem in units of logical tiles.
   ///
   /// \param problem_size  extent of the problem; only m() and n() are read
   /// \param tile_size     extent of one threadblock tile; only m() and n() are read
   /// \param partitions    forwarded unchanged as the k extent of the result
   CUTLASS_HOST_DEVICE
   GemmCoord get_tiled_shape(
     GemmCoord problem_size,
     GemmCoord tile_size,
     int partitions) const {

     // Ceiling division so that a partial tile at the edge still counts as one tile.
     auto const tiles_m = (problem_size.m() + tile_size.m() - 1) / tile_size.m();
     auto const tiles_n = (problem_size.n() + tile_size.n() - 1) / tile_size.n();

     return GemmCoord(tiles_m, tiles_n, partitions);
   }

   /// Computes CUDA grid dimensions given a size in units of logical tiles.
   CUTLASS_HOST_DEVICE
   dim3 get_grid_shape(GemmCoord tiled_shape) const {
     // x <- M tiles, y <- N tiles, z <- k extent.
     dim3 grid(tiled_shape.m(), tiled_shape.n(), tiled_shape.k());
     return grid;
   }

   /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
   CUTLASS_DEVICE
   GemmCoord get_tile_offset() const {
     // Mirror of get_grid_shape(): block x is the M tile, block y is the N tile.
     int const tile_m = RematerializeBlockIdxX();
     int const tile_n = RematerializeBlockIdxY();
     int const tile_k = RematerializeBlockIdxZ();

     return GemmCoord{tile_m, tile_n, tile_k};
   }
 };


 /// Threadblock swizzling function for split-K GEMMs.
 ///
 /// Grid x indexes tiles along N, grid y indexes tiles along M and grid z indexes partitions.
 struct GemmSplitKHorizontalThreadblockSwizzle {

   /// Returns the shape of the problem in units of logical tiles.
   ///
   /// \param problem_size  extent of the problem; only m() and n() are read
   /// \param tile_size     extent of one threadblock tile; only m() and n() are read
   /// \param partitions    forwarded unchanged as the k extent of the result
   CUTLASS_HOST_DEVICE
   GemmCoord get_tiled_shape(
     GemmCoord problem_size,
     GemmCoord tile_size,
     int partitions) const {

     // Ceiling division so that a partial tile at the edge still counts as one tile.
     auto const tiles_m = (problem_size.m() + tile_size.m() - 1) / tile_size.m();
     auto const tiles_n = (problem_size.n() + tile_size.n() - 1) / tile_size.n();

     return GemmCoord(tiles_m, tiles_n, partitions);
   }

   /// Computes CUDA grid dimensions given a size in units of logical tiles.
   CUTLASS_HOST_DEVICE
   dim3 get_grid_shape(GemmCoord tiled_shape) const {
     // x <- N tiles, y <- M tiles, z <- k extent.
     dim3 grid(tiled_shape.n(), tiled_shape.m(), tiled_shape.k());
     return grid;
   }

   /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
   CUTLASS_DEVICE
   GemmCoord get_tile_offset() const {
     // Mirror of get_grid_shape(): block y is the M tile, block x is the N tile.
     int const tile_m = RematerializeBlockIdxY();
     int const tile_n = RematerializeBlockIdxX();
     int const tile_k = RematerializeBlockIdxZ();

     return GemmCoord{tile_m, tile_n, tile_k};
   }
 };


 /// Threadblock swizzling function for batched GEMVs.
 ///
 /// Grid x indexes tiles along N, grid y indexes batch tiles and grid z indexes tiles along K.
 struct GemvBatchedStridedThreadblockDefaultSwizzle {

   /// Returns the shape of the problem in units of logical tiles.
   ///
   /// \param problem_size  extent of the problem; n(), k() and batch() are read
   /// \param tile_size     extent of one threadblock tile; n(), k() and batch() are read
   CUTLASS_HOST_DEVICE
   BatchedGemmCoord get_tiled_shape(
     BatchedGemmCoord problem_size,
     BatchedGemmCoord tile_size) const {

     // Ceiling division so that a partial tile at the edge still counts as one tile.
     auto const tiles_n = (problem_size.n() + tile_size.n() - 1) / tile_size.n();
     auto const tiles_k = (problem_size.k() + tile_size.k() - 1) / tile_size.k();
     auto const tiles_batch =
       (problem_size.batch() + tile_size.batch() - 1) / tile_size.batch();

     // M is always 1
     return BatchedGemmCoord(1, tiles_n, tiles_k, tiles_batch);
   }

   /// Computes CUDA grid dimensions given a size in units of logical tiles.
   CUTLASS_HOST_DEVICE
   dim3 get_grid_shape(BatchedGemmCoord tiled_shape) const {
     // x <- N tiles, y <- batch tiles, z <- K tiles.
     dim3 grid(tiled_shape.n(), tiled_shape.batch(), tiled_shape.k());
     return grid;
   }

   /// Obtains the threadblock offset (in units of threadblock-scoped tiles)
   CUTLASS_DEVICE
   BatchedGemmCoord get_tile_offset() const {
     // Mirror of get_grid_shape(): block x is the N tile, block z the K tile and block y the
     // batch tile.
     int const tile_n = RematerializeBlockIdxX();
     int const tile_k = RematerializeBlockIdxZ();
     int const tile_batch = RematerializeBlockIdxY();

     // M is always 1, so its tile offset is 0.
     return BatchedGemmCoord{0, tile_n, tile_k, tile_batch};
   }

   /// Gets the batch tile index.
   CUTLASS_DEVICE
   int get_batch_tile_idx() const {
     int const batch_tile = RematerializeBlockIdxY();
     return batch_tile;
   }

   /// Gets the absolute batch index.
   CUTLASS_DEVICE
   int get_batch_idx() const {
     // Each thread row (threadIdx.y) of a block handles one batch within the block's batch tile.
     int const batches_per_block = RematerializeBlockDimY();
     int const batch_tile = RematerializeBlockIdxY();
     int const batch_in_tile = RematerializeThreadIdxY();

     return batches_per_block * batch_tile + batch_in_tile;
   }
 };


 } // namespace threadblock
 } // namespace gemm
 } // namespace cutlass

cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle::kTile
int const kTile
Definition: gemm/threadblock/threadblock_swizzle.h:106

cutlass
Definition: aligned_buffer.h:35

cutlass::gemm::threadblock::RematerializeThreadIdxY
CUTLASS_DEVICE int RematerializeThreadIdxY()
Helper to rematerialize thread Idx. Reduces register liveness.
Definition: gemm/threadblock/threadblock_swizzle.h:52

cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle::get_grid_shape
CUTLASS_HOST_DEVICE dim3 get_grid_shape(GemmCoord tiled_shape) const
Computes CUDA grid dimensions given a size in units of logical tiles.
Definition: gemm/threadblock/threadblock_swizzle.h:200

cutlass::gemm::GemmCoord
Definition: include/cutlass/gemm/gemm.h:94

cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle::get_tile_offset
CUTLASS_DEVICE GemmCoord get_tile_offset() const
Obtains the threadblock offset (in units of threadblock-scoped tiles)
Definition: gemm/threadblock/threadblock_swizzle.h:206

cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle::get_grid_shape
CUTLASS_HOST_DEVICE dim3 get_grid_shape(GemmCoord tiled_shape) const
Computes CUDA grid dimensions given a size in units of logical tiles.
Definition: gemm/threadblock/threadblock_swizzle.h:241

cutlass::gemm::threadblock::GemmSplitKHorizontalThreadblockSwizzle::get_tiled_shape
CUTLASS_HOST_DEVICE GemmCoord get_tiled_shape(GemmCoord problem_size, GemmCoord tile_size, int partitions) const
Returns the shape of the problem in units of logical tiles.
Definition: gemm/threadblock/threadblock_swizzle.h:264

gemm.h
Defines common types used for all GEMM-like operators.

cutlass::gemm::GemmCoord::n
CUTLASS_HOST_DEVICE Index const & n() const
Returns the GEMM N coordinate.
Definition: include/cutlass/gemm/gemm.h:137

cutlass::gemm::threadblock::RematerializeBlockDimX
CUTLASS_DEVICE int RematerializeBlockDimX()
Helper to rematerialize block Dim. Reduces register liveness.
Definition: gemm/threadblock/threadblock_swizzle.h:82

cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle::get_grid_shape
CUTLASS_HOST_DEVICE dim3 get_grid_shape(GemmCoord tiled_shape) const
Computes CUDA grid dimensions given a size in units of logical tiles.
Definition: gemm/threadblock/threadblock_swizzle.h:123

cutlass::gemm::GemmCoord::k
CUTLASS_HOST_DEVICE Index const & k() const
Returns the GEMM K coordinate.
Definition: include/cutlass/gemm/gemm.h:145

cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle::get_tiled_shape
CUTLASS_HOST_DEVICE GemmCoord get_tiled_shape(GemmCoord problem_size, GemmCoord tile_size, int split_k_slices) const
Returns the shape of the problem in units of logical tiles.
Definition: gemm/threadblock/threadblock_swizzle.h:152

cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle::get_grid_shape
CUTLASS_HOST_DEVICE dim3 get_grid_shape(GemmCoord tiled_shape) const
Computes CUDA grid dimensions given a size in units of logical tiles.
Definition: gemm/threadblock/threadblock_swizzle.h:165

cutlass::gemm::threadblock::GemvBatchedStridedThreadblockDefaultSwizzle::get_grid_shape
CUTLASS_HOST_DEVICE dim3 get_grid_shape(BatchedGemmCoord tiled_shape) const
Computes CUDA grid dimensions given a size in units of logical tiles.
Definition: gemm/threadblock/threadblock_swizzle.h:313

cutlass::gemm::threadblock::RematerializeThreadIdxX
CUTLASS_DEVICE int RematerializeThreadIdxX()
Helper to rematerialize thread Idx. Reduces register liveness.
Definition: gemm/threadblock/threadblock_swizzle.h:46

cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle
Threadblock swizzling function for GEMMs.
Definition: gemm/threadblock/threadblock_swizzle.h:145

cutlass::gemm::BatchedGemmCoord
Definition: include/cutlass/gemm/gemm.h:260

cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle::GemmHorizontalThreadblockSwizzle
CUTLASS_HOST_DEVICE GemmHorizontalThreadblockSwizzle()
Definition: gemm/threadblock/threadblock_swizzle.h:148

cutlass::gemm::threadblock::GemvBatchedStridedThreadblockDefaultSwizzle::get_batch_tile_idx
CUTLASS_DEVICE int get_batch_tile_idx() const
Gets the batch tile index.
Definition: gemm/threadblock/threadblock_swizzle.h:330

cutlass::gemm::threadblock::GemmSplitKHorizontalThreadblockSwizzle::get_grid_shape
CUTLASS_HOST_DEVICE dim3 get_grid_shape(GemmCoord tiled_shape) const
Computes CUDA grid dimensions given a size in units of logical tiles.
Definition: gemm/threadblock/threadblock_swizzle.h:277

cutlass::gemm::threadblock::RematerializeThreadIdxZ
CUTLASS_DEVICE int RematerializeThreadIdxZ()
Helper to rematerialize thread Idx. Reduces register liveness.
Definition: gemm/threadblock/threadblock_swizzle.h:58

cutlass::gemm::BatchedGemmCoord::batch
CUTLASS_HOST_DEVICE Index const & batch() const
Returns the GEMM batch coordinate.
Definition: include/cutlass/gemm/gemm.h:322

cutlass::gemm::threadblock::RematerializeBlockDimY
CUTLASS_DEVICE int RematerializeBlockDimY()
Helper to rematerialize block Dim. Reduces register liveness.
Definition: gemm/threadblock/threadblock_swizzle.h:88

cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle::GemmIdentityThreadblockSwizzle
CUTLASS_HOST_DEVICE GemmIdentityThreadblockSwizzle()
Definition: gemm/threadblock/threadblock_swizzle.h:104

cutlass::gemm::threadblock::GemvBatchedStridedThreadblockDefaultSwizzle::get_batch_idx
CUTLASS_DEVICE int get_batch_idx() const
Gets the absolute batch index.
Definition: gemm/threadblock/threadblock_swizzle.h:336

cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle::get_tiled_shape
CUTLASS_HOST_DEVICE GemmCoord get_tiled_shape(GemmCoord problem_size, GemmCoord tile_size, int split_k_slices) const
Returns the shape of the problem in units of logical tiles.
Definition: gemm/threadblock/threadblock_swizzle.h:110

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

cutlass::gemm::BatchedGemmCoord::k
CUTLASS_HOST_DEVICE Index const & k() const
Returns the GEMM K coordinate.
Definition: include/cutlass/gemm/gemm.h:314

cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle::get_tile_offset
CUTLASS_DEVICE GemmCoord get_tile_offset() const
Obtains the threadblock offset (in units of threadblock-scoped tiles)
Definition: gemm/threadblock/threadblock_swizzle.h:129

cutlass::gemm::threadblock::GemmSplitKHorizontalThreadblockSwizzle
Threadblock swizzling function for split-K GEMMs.
Definition: gemm/threadblock/threadblock_swizzle.h:260

cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle::get_tiled_shape
CUTLASS_HOST_DEVICE GemmCoord get_tiled_shape(GemmCoord problem_size, int batch_count, GemmCoord tile_size) const
Returns the shape of the problem in units of logical tiles.
Definition: gemm/threadblock/threadblock_swizzle.h:187

cutlass::gemm::threadblock::RematerializeBlockIdxY
CUTLASS_DEVICE int RematerializeBlockIdxY()
Helper to rematerialize block Idx. Reduces register liveness.
Definition: gemm/threadblock/threadblock_swizzle.h:70

cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle::get_tiled_shape
CUTLASS_HOST_DEVICE GemmCoord get_tiled_shape(GemmCoord problem_size, GemmCoord tile_size, int partitions) const
Returns the shape of the problem in units of logical tiles.
Definition: gemm/threadblock/threadblock_swizzle.h:228

cutlass::gemm::threadblock::GemvBatchedStridedThreadblockDefaultSwizzle::get_tiled_shape
CUTLASS_HOST_DEVICE BatchedGemmCoord get_tiled_shape(BatchedGemmCoord problem_size, BatchedGemmCoord tile_size) const
Returns the shape of the problem in units of logical tiles.
Definition: gemm/threadblock/threadblock_swizzle.h:300

cutlass::gemm::threadblock::RematerializeBlockDimZ
CUTLASS_DEVICE int RematerializeBlockDimZ()
Helper to rematerialize block Dim. Reduces register liveness.
Definition: gemm/threadblock/threadblock_swizzle.h:94

cutlass::gemm::threadblock::GemvBatchedStridedThreadblockDefaultSwizzle::get_tile_offset
CUTLASS_DEVICE BatchedGemmCoord get_tile_offset() const
Obtains the threadblock offset (in units of threadblock-scoped tiles)
Definition: gemm/threadblock/threadblock_swizzle.h:319

cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle
Threadblock swizzling function for GEMMs.
Definition: gemm/threadblock/threadblock_swizzle.h:101

cutlass::gemm::threadblock::GemvBatchedStridedThreadblockDefaultSwizzle
Threadblock swizzling function for batched GEMVs.
Definition: gemm/threadblock/threadblock_swizzle.h:296

cutlass::gemm::BatchedGemmCoord::n
CUTLASS_HOST_DEVICE Index const & n() const
Returns the GEMM N coordinate.
Definition: include/cutlass/gemm/gemm.h:306

cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle::get_batch_idx
CUTLASS_DEVICE int get_batch_idx() const
Gets the batch index.
Definition: gemm/threadblock/threadblock_swizzle.h:216

cutlass::gemm::GemmCoord::m
CUTLASS_HOST_DEVICE Index const & m() const
Returns the GEMM M coordinate.
Definition: include/cutlass/gemm/gemm.h:129

cutlass::gemm::threadblock::RematerializeBlockIdxZ
CUTLASS_DEVICE int RematerializeBlockIdxZ()
Helper to rematerialize block Idx. Reduces register liveness.
Definition: gemm/threadblock/threadblock_swizzle.h:76

cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle
Threadblock swizzling function for batched GEMMs.
Definition: gemm/threadblock/threadblock_swizzle.h:183

cutlass::gemm::threadblock::GemmSplitKHorizontalThreadblockSwizzle::get_tile_offset
CUTLASS_DEVICE GemmCoord get_tile_offset() const
Obtains the threadblock offset (in units of threadblock-scoped tiles)
Definition: gemm/threadblock/threadblock_swizzle.h:284

cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle::get_tile_offset
CUTLASS_DEVICE GemmCoord get_tile_offset() const
Obtains the threadblock offset (in units of threadblock-scoped tiles)
Definition: gemm/threadblock/threadblock_swizzle.h:171

cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle
Threadblock swizzling function for split-K GEMMs.
Definition: gemm/threadblock/threadblock_swizzle.h:224

cutlass.h
Basic include for CUTLASS.

cutlass::gemm::threadblock::RematerializeBlockIdxX
CUTLASS_DEVICE int RematerializeBlockIdxX()
Helper to rematerialize block Idx. Reduces register liveness.
Definition: gemm/threadblock/threadblock_swizzle.h:64

cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle::get_tile_offset
CUTLASS_DEVICE GemmCoord get_tile_offset() const
Obtains the threadblock offset (in units of threadblock-scoped tiles)
Definition: gemm/threadblock/threadblock_swizzle.h:248