CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
kernel/gemm_batched.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include "cutlass/cutlass.h"

#include "cutlass/gemm/gemm.h"
#include "cutlass/matrix_coord.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace gemm {
namespace kernel {

/////////////////////////////////////////////////////////////////////////////////////////////////

template <
  typename Mma_,                 ///! Threadblock-scoped matrix multiply-accumulate
  typename Epilogue_,            ///! Epilogue
  typename ThreadblockSwizzle_   ///! Threadblock swizzling function
>
struct GemmBatched {

  using Mma = Mma_;
  using Epilogue = Epilogue_;
  using OutputOp = typename Epilogue::OutputOp;
  using ThreadblockSwizzle = ThreadblockSwizzle_;

  /// Warp count (concept: GemmShape)
  using WarpCount = typename Mma::WarpCount;
  static int const kThreadCount = 32 * WarpCount::kCount;

  /// Parameters structure
  struct Params {
    cutlass::gemm::GemmCoord problem_size;
    cutlass::gemm::GemmCoord grid_tiled_shape;
    typename Mma::IteratorA::Params params_A;
    typename Mma::IteratorA::TensorRef ref_A;
    int64_t stride_A;
    typename Mma::IteratorB::Params params_B;
    typename Mma::IteratorB::TensorRef ref_B;
    int64_t stride_B;
    typename Epilogue::OutputTileIterator::Params params_C;
    typename Epilogue::OutputTileIterator::TensorRef ref_C;
    int64_t stride_C;
    typename Epilogue::OutputTileIterator::Params params_D;
    typename Epilogue::OutputTileIterator::TensorRef ref_D;
    int64_t stride_D;
    typename OutputOp::Params epilogue;
    int batch_count;
    int gemm_k_iterations;

    //
    // Methods
    //

    CUTLASS_HOST_DEVICE
    Params() { }

    CUTLASS_HOST_DEVICE
    Params(
      cutlass::gemm::GemmCoord const & problem_size_,
      cutlass::gemm::GemmCoord const & grid_tiled_shape_,
      typename Mma::IteratorA::TensorRef ref_A_,
      int64_t stride_A_,
      typename Mma::IteratorB::TensorRef ref_B_,
      int64_t stride_B_,
      typename Epilogue::OutputTileIterator::TensorRef ref_C_,
      int64_t stride_C_,
      typename Epilogue::OutputTileIterator::TensorRef ref_D_,
      int64_t stride_D_,
      typename OutputOp::Params epilogue_,
      int batch_count_
    ):
      problem_size(problem_size_),
      grid_tiled_shape(grid_tiled_shape_),
      params_A(ref_A_.layout()),
      ref_A(ref_A_),
      stride_A(stride_A_),
      params_B(ref_B_.layout()),
      ref_B(ref_B_),
      stride_B(stride_B_),
      params_C(ref_C_.layout()),
      ref_C(ref_C_),
      stride_C(stride_C_),
      params_D(ref_D_.layout()),
      ref_D(ref_D_),
      stride_D(stride_D_),
      epilogue(epilogue_),
      batch_count(batch_count_),
      // Mainloop iteration count: ceiling division of K by the threadblock tile's K extent
      gemm_k_iterations((problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK) {

    }
  };

  /// Shared memory storage structure
  union SharedStorage {
    typename Mma::SharedStorage main_loop;
    typename Epilogue::SharedStorage epilogue;
  };

  //
  // Methods
  //

  CUTLASS_HOST_DEVICE
  GemmBatched() { }

  /// Executes one GEMM
  CUTLASS_DEVICE
  void operator()(Params const &params, SharedStorage &shared_storage) {

    // Compute threadblock location
    ThreadblockSwizzle threadblock_swizzle;

    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset();

    // Early exit if CTA is out of range
    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {

      return;
    }

    // Each CTA handles multiple batch indices to accommodate the limited range of the CUDA grid's Z dimension
    for (int batch_idx = threadblock_swizzle.get_batch_idx();
      batch_idx < params.batch_count;
      batch_idx += gridDim.z) {

      // Compute initial location in logical coordinates
      cutlass::MatrixCoord tb_offset_A{
        threadblock_tile_offset.m() * Mma::Shape::kM,
        0
      };

      cutlass::MatrixCoord tb_offset_B{
        0,
        threadblock_tile_offset.n() * Mma::Shape::kN
      };

      // Compute position within threadblock
      int thread_idx = threadIdx.x;

      // Construct iterators to A and B operands
      typename Mma::IteratorA iterator_A(
        params.params_A,
        params.ref_A.data(),
        params.problem_size.mk(),
        thread_idx,
        tb_offset_A);

      iterator_A.add_pointer_offset(params.stride_A * batch_idx);

      typename Mma::IteratorB iterator_B(
        params.params_B,
        params.ref_B.data(),
        params.problem_size.kn(),
        thread_idx,
        tb_offset_B);

      iterator_B.add_pointer_offset(params.stride_B * batch_idx);

      //
      // Main loop
      //

      // Construct thread-scoped matrix multiply
      int warp_idx = threadIdx.x / 32;
      int lane_idx = threadIdx.x % 32;

      Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);

      typename Mma::FragmentC accumulators;

      accumulators.clear();

      // Compute threadblock-scoped matrix multiply-add
      mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);

      //
      // Epilogue
      //

      OutputOp output_op(params.epilogue);

      //
      // Masked tile iterators constructed from members
      //

      threadblock_tile_offset = threadblock_swizzle.get_tile_offset();

      // Assume identity swizzle
      MatrixCoord threadblock_offset(
        threadblock_tile_offset.m() * Mma::Shape::kM,
        threadblock_tile_offset.n() * Mma::Shape::kN
      );

      // Tile iterator reading the source tile (operand C)
      typename Epilogue::OutputTileIterator iterator_C(
        params.params_C,
        params.ref_C.data(),
        params.problem_size.mn(),
        thread_idx,
        threadblock_offset
      );

      iterator_C.add_pointer_offset(params.stride_C * batch_idx);

      // Tile iterator writing to the output tile (operand D)
      typename Epilogue::OutputTileIterator iterator_D(
        params.params_D,
        params.ref_D.data(),
        params.problem_size.mn(),
        thread_idx,
        threadblock_offset
      );

      iterator_D.add_pointer_offset(params.stride_D * batch_idx);

      Epilogue epilogue(
        shared_storage.epilogue,
        thread_idx,
        warp_idx,
        lane_idx);

      // Run efficient epilogue
      epilogue(output_op, iterator_D, accumulators, iterator_C);
    }
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace kernel
} // namespace gemm
} // namespace cutlass
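For context, a minimal host-side sketch of how this kernel is usually reached in practice: the device-level wrapper cutlass::gemm::device::GemmBatched (declared in cutlass/gemm/device/gemm_batched.h) instantiates kernel::GemmBatched internally and populates the Params structure shown above. The sketch below follows the pattern of the CUTLASS batched-GEMM example; the helper name run_batched_sgemm and its argument list are illustrative, and all pointers are assumed to reference valid device memory.

#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm_batched.h"

// Illustrative helper (not part of CUTLASS): computes
//   C = alpha * A * B + beta * C
// for batch_count independent GEMMs laid out with fixed batch strides.
cutlass::Status run_batched_sgemm(
    int m, int n, int k, int batch_count,
    float alpha, float const *A, int lda, int64_t stride_A,
    float const *B, int ldb, int64_t stride_B,
    float beta, float *C, int ldc, int64_t stride_C) {

  // Column-major SGEMM; the wrapper supplies default threadblock/warp shapes.
  using GemmBatched = cutlass::gemm::device::GemmBatched<
      float, cutlass::layout::ColumnMajor,   // ElementA, LayoutA
      float, cutlass::layout::ColumnMajor,   // ElementB, LayoutB
      float, cutlass::layout::ColumnMajor>;  // ElementC, LayoutC

  GemmBatched gemm_op;

  // D aliases C here, so the epilogue updates C in place.
  return gemm_op({
      {m, n, k},
      {A, lda}, stride_A,
      {B, ldb}, stride_B,
      {C, ldc}, stride_C,
      {C, ldc}, stride_C,
      {alpha, beta},
      batch_count});
}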
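At the kernel level, launching GemmBatched directly mirrors what the device-level wrapper does internally: size the grid from the tiled problem shape so that blockIdx.z covers the batch (the for loop in operator() picks up any batches beyond the grid's Z-dimension limit), launch kThreadCount threads per CTA, and reserve sizeof(SharedStorage) bytes of dynamic shared memory. A hedged sketch, assuming GemmKernel is a concrete instantiation of kernel::GemmBatched (e.g. with cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle) and that params was built with the constructor shown above:

#include <cuda_runtime.h>
#include "cutlass/cutlass.h"
#include "cutlass/device_kernel.h"   // cutlass::Kernel<Operator>

// Sketch only: GemmKernel stands for a concrete kernel::GemmBatched instantiation.
template <typename GemmKernel>
cudaError_t launch_batched_gemm(typename GemmKernel::Params const &params,
                                cutlass::gemm::GemmCoord grid_tiled_shape,
                                cudaStream_t stream = nullptr) {

  typename GemmKernel::ThreadblockSwizzle swizzle;

  // One CTA per output tile in X/Y; Z strides over batch indices.
  dim3 grid = swizzle.get_grid_shape(grid_tiled_shape);
  dim3 block(GemmKernel::kThreadCount, 1, 1);

  // Dynamic shared memory for the kernel's SharedStorage union.
  // (Sizes beyond 48 KB would additionally require cudaFuncSetAttribute
  // with cudaFuncAttributeMaxDynamicSharedMemorySize.)
  int smem_size = int(sizeof(typename GemmKernel::SharedStorage));

  // cutlass::Kernel<T> is the __global__ shim that places SharedStorage in
  // dynamic shared memory and invokes GemmKernel::operator().
  cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params);

  return cudaGetLastError();
}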