CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
kernel/gemm_splitk_parallel.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include "cutlass/cutlass.h"
32 
33 #include "cutlass/gemm/gemm.h"
34 #include "cutlass/matrix_coord.h"
35 
37 
namespace cutlass {
namespace gemm {
namespace kernel {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Kernel computing one K-slice of a split-K parallel GEMM.
///
/// The K dimension is divided into grid_tiled_shape.k() slices. Each threadblock
/// computes the partial product for its slice and writes it into a distinct
/// partition of the D workspace (partitions are splitk_slice_stride elements
/// apart); a separate reduction kernel is expected to combine the partitions.
template <
  typename Mma_,                  ///< Threadblock-scoped matrix multiply-accumulate
  typename Epilogue_,             ///< Epilogue writing the partial product
  typename ThreadblockSwizzle_    ///< Threadblock swizzling function
>
struct GemmSplitKParallel {

  using Mma = Mma_;
  using Epilogue = Epilogue_;
  using OutputOp = typename Epilogue::OutputOp;
  using ThreadblockSwizzle = ThreadblockSwizzle_;

  /// Warp count (concept: GemmShape)
  using WarpCount = typename Mma::WarpCount;

  /// Threads per threadblock: 32 threads per warp times warps per block
  static int const kThreadCount = 32 * WarpCount::kCount;

  /// K-dimension alignment granularity (one warp-level MMA shape in K)
  static int const kAlignmentK = Mma::Operator::Shape::kK;

  /// Parameters structure
  struct Params {
    cutlass::gemm::GemmCoord problem_size;
    cutlass::gemm::GemmCoord grid_tiled_shape;
    typename Mma::IteratorA::Params params_A;
    typename Mma::IteratorA::TensorRef ref_A;
    typename Mma::IteratorB::Params params_B;
    typename Mma::IteratorB::TensorRef ref_B;
    typename Epilogue::OutputTileIterator::Params params_D;
    typename Epilogue::OutputTileIterator::TensorRef ref_D;
    typename OutputOp::Params output_op;
    int64_t splitk_slice_stride;   ///< pointer offset between consecutive K-slice
                                   ///  partitions of the D workspace (applied via
                                   ///  iterator_D.add_pointer_offset below)
    int gemm_k_size;               ///< number of K elements assigned to each slice
                                   ///  (multiple of Mma::Shape::kK; computed in ctor)

    //
    // Methods
    //

    CUTLASS_HOST_DEVICE
    Params() { }

    CUTLASS_HOST_DEVICE
    Params(
      cutlass::gemm::GemmCoord const & problem_size,
      cutlass::gemm::GemmCoord const & grid_tiled_shape,
      typename Mma::IteratorA::TensorRef ref_A,
      typename Mma::IteratorB::TensorRef ref_B,
      typename Epilogue::OutputTileIterator::TensorRef ref_D,
      typename OutputOp::Params output_op,
      int64_t splitk_slice_stride
    ):
      problem_size(problem_size),
      grid_tiled_shape(grid_tiled_shape),
      params_A(ref_A.layout()),
      ref_A(ref_A),
      params_B(ref_B.layout()),
      ref_B(ref_B),
      params_D(ref_D.layout()),
      ref_D(ref_D),
      output_op(output_op),
      splitk_slice_stride(splitk_slice_stride) {

      // Divide the whole Mma::Shape::kK iterations evenly among the K-slices;
      // the final slice absorbs any remainder (see operator()).
      int full_gemm_k_iterations = problem_size.k() / Mma::Shape::kK;
      int gemm_k_iterations = full_gemm_k_iterations / grid_tiled_shape.k();

      gemm_k_size = gemm_k_iterations * Mma::Shape::kK;
    }
  };

  /// Shared memory storage structure
  /// NOTE(review): reconstructed as a struct; upstream may declare this as a
  /// union to overlap mainloop and epilogue storage -- confirm against source.
  struct SharedStorage {
    typename Mma::SharedStorage main_loop;
    typename Epilogue::SharedStorage epilogue;
  };

  //
  // Methods
  //

  CUTLASS_HOST_DEVICE
  GemmSplitKParallel() { }

  /// Executes one GEMM
  CUTLASS_DEVICE
  void operator()(Params const &params, SharedStorage &shared_storage) {

    // Compute threadblock location
    ThreadblockSwizzle threadblock_swizzle;

    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset();

    // Early exit if CTA is out of range
    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {

      return;
    }

    // Compute initial location in logical coordinates
    cutlass::MatrixCoord tb_offset_A{
      threadblock_tile_offset.m() * Mma::Shape::kM,
      threadblock_tile_offset.k() * params.gemm_k_size,
    };

    cutlass::MatrixCoord tb_offset_B{
      threadblock_tile_offset.k() * params.gemm_k_size,
      threadblock_tile_offset.n() * Mma::Shape::kN
    };

    // Problem size is a function of threadblock index in the K dimension
    int problem_size_k;
    if (threadblock_tile_offset.k() + 1 == params.grid_tiled_shape.k()) {
      // Last K-slice extends to the true end of K, absorbing the remainder
      problem_size_k = params.problem_size.k();
    }
    else {
      problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
    }

    // Compute threadblock-scoped matrix multiply-add (ceil-div over the
    // K extent belonging to this slice)
    int gemm_k_iterations = (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK;

    // Compute position within threadblock
    int thread_idx = threadIdx.x;

    // Construct iterators to A and B operands
    typename Mma::IteratorA iterator_A(
      params.params_A,
      params.ref_A.data(),
      {params.problem_size.m(), problem_size_k},
      thread_idx,
      tb_offset_A);

    typename Mma::IteratorB iterator_B(
      params.params_B,
      params.ref_B.data(),
      {problem_size_k, params.problem_size.n()},
      thread_idx,
      tb_offset_B);

    int warp_idx = threadIdx.x / 32;
    int lane_idx = threadIdx.x % 32;

    //
    // Main loop
    //

    // Construct thread-scoped matrix multiply
    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);

    typename Mma::FragmentC accumulators;

    accumulators.clear();

    mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);

    //
    // Epilogue
    //

    OutputOp output_op(params.output_op);

    //
    // Masked tile iterators constructed from members
    //

    threadblock_tile_offset = threadblock_swizzle.get_tile_offset();

    // assume identity swizzle
    MatrixCoord threadblock_offset(
      threadblock_tile_offset.m() * Mma::Shape::kM,
      threadblock_tile_offset.n() * Mma::Shape::kN
    );

    // Tile iterator writing to output tile
    typename Epilogue::OutputTileIterator iterator_D(
      params.params_D,
      params.ref_D.data(),
      params.problem_size.mn(),
      thread_idx,
      threadblock_offset
    );

    // Advance D to this K-slice's partition of the workspace
    iterator_D.add_pointer_offset(params.splitk_slice_stride * threadblock_tile_offset.k());

    // Execute the epilogue
    Epilogue epilogue(
      shared_storage.epilogue,
      thread_idx,
      warp_idx,
      lane_idx);

    // Run efficient epilogue
    epilogue(output_op, iterator_D, accumulators, iterator_D);
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace kernel
} // namespace gemm
} // namespace cutlass
244 
CUTLASS_DEVICE void operator()(Params const &params, SharedStorage &shared_storage)
Executes one GEMM.
Definition: kernel/gemm_splitk_parallel.h:126
CUTLASS_HOST_DEVICE GemmSplitKParallel()
Definition: kernel/gemm_splitk_parallel.h:122
Definition: aligned_buffer.h:35
Epilogue_ Epilogue
Definition: kernel/gemm_splitk_parallel.h:52
cutlass::gemm::GemmCoord problem_size
Definition: kernel/gemm_splitk_parallel.h:64
Shared memory storage structure.
Definition: kernel/gemm_splitk_parallel.h:112
Epilogue::SharedStorage epilogue
Definition: kernel/gemm_splitk_parallel.h:114
Definition: include/cutlass/gemm/gemm.h:94
CUTLASS_HOST_DEVICE Coord< 2 > mn() const
Obtains a Coord<2> from GemmCoord.
Definition: include/cutlass/gemm/gemm.h:171
cutlass::gemm::GemmCoord grid_tiled_shape
Definition: kernel/gemm_splitk_parallel.h:65
static int const kThreadCount
Definition: kernel/gemm_splitk_parallel.h:58
Mma::SharedStorage main_loop
Definition: kernel/gemm_splitk_parallel.h:113
Defines common types used for all GEMM-like operators.
CUTLASS_HOST_DEVICE Index const & n() const
Returns the GEMM N coordinate.
Definition: include/cutlass/gemm/gemm.h:137
Parameters structure.
Definition: kernel/gemm_splitk_parallel.h:63
typename Mma::WarpCount WarpCount
Warp count (concept: GemmShape)
Definition: kernel/gemm_splitk_parallel.h:57
ThreadblockSwizzle_ ThreadblockSwizzle
Definition: kernel/gemm_splitk_parallel.h:54
CUTLASS_HOST_DEVICE Params(cutlass::gemm::GemmCoord const &problem_size, cutlass::gemm::GemmCoord const &grid_tiled_shape, typename Mma::IteratorA::TensorRef ref_A, typename Mma::IteratorB::TensorRef ref_B, typename Epilogue::OutputTileIterator::TensorRef ref_D, typename OutputOp::Params output_op, int64_t splitk_slice_stride)
Definition: kernel/gemm_splitk_parallel.h:84
OutputOp::Params output_op
Definition: kernel/gemm_splitk_parallel.h:72
CUTLASS_HOST_DEVICE Index const & k() const
Returns the GEMM K coordinate.
Definition: include/cutlass/gemm/gemm.h:145
Mma::IteratorA::TensorRef ref_A
Definition: kernel/gemm_splitk_parallel.h:67
Mma::IteratorB::TensorRef ref_B
Definition: kernel/gemm_splitk_parallel.h:69
int gemm_k_size
Definition: kernel/gemm_splitk_parallel.h:74
Epilogue::OutputTileIterator::TensorRef ref_D
Definition: kernel/gemm_splitk_parallel.h:71
CUTLASS_HOST_DEVICE Params()
Definition: kernel/gemm_splitk_parallel.h:81
Epilogue::OutputTileIterator::Params params_D
Definition: kernel/gemm_splitk_parallel.h:70
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
Mma::IteratorA::Params params_A
Definition: kernel/gemm_splitk_parallel.h:66
static int const kAlignmentK
Definition: kernel/gemm_splitk_parallel.h:60
Defines a canonical coordinate for rank=2 matrices offering named indices.
Definition: kernel/gemm_splitk_parallel.h:49
CUTLASS_HOST_DEVICE Index const & m() const
Returns the GEMM M coordinate.
Definition: include/cutlass/gemm/gemm.h:129
Mma_ Mma
Definition: kernel/gemm_splitk_parallel.h:51
Mma::IteratorB::Params params_B
Definition: kernel/gemm_splitk_parallel.h:68
int64_t splitk_slice_stride
Definition: kernel/gemm_splitk_parallel.h:73
Basic include for CUTLASS.
Definition: matrix_coord.h:39
typename Epilogue::OutputOp OutputOp
Definition: kernel/gemm_splitk_parallel.h:53