CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
batched_reduction_traits.h
Go to the documentation of this file.
1 /***************************************************************************************************
2 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without modification, are permitted
5 * provided that the following conditions are met:
6 * * Redistributions of source code must retain the above copyright notice, this list of
7 * conditions and the following disclaimer.
8 * * Redistributions in binary form must reproduce the above copyright notice, this list of
9 * conditions and the following disclaimer in the documentation and/or other materials
10 * provided with the distribution.
11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12 * to endorse or promote products derived from this software without specific prior written
13 * permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 *
24 **************************************************************************************************/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/shape.h"
#include "cutlass/reduction/threadblock_swizzle.h"
#include "cutlass/reduction/batched_reduction.h"
#include "cutlass/gemm/linear_scaling.h"
36 namespace cutlass {
37 namespace reduction {
38 
39 /*
40 OutputTile defines the work load per thread block
41 Subtile defines the work load per thread block per iteration
42 OutputTile / Subtile = number of iterations within a kernel
43 ThreadShape defines the work load per thread
44 Subtile / ThreadShape = number of threads per thread block
45 */
46 template <
48  typename ScalarA_,
50  typename ScalarC_,
52  typename ScalarD_,
54  typename ScalarAlphaBeta_,
56  typename ScalarAccum_,
58  int ReductionSize_ = 1,
60  typename OutputTile_ = Shape<1, 1, 128>,
62  typename SubTile_ = Shape<1, 1, 64>,
64  typename ThreadShape_ = Shape<1, 1, 2>,
66  typename Index_ = int,
68  typename BlockSwizzle_ = DefaultBlockSwizzle,
70  int maxInReg_ = 160,
72  int maxOutReg_ = 64,
74  typename Functor_ = typename cutlass::gemm::LinearScaling<ScalarAlphaBeta_, typename cutlass::gemm::FragmentMultiplyAdd<ScalarAlphaBeta_, ScalarAccum_, (ThreadShape_::kW % 2 == 0)> >
75 >
78  typedef BatchedReductionTraits<ScalarA_,
79  ScalarC_,
80  ScalarD_,
81  ScalarAlphaBeta_,
82  ScalarAccum_,
83  ReductionSize_,
84  OutputTile_,
85  SubTile_,
86  ThreadShape_,
87  Index_,
88  BlockSwizzle_,
89  maxInReg_,
90  maxOutReg_,
91  Functor_> This_;
95  typedef OutputTile_ OutputTile;
97  typedef SubTile_ SubTile;
99  typedef ThreadShape_ ThreadShape;
101  typedef ScalarA_ ScalarA;
103  typedef ScalarC_ ScalarC;
105  typedef ScalarD_ ScalarD;
107  typedef ScalarAlphaBeta_ ScalarAlphaBeta;
109  typedef ScalarAccum_ ScalarAccum;
111  typedef Index_ Index;
113  typedef BlockSwizzle_ BlockSwizzle;
115  static const int ReductionSize = ReductionSize_;
117  static const bool ThreadShapeMultiple2 = (ThreadShape::kW % 2 == 0);
119  typedef Functor_ Functor;
122  static int const kThreads = SubTile::kW / ThreadShape::kW;
123  //
124  static int const maxInReg = maxInReg_;
125  //
126  static int const maxOutReg = maxOutReg_;
127  //
128  static_assert(SubTile::kW % ThreadShape::kW == 0, "cannot evenly distribute work load among threads");
129  //
130  static_assert(kThreads % 32 == 0, "threads per threadblock is not multiple of 32");
131  //
132  static_assert(OutputTile::kW % SubTile::kW == 0, "cannot evenly distribute work load among iterations");
133  //
134  static_assert(ReductionSize * ThreadShape::kW <= maxInReg, "ReductionSize * ThreadShape::kW should not be bigger than maxInReg");
135  //
136  static_assert(ThreadShape::kW <= maxOutReg, "ThreadShape::kW should not be bigger than maxOutReg");
137 
138  struct Params {
142  ScalarAlphaBeta alpha;
144  ScalarAlphaBeta beta;
146  long long int reduction_stride;
147  //
148  ScalarA const *d_a;
149  //
150  Index lda;
151  //
152  ScalarC const *d_c;
153  //
154  Index ldc;
155  //
156  ScalarD *d_d;
157  //
158  Index ldd;
160  typename Functor::Params functorParams;
163  Index n_,
164  ScalarAlphaBeta alpha_,
165  ScalarAlphaBeta beta_,
166  long long int reduction_stride_,
167  ScalarA const *d_a_,
168  Index lda_,
169  ScalarC const *d_c_,
170  Index ldc_,
171  ScalarD *d_d_,
172  Index ldd_){
173  problem_size = make_Coord(1, n_, m_);
174  alpha = alpha_;
175  beta = beta_;
176  reduction_stride = reduction_stride_;
177  d_a = d_a_;
178  lda = lda_;
179  d_c = d_c_;
180  d_d = d_d_;
181  ldc = ldc_;
182  ldd = ldd_;
183 
184  functorParams.initialize(alpha_, beta_);
185 
186  return 0;
187  }
188  };
189 
190 };
191 } // namespace reduction
192 } // namespace cutlass
Coord< 3 > problem_size
The dimension of output tensor.
Definition: batched_reduction_traits.h:140
Definition: aligned_buffer.h:35
Definition: batched_reduction_traits.h:138
BlockSwizzle_ BlockSwizzle
The thread block swizzle.
Definition: batched_reduction_traits.h:113
BatchedReductionTraits< ScalarA_, ScalarC_, ScalarD_, ScalarAlphaBeta_, ScalarAccum_, ReductionSize_, OutputTile_, SubTile_, ThreadShape_, Index_, BlockSwizzle_, maxInReg_, maxOutReg_, Functor_ > This_
Definition: batched_reduction_traits.h:91
static int const kThreads
Definition: batched_reduction_traits.h:122
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 2-element coordinate.
Definition: coord.h:387
ScalarAccum_ ScalarAccum
The type for accumulation.
Definition: batched_reduction_traits.h:109
Index lda
Definition: batched_reduction_traits.h:150
ScalarAlphaBeta beta
The beta.
Definition: batched_reduction_traits.h:144
Defines functors for mapping blockIdx to partitions of the batched reduction computation.
ThreadShape_ ThreadShape
Definition: batched_reduction_traits.h:99
Index ldd
Definition: batched_reduction_traits.h:158
ScalarD_ ScalarD
The output pointer type.
Definition: batched_reduction_traits.h:105
long long int reduction_stride
stride between two elements that will be summed
Definition: batched_reduction_traits.h:146
SubTile_ SubTile
Definition: batched_reduction_traits.h:97
ScalarC const * d_c
Definition: batched_reduction_traits.h:152
OutputTile_ OutputTile
Definition: batched_reduction_traits.h:95
Index ldc
Definition: batched_reduction_traits.h:154
ScalarAlphaBeta_ ScalarAlphaBeta
The alpha beta type.
Definition: batched_reduction_traits.h:107
ScalarC_ ScalarC
Definition: batched_reduction_traits.h:103
ScalarA const * d_a
Definition: batched_reduction_traits.h:148
Definition: batched_reduction.h:52
static const int ReductionSize
Definition: batched_reduction_traits.h:115
ScalarA_ ScalarA
The input pointer type.
Definition: batched_reduction_traits.h:101
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
#define static_assert(__e, __m)
Definition: platform.h:153
ScalarAlphaBeta alpha
The alpha.
Definition: batched_reduction_traits.h:142
Functor_ Functor
Definition: batched_reduction_traits.h:119
static int const maxInReg
Definition: batched_reduction_traits.h:124
ScalarD * d_d
Definition: batched_reduction_traits.h:156
Functor::Params functorParams
The functor params.
Definition: batched_reduction_traits.h:160
static const bool ThreadShapeMultiple2
check if threadShape is multiple of 2.
Definition: batched_reduction_traits.h:117
Index_ Index
The index.
Definition: batched_reduction_traits.h:111
static int const maxOutReg
Definition: batched_reduction_traits.h:126
Implements a software-pipelined efficient batched reduction. D = alpha * Reduction(A) + beta * C...
Basic include for CUTLASS.
Definition: batched_reduction_traits.h:76
CUTLASS_HOST_DEVICE int initialize(Index m_, Index n_, ScalarAlphaBeta alpha_, ScalarAlphaBeta beta_, long long int reduction_stride_, ScalarA const *d_a_, Index lda_, ScalarC const *d_c_, Index ldc_, ScalarD *d_d_, Index ldd_)
Initialize the parameters for 2D output tensor.
Definition: batched_reduction_traits.h:162
cutlass::reduction::BatchedReduction< This_ > KernelClass
The struct that consumes this Traits.
Definition: batched_reduction_traits.h:93