cutlass/reduce__split__k_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/tensor_ref.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/array.h"
 #include "cutlass/functional.h"
 #include "cutlass/matrix_shape.h"
 #include "cutlass/numeric_conversion.h"

 #include "cutlass/layout/matrix.h"


 namespace cutlass {
 namespace reduction {
 namespace kernel {


 /// Kernel functor that sums `partitions` slices of a workspace tensor element-wise and passes the
 /// accumulated result (optionally combined with a source tensor) through an output operator
 /// before storing it to the destination tensor.
 ///
 /// Work decomposition, as established by grid_shape(), block_shape() and operator():
 ///   - one threadblock covers a Shape::kRow x Shape::kColumn tile of the problem,
 ///   - one thread covers kElementsPerAccess consecutive columns of a single row.
 ///
 /// Template parameters:
 ///   Shape_             - tile shape; must expose integral constants kRow and kColumn.
 ///   OutputOp_          - output operator; must expose kCount, ElementOutput, FragmentOutput,
 ///                        Params, is_source_needed() and operator()(accumulator, source).
 ///   ReductionOp_       - reduction operator; must expose Element, ElementAccumulator, Params and
 ///                        operator()(accumulator, workspace_fragment).
 ///   PartitionsPerStage - number of partitions loaded back-to-back before they are accumulated.
 template <
   typename Shape_,
   typename OutputOp_ ,
   typename ReductionOp_,
   int PartitionsPerStage = 4
 >
 class ReduceSplitK {
 public:

   using Shape = Shape_;
   using ReductionOp = ReductionOp_;
   using OutputOp = OutputOp_;

   /// Number of elements each thread loads/stores per access (taken from the output operator)
   static int const kElementsPerAccess = OutputOp::kCount;

   /// Number of partitions fetched per iteration of the outer reduction loop
   static int const kPartitionsPerStage = PartitionsPerStage;

   // block_shape() launches Shape::kColumn / kElementsPerAccess threads along x. If the division
   // is inexact, the trailing columns of every tile would never be visited by any thread.
   static_assert(kElementsPerAccess > 0 && (Shape::kColumn % kElementsPerAccess) == 0,
     "Shape::kColumn must be a positive multiple of OutputOp::kCount");

   // The staging buffer has kPartitionsPerStage entries and the outer loop advances by
   // kPartitionsPerStage, so a non-positive value cannot work.
   static_assert(kPartitionsPerStage > 0,
     "PartitionsPerStage must be greater than zero");

   using ElementWorkspace = typename ReductionOp::Element;
   using ElementAccumulator = typename ReductionOp::ElementAccumulator;
   using ElementOutput = typename OutputOp::ElementOutput;

   using WorkspaceTensorRef = TensorRef<ElementWorkspace, layout::RowMajor>;
   using OutputTensorRef = TensorRef<ElementOutput, layout::RowMajor>;

   using FragmentWorkspace = AlignedArray<ElementWorkspace, kElementsPerAccess>;
   using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
   using FragmentOutput = AlignedArray<ElementOutput, kElementsPerAccess>;

   //
   // Types
   //

   /// Params structure
   struct Params {

     MatrixCoord problem_size;                 ///< extent (rows, columns) of the reduced tensor
     int partitions;                           ///< number of workspace slices to accumulate
     size_t partition_stride;                  ///< distance between consecutive slices, in BYTES
     WorkspaceTensorRef workspace;             ///< first slice of the workspace
     OutputTensorRef destination;              ///< tensor receiving the final result
     OutputTensorRef source;                   ///< tensor read only if the output op needs it
     typename OutputOp::Params output;         ///< parameters forwarded to OutputOp
     typename ReductionOp::Params reduction;   ///< parameters forwarded to ReductionOp

     //
     // Methods
     //

     /// Default constructor. The scalar members are zero-initialized so that a default-constructed
     /// object never carries indeterminate values; the remaining members use their own default
     /// constructors.
     CUTLASS_HOST_DEVICE
     Params(): partitions(0), partition_stride(0) { }

     /// Constructs the parameters.
     ///
     /// partition_stride_ is rescaled by sizeof(FragmentWorkspace) / kElementsPerAccess before
     /// being stored, because the kernel advances a byte pointer by partition_stride.
     /// NOTE(review): this implies partition_stride_ is given in workspace elements - confirm
     /// against callers.
     CUTLASS_HOST_DEVICE
     Params(
       MatrixCoord problem_size_,
       int partitions_,
       size_t partition_stride_,
       WorkspaceTensorRef workspace_,
       OutputTensorRef destination_,
       OutputTensorRef source_,
       typename OutputOp::Params output_ = typename OutputOp::Params(),
       typename ReductionOp::Params reduction_ = typename ReductionOp::Params()
     ):
       problem_size(problem_size_),
       partitions(partitions_),
       partition_stride(sizeof(FragmentWorkspace) * partition_stride_ / kElementsPerAccess),
       workspace(workspace_),
       destination(destination_),
       source(source_),
       output(output_),
       reduction(reduction_) {

     }
   };

   /// Shared storage type; empty because operator() does not reference its storage argument.
   struct SharedStorage { };


 public:

   /// Computes the grid size given a chosen threadblock shape: x counts tiles along the columns,
   /// y counts tiles along the rows, both rounded up.
   CUTLASS_HOST_DEVICE
   static dim3 grid_shape(
     cutlass::MatrixCoord problem_size) {

     return dim3(
       (problem_size.column() + Shape::kColumn - 1) / Shape::kColumn,
       (problem_size.row() + Shape::kRow -1) / Shape::kRow);
   }

   /// Determines the threadblock shape: each thread along x owns kElementsPerAccess columns,
   /// each thread along y owns one row of the tile.
   CUTLASS_HOST_DEVICE
   static dim3 block_shape() {
     return dim3(Shape::kColumn / kElementsPerAccess, Shape::kRow);
   }

   /// Perform a reduction.
   ///
   /// Expects to be launched with grid_shape(params.problem_size) and block_shape(), since the
   /// thread coordinate below is derived from Shape and kElementsPerAccess.
   ///
   /// Preconditions visible from the code:
   ///   - The bounds guard tests only the FIRST column of each thread's vector access, while the
   ///     loads and the store touch kElementsPerAccess consecutive elements. The column extent
   ///     must therefore be a multiple of kElementsPerAccess (or the buffers padded accordingly).
   ///   - Workspace, source and destination addresses are reinterpreted as AlignedArray fragments.
   ///     NOTE(review): this presumably requires pointers and strides aligned to the fragment
   ///     size - confirm against AlignedArray's alignment.
   ///
   /// params  - kernel parameters (see Params)
   /// storage - not referenced by this kernel
   CUTLASS_DEVICE
   void operator()(Params const &params, SharedStorage &storage) {

     // Determine this thread's (row, column) coordinate within the problem
     MatrixCoord thread_offset(
       int(blockIdx.y) * Shape::kRow + threadIdx.y,
       int(blockIdx.x) * Shape::kColumn + threadIdx.x * kElementsPerAccess
     );

     // One guard conditional: threads whose coordinate lies outside the problem do nothing
     if (!(thread_offset.row() < params.problem_size.row() &&
           thread_offset.column() < params.problem_size.column())) {

       return;
     }


     ReductionOp reduction_op(params.reduction);

     FragmentAccumulator accumulator;

     accumulator.clear();

     //
     // Locate this thread's fragment within the first slice
     //

     // A byte pointer is used because params.partition_stride is expressed in bytes.
     char const *workspace_ptr =
       reinterpret_cast<char const *>(
         params.workspace.data() + params.workspace.offset(thread_offset));

     FragmentWorkspace workspace_frag[kPartitionsPerStage];

     //
     // Construct the output operator
     //

     OutputOp output_op(params.output);

     //
     // Load and accumulate with a simple batched loading sequence.
     //

     CUTLASS_PRAGMA_NO_UNROLL
     for (int k = 0; k < params.partitions; k += kPartitionsPerStage) {

       // Issue every load of the stage before consuming any of them (presumably so the loads can
       // be in flight together). The k + i guard handles a final, partially filled stage.
       CUTLASS_PRAGMA_UNROLL
       for (int i = 0; i < kPartitionsPerStage; ++i) {
         if (k + i < params.partitions) {
           workspace_frag[i] = *reinterpret_cast<FragmentWorkspace const *>(workspace_ptr);
           workspace_ptr += params.partition_stride;
         }
       }

       // Fold the fragments loaded above into the accumulator, in partition order.
       CUTLASS_PRAGMA_UNROLL
       for (int i = 0; i < kPartitionsPerStage; ++i) {
         if (k + i < params.partitions) {
           accumulator = reduction_op(accumulator, workspace_frag[i]);
         }
       }
     }

     //
     // Conditionally load the source
     //

     FragmentOutput source_frag;

     // The source fragment stays cleared when the output operator does not need the source.
     source_frag.clear();

     FragmentOutput const *source_ptr = reinterpret_cast<FragmentOutput const *>(
       params.source.data() + params.source.offset(thread_offset));

     // The pointer is only dereferenced when the output operator asks for the source.
     if (output_op.is_source_needed()) {
       reinterpret_cast<FragmentOutput &>(source_frag) = *source_ptr;
     }

     //
     // Compute the output
     //

     typename OutputOp::FragmentOutput output_frag = output_op(accumulator, source_frag);

     //
     // Store
     //

     FragmentOutput *dest_ptr = reinterpret_cast<FragmentOutput *>(
       params.destination.data() + params.destination.offset(thread_offset));

     // OutputOp::FragmentOutput is reinterpreted as the aligned fragment type for the store.
     *dest_ptr = reinterpret_cast<FragmentOutput const &>(output_frag);
   }
 };


 } // namespace kernel
 } // namespace reduction
 } // namespace cutlass
cutlass::reduction::kernel::ReduceSplitK::ElementWorkspace
typename ReductionOp::Element ElementWorkspace
Definition: reduce_split_k.h:64

cutlass::reduction::kernel::ReduceSplitK::ElementAccumulator
typename ReductionOp::ElementAccumulator ElementAccumulator
Definition: reduce_split_k.h:65

cutlass::MatrixCoord::column
CUTLASS_HOST_DEVICE Index const & column() const
Returns the column of the coordinate.
Definition: matrix_coord.h:85

cutlass::reduction::kernel::ReduceSplitK::Params::source
OutputTensorRef source
Definition: reduce_split_k.h:87

cutlass
Definition: aligned_buffer.h:35

cutlass::reduction::kernel::ReduceSplitK::Params::destination
OutputTensorRef destination
Definition: reduce_split_k.h:86

tensor_ref.h
Defines a structure containing strides, bounds, and a pointer to tensor data.

cutlass::reduction::kernel::ReduceSplitK::Params::partition_stride
size_t partition_stride
Definition: reduce_split_k.h:84

cutlass::TensorRef::data
CUTLASS_HOST_DEVICE Element * data() const
Returns the pointer to referenced data.
Definition: tensor_ref.h:254

cutlass::reduction::kernel::ReduceSplitK::block_shape
static CUTLASS_HOST_DEVICE dim3 block_shape()
Determines the threadblock shape.
Definition: reduce_split_k.h:138

cutlass::AlignedArray
Aligned array type.
Definition: array.h:511

cutlass::reduction::kernel::ReduceSplitK::ElementOutput
typename OutputOp::ElementOutput ElementOutput
Definition: reduce_split_k.h:66

cutlass::MatrixCoord::row
CUTLASS_HOST_DEVICE Index const & row() const
Returns the row of the coordinate.
Definition: matrix_coord.h:77

cutlass::reduction::kernel::ReduceSplitK::Params::partitions
int partitions
Definition: reduce_split_k.h:83

cutlass::reduction::kernel::ReduceSplitK::Params::Params
CUTLASS_HOST_DEVICE Params(MatrixCoord problem_size_, int partitions_, size_t partition_stride_, WorkspaceTensorRef workspace_, OutputTensorRef destination_, OutputTensorRef source_, typename OutputOp::Params output_=typename OutputOp::Params(), typename ReductionOp::Params reduction_=typename ReductionOp::Params())
Definition: reduce_split_k.h:99

cutlass::reduction::kernel::ReduceSplitK::Params::Params
CUTLASS_HOST_DEVICE Params()
Definition: reduce_split_k.h:96

cutlass::reduction::kernel::ReduceSplitK::Params
Params structure.
Definition: reduce_split_k.h:80

cutlass::reduction::kernel::ReduceSplitK::ReductionOp
ReductionOp_ ReductionOp
Definition: reduce_split_k.h:59

cutlass::reduction::kernel::ReduceSplitK::operator()
CUTLASS_DEVICE void operator()(Params const &params, SharedStorage &storage)
Perform a reduction.
Definition: reduce_split_k.h:144

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

cutlass::reduction::kernel::ReduceSplitK::Shape
Shape_ Shape
Definition: reduce_split_k.h:58

numeric_conversion.h
Boost-like numeric conversion operator for CUTLASS numeric types.

matrix_shape.h
Defines a Shape template for matrix tiles.

cutlass::reduction::kernel::ReduceSplitK::Params::workspace
WorkspaceTensorRef workspace
Definition: reduce_split_k.h:85

cutlass::TensorRef< ElementWorkspace, layout::RowMajor >

cutlass::reduction::kernel::ReduceSplitK::Params::reduction
ReductionOp::Params reduction
Definition: reduce_split_k.h:89

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::TensorRef::offset
CUTLASS_HOST_DEVICE LongIndex offset(TensorCoord const &coord) const
Computes the offset of an index from the origin of the tensor.
Definition: tensor_ref.h:301

cutlass::reduction::kernel::ReduceSplitK
Definition: reduce_split_k.h:55

CUTLASS_PRAGMA_NO_UNROLL
#define CUTLASS_PRAGMA_NO_UNROLL
Definition: cutlass.h:111

cutlass::reduction::kernel::ReduceSplitK::kPartitionsPerStage
static int const kPartitionsPerStage
Definition: reduce_split_k.h:62

cutlass::reduction::kernel::ReduceSplitK::grid_shape
static CUTLASS_HOST_DEVICE dim3 grid_shape(cutlass::MatrixCoord problem_size)
Computes the grid size given a chosen threadblock shape.
Definition: reduce_split_k.h:128

cutlass::reduction::kernel::ReduceSplitK::kElementsPerAccess
static int const kElementsPerAccess
Definition: reduce_split_k.h:61

matrix.h
Defines layout functions used by TensorRef and derived classes.

cutlass::reduction::kernel::ReduceSplitK::OutputOp
OutputOp_ OutputOp
Definition: reduce_split_k.h:60

cutlass::reduction::kernel::ReduceSplitK::Params::problem_size
MatrixCoord problem_size
Definition: reduce_split_k.h:82

cutlass::reduction::kernel::ReduceSplitK::FragmentAccumulator
Array< ElementAccumulator, kElementsPerAccess > FragmentAccumulator
Definition: reduce_split_k.h:72

cutlass.h
Basic include for CUTLASS.

cutlass::MatrixCoord
Definition: matrix_coord.h:39

cutlass::reduction::kernel::ReduceSplitK::SharedStorage
Definition: reduce_split_k.h:121

cutlass::reduction::kernel::ReduceSplitK::Params::output
OutputOp::Params output
Definition: reduce_split_k.h:88

functional.h
Define basic numeric operators with specializations for Array<T, N>. SIMD-ize where possible...