cutlass/epilogue__workspace_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/array.h"


 namespace cutlass {
 namespace epilogue {


 /// Epilogue that streams each thread's accumulator fragment out to a workspace in global
 /// memory using fixed-width (128-bit) vector stores.
 ///
 /// Template parameters:
 ///   Shape_     - re-exported as `Shape`; not otherwise used inside this class
 ///   WarpCount  - warp count used to size one threadblock's region of the workspace
 ///   FragmentC_ - accumulator fragment type; must provide `value_type` and `kElements`
 template <
   typename Shape_,
   int WarpCount,
   typename FragmentC_
 >
 class EpilogueWorkspace {
 public:

   using Shape = Shape_;
   using FragmentC = FragmentC_;
   using ElementC = typename FragmentC::value_type;

   /// Warp count taken from the template argument
   static int const kWarpCount = WarpCount;

   /// Optimize for 128b accesses
   static int const kAccessSizeInBits = 128;

   /// Warp size from the perspective of memory operations
   static int const kWarpSize = 32;

   /// Vector length of accesses (elements of ElementC per 128b access)
   static int const kElementsPerAccess =
     kAccessSizeInBits / sizeof_bits<ElementC>::value;

   // A fragment has to split into a whole number of vector accesses.
   static_assert(
     !(FragmentC::kElements % kElementsPerAccess),
     "The number of accumulators must be divisible by the access size.");

   /// Number of stores per thread
   static int const kIterations = FragmentC::kElements / kElementsPerAccess;

   /// Total number of vectorized accesses in warp (in units of vector)
   static int const kWarpAccesses = kIterations * kWarpSize;

   /// Total number of vectorized accesses in threadblock tile (in units of vector)
   static int const kThreadblockAccesses = kWarpAccesses * kWarpCount;

   /// Parameters structure
   struct Params {

     /// Pointer to C matrix
     ElementC *ptr_C;

     /// Stride between tiles along the GEMM N dimension (in units of vectors)
     int stride_n;

     /// Stride between tiles along the GEMM K dimension (in units of vectors)
     int stride_k;

     //
     // Methods
     //

     /// Stores the pointer and converts both strides to units of vectors by dividing the
     /// incoming values by kElementsPerAccess (integer division).
     CUTLASS_HOST_DEVICE
     Params(
       ElementC *ptr_C,
       int stride_n_,
       int stride_k_
     ):
       ptr_C(ptr_C),
       stride_n(stride_n_ / kElementsPerAccess),
       stride_k(stride_k_ / kElementsPerAccess) { }
   };

   /// Shared storage allocation needed by the epilogue
   struct SharedStorage {
     // Intentionally empty
   };

 private:

   /// One vector access: kElementsPerAccess elements, aligned to the access width in bytes
   struct alignas((kAccessSizeInBits / 8)) AccessType {
     Array<ElementC, kElementsPerAccess> storage;
   };

   /// Workspace pointer with this thread's lane/warp offset already applied
   AccessType *pointer_;

   /// Stride between tiles along the GEMM N dimension (in units of vectors)
   int stride_n_;

   /// Stride between tiles along the GEMM K dimension (in units of vectors)
   int stride_k_;

 public:

   /// Constructor. Views the workspace as an array of AccessType and advances the base by
   /// this thread's offset: lane_idx + warp_idx * kWarpAccesses. The SharedStorage argument
   /// is accepted but not used.
   CUTLASS_DEVICE
   EpilogueWorkspace(
     Params const &params,
     SharedStorage &,
     int warp_idx,
     int lane_idx
   ):
     pointer_(
       reinterpret_cast<AccessType *>(params.ptr_C) +
       (lane_idx + warp_idx * kWarpAccesses)),
     stride_n_(params.stride_n),
     stride_k_(params.stride_k) { }

   /// Streams the result to global memory.
   ///
   /// `tb_tile_coord` selects the destination tile; `accum` is the fragment to store.
   /// `problem_size` is not read by this implementation.
   CUTLASS_DEVICE
   void operator()(
     cutlass::gemm::GemmCoord problem_size,
     cutlass::gemm::GemmCoord tb_tile_coord,
     FragmentC const &accum) {

     // Locate this threadblock's tile; the per-thread offset is already part of pointer_.
     AccessType *dst = pointer_;
     dst += tb_tile_coord.m() * kThreadblockAccesses;
     dst += tb_tile_coord.n() * stride_n_;
     dst += tb_tile_coord.k() * stride_k_;

     // Reinterpret the accumulator fragment as a sequence of vector accesses.
     AccessType const *src = reinterpret_cast<AccessType const *>(&accum);

     // Successive stores from one thread are kWarpSize vectors apart.
     CUTLASS_PRAGMA_UNROLL
     for (int iter = 0; iter < kIterations; ++iter) {
       dst[iter * kWarpSize] = src[iter];
     }
   }
 };


 } // namespace epilogue
 } // namespace cutlass

cutlass
Definition: aligned_buffer.h:35

cutlass::epilogue::EpilogueWorkspace::SharedStorage
Shared storage allocation needed by the epilogue.
Definition: epilogue_workspace.h:124

cutlass::epilogue::EpilogueWorkspace::kAccessSizeInBits
static int const kAccessSizeInBits
Optimize for 128b accesses.
Definition: epilogue_workspace.h:74

cutlass::gemm::GemmCoord
Definition: include/cutlass/gemm/gemm.h:94

cutlass::epilogue::EpilogueWorkspace::kWarpAccesses
static int const kWarpAccesses
Total number of vectorized accesses in warp (in units of vector)
Definition: epilogue_workspace.h:91

cutlass::epilogue::EpilogueWorkspace::Params::Params
CUTLASS_HOST_DEVICE Params(ElementC *ptr_C, int stride_n_, int stride_k_)
Definition: epilogue_workspace.h:113

cutlass::gemm::GemmCoord::n
CUTLASS_HOST_DEVICE Index const & n() const
Returns the GEMM N coordinate.
Definition: include/cutlass/gemm/gemm.h:137

cutlass::epilogue::EpilogueWorkspace::kIterations
static int const kIterations
Number of stores per thread.
Definition: epilogue_workspace.h:84

cutlass::epilogue::EpilogueWorkspace::operator()
CUTLASS_DEVICE void operator()(cutlass::gemm::GemmCoord problem_size, cutlass::gemm::GemmCoord tb_tile_coord, FragmentC const &accum)
Streams the result to global memory.
Definition: epilogue_workspace.h:164

cutlass::epilogue::EpilogueWorkspace::Shape
Shape_ Shape
Definition: epilogue_workspace.h:67

cutlass::gemm::GemmCoord::k
CUTLASS_HOST_DEVICE Index const & k() const
Returns the GEMM K coordinate.
Definition: include/cutlass/gemm/gemm.h:145

cutlass::epilogue::EpilogueWorkspace::ElementC
typename FragmentC::value_type ElementC
Definition: epilogue_workspace.h:69

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

cutlass::sizeof_bits
Defines the size of an element in bits.
Definition: numeric_types.h:42

cutlass::epilogue::EpilogueWorkspace::Params::ptr_C
ElementC * ptr_C
Pointer to C matrix.
Definition: epilogue_workspace.h:100

cutlass::epilogue::EpilogueWorkspace::FragmentC
FragmentC_ FragmentC
Definition: epilogue_workspace.h:68

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::epilogue::EpilogueWorkspace::EpilogueWorkspace
CUTLASS_DEVICE EpilogueWorkspace(Params const &params, SharedStorage &, int warp_idx, int lane_idx)
Constructor.
Definition: epilogue_workspace.h:147

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::epilogue::EpilogueWorkspace::kWarpCount
static int const kWarpCount
Definition: epilogue_workspace.h:71

cutlass::epilogue::EpilogueWorkspace::Params::stride_n
int stride_n
Stride between tiles along the GEMM N dimension (in units of vectors)
Definition: epilogue_workspace.h:103

cutlass::epilogue::EpilogueWorkspace::kElementsPerAccess
static int const kElementsPerAccess
Vector length of accesses.
Definition: epilogue_workspace.h:80

cutlass::gemm::GemmCoord::m
CUTLASS_HOST_DEVICE Index const & m() const
Returns the GEMM M coordinate.
Definition: include/cutlass/gemm/gemm.h:129

cutlass::epilogue::EpilogueWorkspace::Params::stride_k
int stride_k
Stride between tiles along the GEMM K dimension (in units of vectors)
Definition: epilogue_workspace.h:106

cutlass::epilogue::EpilogueWorkspace::Params
Parameters structure.
Definition: epilogue_workspace.h:97

cutlass::epilogue::EpilogueWorkspace
Definition: epilogue_workspace.h:64

cutlass.h
Basic include for CUTLASS.

cutlass::epilogue::EpilogueWorkspace::kWarpSize
static int const kWarpSize
Warp size from the perspective of memory operations.
Definition: epilogue_workspace.h:77

cutlass::epilogue::EpilogueWorkspace::kThreadblockAccesses
static int const kThreadblockAccesses
Total number of vectorized accesses in threadblock tile (in units of vector)
Definition: epilogue_workspace.h:94