CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
linear_combination_clamp.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/array.h"
#include "cutlass/functional.h"
#include "cutlass/numeric_conversion.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace thread {

/////////////////////////////////////////////////////////////////////////////////////////////////
/// Applies a linear combination operator, D = alpha * accumulator + beta * source, then clamps
/// the result before converting to the output element type.
template <
  typename ElementOutput_,
  int Count,
  typename ElementAccumulator_ = ElementOutput_,
  typename ElementCompute_ = ElementOutput_,
  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
>
class LinearCombinationClamp {
public:

  using ElementOutput = ElementOutput_;
  using ElementAccumulator = ElementAccumulator_;
  using ElementCompute = ElementCompute_;

  static int const kCount = Count;

  using FragmentOutput = Array<ElementOutput, kCount>;
  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
  using ComputeFragment = Array<ElementCompute, kCount>;

  static FloatRoundStyle const kRound = Round;

  /// Host-constructable parameters structure
  struct Params {

    ElementCompute alpha;                  ///< scales accumulators
    ElementCompute beta;                   ///< scales source tensor
    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory

    //
    // Methods
    //

    CUTLASS_HOST_DEVICE
    Params():
      alpha(ElementCompute(1)),
      beta(ElementCompute(0)),
      alpha_ptr(nullptr),
      beta_ptr(nullptr) { }

    CUTLASS_HOST_DEVICE
    Params(
      ElementCompute alpha,
      ElementCompute beta
    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {

    }

    CUTLASS_HOST_DEVICE
    Params(
      ElementCompute const *alpha_ptr,
      ElementCompute const *beta_ptr
    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {

    }
  };

private:

  //
  // Data members
  //

  ElementCompute alpha_;
  ElementCompute beta_;

public:

  /// Constructs the function object, possibly loading from pointers in host memory
  CUTLASS_HOST_DEVICE
  LinearCombinationClamp(Params const &params) {

    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
  }

  /// Returns true if source is needed
  CUTLASS_HOST_DEVICE
  bool is_source_needed() const {
    return beta_ != ElementCompute(0);
  }

  /// Functionally required for serial reduction in the epilogue
  CUTLASS_HOST_DEVICE
  void set_k_partition(int k_partition) {
    if (k_partition) {
      beta_ = ElementCompute(1);
    }
  }

  /// Computes linear scaling: D = alpha * accumulator + beta * source
  CUTLASS_HOST_DEVICE
  FragmentOutput operator()(
    FragmentAccumulator const &accumulator,
    FragmentOutput const &source,
    ElementCompute uniform = ElementCompute(0)) const {

    // Convert source to internal compute numeric type
    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;

    ComputeFragment converted_source = source_converter(source);
    ComputeFragment converted_accumulator = accumulator_converter(accumulator);

    // Perform binary operations

    ComputeFragment intermediate;

    multiplies<ComputeFragment> mul_add_source;
    multiply_add<ComputeFragment> mul_add_accumulator;

    minimum<ComputeFragment> min_accumulator;
    maximum<ComputeFragment> max_accumulator;

    intermediate = mul_add_source(beta_, converted_source);                           // X = beta * C + uniform
    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);  // D = alpha * Accum + X

    // Clamp to the representable range of ElementOutput: [-2^(b-1), 2^(b-1) - 1]
    ElementCompute const kClamp = ElementCompute(1U << (sizeof_bits<ElementOutput>::value - 1));

    intermediate = max_accumulator(intermediate, -kClamp);
    intermediate = min_accumulator(intermediate, kClamp - ElementCompute(1));

    // Convert to destination numeric type
    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;

    return destination_converter(intermediate);
  }

};
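
// Editorial worked example (not part of the original source): with int8_t as ElementOutput,
// sizeof_bits<int8_t>::value == 8, so kClamp = ElementCompute(1 << 7) = 128 and the intermediate
// values above are clamped to [-128, 127] before conversion to the output element type.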

/////////////////////////////////////////////////////////////////////////////////////////////////

// Conditional guards to enable partial specialization for packed integers
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && (__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)

/// Partial specialization for int accumulators scaled in single-precision float, with the result
/// clamped before conversion to the output element type.
template <
  typename ElementOutput_,
  int Count,
  FloatRoundStyle Round
>
class LinearCombinationClamp<ElementOutput_, Count, int, float, Round> {
public:

  using ElementOutput = ElementOutput_;
  using ElementAccumulator = int;
  using ElementCompute = float;

  static int const kCount = Count;

  using FragmentOutput = Array<ElementOutput, kCount>;
  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
  using ComputeFragment = Array<ElementCompute, kCount>;

  static FloatRoundStyle const kRound = Round;

  /// Host-constructable parameters structure
  struct Params {

    ElementCompute alpha;                  ///< scales accumulators
    ElementCompute beta;                   ///< scales source tensor
    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory

    //
    // Methods
    //

    CUTLASS_HOST_DEVICE
    Params():
      alpha(ElementCompute(1)),
      beta(ElementCompute(0)),
      alpha_ptr(nullptr),
      beta_ptr(nullptr) { }

    CUTLASS_HOST_DEVICE
    Params(
      ElementCompute alpha,
      ElementCompute beta
    ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {

    }

    CUTLASS_HOST_DEVICE
    Params(
      ElementCompute const *alpha_ptr,
      ElementCompute const *beta_ptr
    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {

    }
  };

private:

  //
  // Data members
  //

  ElementCompute alpha_;
  ElementCompute beta_;

public:

  /// Constructs the function object, possibly loading from pointers in host memory
  CUTLASS_HOST_DEVICE
  LinearCombinationClamp(Params const &params) {

    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
  }

  /// Returns true if source is needed
  CUTLASS_HOST_DEVICE
  bool is_source_needed() const {
    return beta_ != ElementCompute(0);
  }

  /// Functionally required for serial reduction in the epilogue
  CUTLASS_HOST_DEVICE
  void set_k_partition(int k_partition) {
    if (k_partition) {
      beta_ = ElementCompute(1);
    }
  }

  /// Computes linear scaling: D = alpha * accumulator + beta * source
  CUTLASS_HOST_DEVICE
  FragmentOutput operator()(
    FragmentAccumulator const &accumulator,
    FragmentOutput const &source,
    ElementCompute uniform = ElementCompute(0)) const {

    // Convert source to internal compute numeric type
    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;

    ComputeFragment converted_source = source_converter(source);
    ComputeFragment converted_accumulator = accumulator_converter(accumulator);

    // Compute linear scaling in floating point
    ComputeFragment intermediate;

    multiplies<ComputeFragment> mul_add_source;
    multiply_add<ComputeFragment> mul_add_accumulator;

    // Float min-max
    intermediate = mul_add_source(beta_, converted_source);                           // X = beta * C + uniform
    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);  // D = alpha * Accum + X

    // Convert floats back to INT
    FragmentAccumulator scaled_accumulator;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kCount; ++i) {
      scaled_accumulator[i] = static_cast<int>(intermediate[i]);
    }

    // Convert to destination numeric type
    NumericArrayConverter<ElementOutput, int, kCount, Round> destination_converter;

    return destination_converter(scaled_accumulator);
  }
};

#endif // Conditional guards to enable partial specialization for packed integers

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace thread
} // namespace epilogue
} // namespace cutlass
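
Usage note (editorial, not part of the original header): a CUTLASS GEMM kernel normally instantiates this functor as its epilogue output op and applies it to each fragment, but the class can also be exercised directly. Below is a minimal host-side sketch assuming the header sits at its usual path under cutlass/epilogue/thread/ and CUTLASS is on the include path; the element types, fragment width, and alpha/beta values are illustrative only.

// Minimal sketch: apply LinearCombinationClamp directly to small fragments on the host.
#include <cstdio>
#include <cstdint>
#include "cutlass/epilogue/thread/linear_combination_clamp.h"

int main() {
  // int accumulators, float compute type, int8_t output, 4 elements per operation (illustrative)
  using OutputOp = cutlass::epilogue::thread::LinearCombinationClamp<int8_t, 4, int, float>;

  OutputOp::Params params(1.5f, 0.5f);   // alpha = 1.5, beta = 0.5 (illustrative values)
  OutputOp op(params);

  cutlass::Array<int, 4> accum;
  cutlass::Array<int8_t, 4> source;
  for (int i = 0; i < 4; ++i) {
    accum[i] = 100 * (i + 1);            // accumulator fragment
    source[i] = int8_t(10 * i);          // source (C) fragment
  }

  // D = clamp(alpha * accum + beta * source), converted to int8_t; large values saturate at 127
  cutlass::Array<int8_t, 4> d = op(accum, source);

  for (int i = 0; i < 4; ++i) {
    printf("D[%d] = %d\n", i, int(d[i]));
  }
  return 0;
}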