cutlass/linear__combination__relu_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/array.h"
 #include "cutlass/functional.h"
 #include "cutlass/numeric_conversion.h"


 namespace cutlass {
 namespace epilogue {
 namespace thread {


 /// Epilogue functor applying a linear combination followed by a lower clamp,
 /// element-wise over fragments of `Count` elements:
 ///
 ///   D = maximum(alpha * accumulator + beta * source, threshold)
 ///
 /// Template parameters:
 ///   ElementOutput_       element type of the source and returned fragments
 ///   Count                number of elements per fragment
 ///   ElementAccumulator_  element type of the accumulator fragment
 ///   ElementCompute_      element type in which the arithmetic is carried out
 ///   Round                rounding style handed to every NumericArrayConverter
 template <
   typename ElementOutput_,
   int Count,
   typename ElementAccumulator_ = ElementOutput_,
   typename ElementCompute_ = ElementOutput_,
   FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
 >
 class LinearCombinationRelu {
 public:

   using ElementOutput = ElementOutput_;
   using ElementAccumulator = ElementAccumulator_;
   using ElementCompute = ElementCompute_;

   /// Number of elements processed per call
   static int const kCount = Count;

   using FragmentOutput = Array<ElementOutput, kCount>;
   using FragmentAccumulator = Array<ElementAccumulator, kCount>;
   using ComputeFragment = Array<ElementCompute, kCount>;

   /// Rounding style used by the numeric converters
   static FloatRoundStyle const kRound = Round;

   /// Parameters structure: scalars given by value, or by pointer (a non-null
   /// pointer takes precedence when the functor is constructed).
   struct Params {

     ElementCompute alpha;               ///< scales accumulators
     ElementCompute beta;                ///< scales source tensor
     ElementCompute threshold;           ///< lower bound applied to the result
     ElementCompute const *alpha_ptr;    ///< if not null, alpha is loaded through this pointer
     ElementCompute const *beta_ptr;     ///< if not null, beta is loaded through this pointer

     //
     // Methods
     //

     /// Defaults: alpha = 1, beta = 0, threshold = 0, no pointers.
     CUTLASS_HOST_DEVICE
     Params():
       alpha(ElementCompute(1)),
       beta(ElementCompute(0)),
       threshold(ElementCompute(0)),
       alpha_ptr(nullptr),
       beta_ptr(nullptr) { }

     /// Scalars supplied by value; both pointers are left null.
     CUTLASS_HOST_DEVICE
     Params(
       ElementCompute alpha,
       ElementCompute beta,
       ElementCompute threshold  = ElementCompute(0)
     ):
       alpha(alpha),
       beta(beta),
       threshold(threshold),
       alpha_ptr(nullptr),
       beta_ptr(nullptr) { }

     /// Scalars supplied through pointers; the by-value alpha/beta are zeroed.
     CUTLASS_HOST_DEVICE
     Params(
       ElementCompute const *alpha_ptr,
       ElementCompute const *beta_ptr,
       ElementCompute threshold = ElementCompute(0)
     ):
       alpha(0),
       beta(0),
       threshold(threshold),
       alpha_ptr(alpha_ptr),
       beta_ptr(beta_ptr) { }
   };

 private:

   //
   // Data members
   //

   ElementCompute alpha_;
   ElementCompute beta_;
   ElementCompute threshold_;

 public:

   /// Constructs the function object; alpha and beta are read through their
   /// pointers when those are non-null, otherwise taken from the by-value fields.
   CUTLASS_HOST_DEVICE
   LinearCombinationRelu(Params const &params):
     alpha_(params.alpha_ptr ? *params.alpha_ptr : params.alpha),
     beta_(params.beta_ptr ? *params.beta_ptr : params.beta),
     threshold_(params.threshold) { }

   /// Returns true when beta is non-zero, i.e. the source fragment contributes.
   CUTLASS_HOST_DEVICE
   bool is_source_needed() const {
     ElementCompute const zero(0);
     return beta_ != zero;
   }

   /// Forces beta to one for any non-zero k-partition index.
   /// NOTE(review): presumably so that partial results from earlier partitions
   /// are accumulated during serial reduction - confirm against the epilogue.
   CUTLASS_HOST_DEVICE
   void set_k_partition(int k_partition) {
     if (!k_partition) {
       return;
     }
     beta_ = ElementCompute(1);
   }

   /// Computes D = maximum(alpha * accumulator + beta * source, threshold).
   ///
   /// `uniform` is accepted for interface compatibility but is not read by
   /// this implementation.
   CUTLASS_HOST_DEVICE
   FragmentOutput operator()(
     FragmentAccumulator const &accumulator,
     FragmentOutput const &source,
     ElementCompute uniform = ElementCompute(0)) const {

     // Bring both operands into the internal compute numeric type
     NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> to_compute_from_output;
     NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> to_compute_from_accum;

     ComputeFragment source_c = to_compute_from_output(source);
     ComputeFragment accum_c = to_compute_from_accum(accumulator);

     // Element-wise arithmetic operators
     multiplies<ComputeFragment> scale;
     multiply_add<ComputeFragment> scale_and_add;
     maximum<ComputeFragment> clamp_below;

     ComputeFragment scaled_source = scale(beta_, source_c);                        // X = beta * C
     ComputeFragment combined = scale_and_add(alpha_, accum_c, scaled_source);      // D = alpha * Accum + X
     ComputeFragment clamped = clamp_below(combined, threshold_);                   // D = maximum(D, threshold)

     // Convert to destination numeric type
     NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> to_output;

     return to_output(clamped);
   }
 };


 /// Partial specialization for integer accumulators scaled in single precision
 /// (ElementAccumulator = int, ElementCompute = float). Element-wise over
 /// fragments of `Count` elements it computes
 ///
 ///   D = maximum(alpha * accumulator + beta * source, threshold)
 ///
 /// in float, converts the result back to int, and then converts to
 /// ElementOutput_.
 ///
 /// Template parameters:
 ///   ElementOutput_  element type of the source and returned fragments
 ///   Count           number of elements per fragment
 ///   Round           rounding style handed to every NumericArrayConverter
 template <
   typename ElementOutput_,
   int Count,
   FloatRoundStyle Round
 >
 class LinearCombinationRelu<ElementOutput_, Count, int, float, Round> {
 public:

   using ElementOutput = ElementOutput_;
   using ElementAccumulator = int;
   using ElementCompute = float;

   /// Number of elements processed per call
   static int const kCount = Count;

   using FragmentOutput = Array<ElementOutput, kCount>;
   using FragmentAccumulator = Array<ElementAccumulator, kCount>;
   using ComputeFragment = Array<ElementCompute, kCount>;

   /// Rounding style used by the numeric converters
   static FloatRoundStyle const kRound = Round;

   /// Parameters structure: scalars given by value, or by pointer (a non-null
   /// pointer takes precedence when the functor is constructed).
   struct Params {

     ElementCompute alpha;               ///< scales accumulators
     ElementCompute beta;                ///< scales source tensor
     ElementCompute threshold;           ///< lower bound applied to the result
     ElementCompute const *alpha_ptr;    ///< if not null, alpha is loaded through this pointer
     ElementCompute const *beta_ptr;     ///< if not null, beta is loaded through this pointer

     //
     // Methods
     //

     /// Defaults: alpha = 1, beta = 0, threshold = 0, no pointers.
     CUTLASS_HOST_DEVICE
     Params():
       alpha(ElementCompute(1)),
       beta(ElementCompute(0)),
       threshold(ElementCompute(0)),
       alpha_ptr(nullptr),
       beta_ptr(nullptr) { }

     /// Scalars supplied by value; both pointers are left null.
     CUTLASS_HOST_DEVICE
     Params(
       ElementCompute alpha,
       ElementCompute beta,
       ElementCompute threshold  = ElementCompute(0)
     ):
       alpha(alpha),
       beta(beta),
       threshold(threshold),
       alpha_ptr(nullptr),
       beta_ptr(nullptr) { }

     /// Scalars supplied through pointers; the by-value alpha/beta are zeroed.
     CUTLASS_HOST_DEVICE
     Params(
       ElementCompute const *alpha_ptr,
       ElementCompute const *beta_ptr,
       ElementCompute threshold = ElementCompute(0)
     ):
       alpha(0),
       beta(0),
       threshold(threshold),
       alpha_ptr(alpha_ptr),
       beta_ptr(beta_ptr) { }
   };

 private:

   //
   // Data members
   //

   ElementCompute alpha_;
   ElementCompute beta_;
   ElementCompute threshold_;

 public:

   /// Constructs the function object; alpha and beta are read through their
   /// pointers when those are non-null, otherwise taken from the by-value fields.
   CUTLASS_HOST_DEVICE
   LinearCombinationRelu(Params const &params):
     alpha_(params.alpha_ptr ? *params.alpha_ptr : params.alpha),
     beta_(params.beta_ptr ? *params.beta_ptr : params.beta),
     threshold_(params.threshold) { }

   /// Returns true when beta is non-zero, i.e. the source fragment contributes.
   CUTLASS_HOST_DEVICE
   bool is_source_needed() const {
     return beta_ != ElementCompute(0);
   }

   /// Forces beta to one for any non-zero k-partition index.
   /// NOTE(review): presumably so that partial results from earlier partitions
   /// are accumulated during serial reduction - confirm against the epilogue.
   CUTLASS_HOST_DEVICE
   void set_k_partition(int k_partition) {
     if (k_partition) {
       beta_ = ElementCompute(1);
     }
   }

   /// Computes D = maximum(alpha * accumulator + beta * source, threshold) in
   /// float, then converts float -> int -> ElementOutput.
   ///
   /// `uniform` is accepted for interface compatibility but is not read by
   /// this implementation.
   CUTLASS_HOST_DEVICE
   FragmentOutput operator()(
     FragmentAccumulator const &accumulator,
     FragmentOutput const &source,
     ElementCompute uniform = ElementCompute(0)) const {

     // Bring both operands into the internal compute numeric type
     NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> to_compute_from_output;
     NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> to_compute_from_accum;

     ComputeFragment source_c = to_compute_from_output(source);
     ComputeFragment accum_c = to_compute_from_accum(accumulator);

     // Element-wise arithmetic operators
     multiplies<ComputeFragment> scale;
     multiply_add<ComputeFragment> scale_and_add;
     maximum<ComputeFragment> clamp_below;

     ComputeFragment scaled_source = scale(beta_, source_c);                        // X = beta * C
     ComputeFragment combined = scale_and_add(alpha_, accum_c, scaled_source);      // D = alpha * Accum + X

     // Clamp to threshold
     ComputeFragment clamped = clamp_below(combined, threshold_);

     // Convert back to the accumulator data type through the library converter
     // so that the Round policy governs this step as it does every other
     // conversion here; a bare static_cast<int> always truncates toward zero
     // regardless of Round.
     // NOTE(review): the effective rounding depends on the
     // NumericConverter<int, float, Round> provided by this CUTLASS version -
     // confirm it implements the requested style.
     NumericArrayConverter<ElementAccumulator, ElementCompute, kCount, Round> to_accumulator;

     FragmentAccumulator scaled_accumulator = to_accumulator(clamped);

     // Convert to destination numeric type and pack
     NumericArrayConverter<ElementOutput, ElementAccumulator, kCount, Round> to_output;

     return to_output(scaled_accumulator);
   }
 };


 } // namespace thread
 } // namespace epilogue
 } // namespace cutlass
cutlass::multiply_add
Fused multiply-add.
Definition: functional.h:92

cutlass::epilogue::thread::LinearCombinationRelu::operator()
CUTLASS_HOST_DEVICE FragmentOutput operator()(FragmentAccumulator const &accumulator, FragmentOutput const &source, ElementCompute uniform=ElementCompute(0)) const
Computes the thresholded linear combination: D = max(alpha * accumulator + beta * source, threshold).
Definition: linear_combination_relu.h:150

cutlass::epilogue::thread::LinearCombinationRelu::Params::Params
CUTLASS_HOST_DEVICE Params()
Definition: linear_combination_relu.h:87

cutlass
Definition: aligned_buffer.h:35

cutlass::epilogue::thread::LinearCombinationRelu::Params::Params
CUTLASS_HOST_DEVICE Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr, ElementCompute threshold=ElementCompute(0))
Definition: linear_combination_relu.h:104

cutlass::epilogue::thread::LinearCombinationRelu::Params::beta
ElementCompute beta
scales source tensor
Definition: linear_combination_relu.h:77

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::Params
CUTLASS_HOST_DEVICE Params(ElementCompute alpha, ElementCompute beta, ElementCompute threshold=ElementCompute(0))
Definition: linear_combination_relu.h:233

cutlass::epilogue::thread::LinearCombinationRelu::FragmentOutput
Array< ElementOutput, kCount > FragmentOutput
Definition: linear_combination_relu.h:67

cutlass::epilogue::thread::LinearCombinationRelu::FragmentAccumulator
Array< ElementAccumulator, kCount > FragmentAccumulator
Definition: linear_combination_relu.h:68

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::beta_ptr
ElementCompute const * beta_ptr
pointer to source scalar - if not null, loads it from memory
Definition: linear_combination_relu.h:218

cutlass::epilogue::thread::LinearCombinationRelu
Definition: linear_combination_relu.h:58

cutlass::maximum
Definition: functional.h:235

cutlass::epilogue::thread::LinearCombinationRelu::Params::beta_ptr
ElementCompute const * beta_ptr
pointer to source scalar - if not null, loads it from memory
Definition: linear_combination_relu.h:80

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::LinearCombinationRelu
CUTLASS_HOST_DEVICE LinearCombinationRelu(Params const &params)
Constructs the function object, possibly loading from pointers in host memory.
Definition: linear_combination_relu.h:265

cutlass::epilogue::thread::LinearCombinationRelu::ElementCompute
ElementCompute_ ElementCompute
Definition: linear_combination_relu.h:63

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::set_k_partition
CUTLASS_HOST_DEVICE void set_k_partition(int k_partition)
Functionally required for serial reduction in the epilogue.
Definition: linear_combination_relu.h:280

numeric_conversion.h
Boost-like numeric conversion operator for CUTLASS numeric types.

nullptr
#define nullptr
nullptr
Definition: platform.h:144

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::alpha
ElementCompute alpha
scales accumulators
Definition: linear_combination_relu.h:214

cutlass::epilogue::thread::LinearCombinationRelu::LinearCombinationRelu
CUTLASS_HOST_DEVICE LinearCombinationRelu(Params const &params)
Constructs the function object, possibly loading from pointers in host memory.
Definition: linear_combination_relu.h:127

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::beta
ElementCompute beta
scales source tensor
Definition: linear_combination_relu.h:215

cutlass::epilogue::thread::LinearCombinationRelu::Params::threshold
ElementCompute threshold
Relu threshold.
Definition: linear_combination_relu.h:78

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::FragmentOutput
Array< ElementOutput, kCount > FragmentOutput
Definition: linear_combination_relu.h:205

cutlass::epilogue::thread::LinearCombinationRelu::Params::Params
CUTLASS_HOST_DEVICE Params(ElementCompute alpha, ElementCompute beta, ElementCompute threshold=ElementCompute(0))
Definition: linear_combination_relu.h:95

cutlass::multiplies
Definition: functional.h:64

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

cutlass::epilogue::thread::LinearCombinationRelu::kRound
static FloatRoundStyle const kRound
Definition: linear_combination_relu.h:71

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::epilogue::thread::LinearCombinationRelu::ComputeFragment
Array< ElementCompute, kCount > ComputeFragment
Definition: linear_combination_relu.h:69

cutlass::epilogue::thread::LinearCombinationRelu::set_k_partition
CUTLASS_HOST_DEVICE void set_k_partition(int k_partition)
Functionally required for serial reduction in the epilogue.
Definition: linear_combination_relu.h:142

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::Params
CUTLASS_HOST_DEVICE Params()
Definition: linear_combination_relu.h:225

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::ComputeFragment
Array< ElementCompute, kCount > ComputeFragment
Definition: linear_combination_relu.h:207

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::ElementOutput
ElementOutput_ ElementOutput
Definition: linear_combination_relu.h:199

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::is_source_needed
CUTLASS_HOST_DEVICE bool is_source_needed() const
Returns true if source is needed.
Definition: linear_combination_relu.h:274

cutlass::FloatRoundStyle::round_to_nearest
round to nearest even

cutlass::FloatRoundStyle
FloatRoundStyle
Definition: numeric_conversion.h:43

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::alpha_ptr
ElementCompute const * alpha_ptr
pointer to accumulator scalar - if not null, loads it from memory
Definition: linear_combination_relu.h:217

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::Params
CUTLASS_HOST_DEVICE Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr, ElementCompute threshold=ElementCompute(0))
Definition: linear_combination_relu.h:242

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::threshold
ElementCompute threshold
Relu threshold.
Definition: linear_combination_relu.h:216

cutlass::NumericArrayConverter
Conversion operator for Array.
Definition: numeric_conversion.h:294

cutlass::epilogue::thread::LinearCombinationRelu::Params::alpha
ElementCompute alpha
scales accumulators
Definition: linear_combination_relu.h:76

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::ElementAccumulator
int ElementAccumulator
Definition: linear_combination_relu.h:200

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::ElementCompute
float ElementCompute
Definition: linear_combination_relu.h:201

cutlass::epilogue::thread::LinearCombinationRelu::ElementAccumulator
ElementAccumulator_ ElementAccumulator
Definition: linear_combination_relu.h:62

cutlass::epilogue::thread::LinearCombinationRelu::is_source_needed
CUTLASS_HOST_DEVICE bool is_source_needed() const
Returns true if source is needed.
Definition: linear_combination_relu.h:136

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::operator()
CUTLASS_HOST_DEVICE FragmentOutput operator()(FragmentAccumulator const &accumulator, FragmentOutput const &source, ElementCompute uniform=ElementCompute(0)) const
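Computes the thresholded linear combination: D = max(alpha * accumulator + beta * source, threshold), evaluated in float and converted back to the integer accumulator type before conversion to the output type.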
Definition: linear_combination_relu.h:288

cutlass::epilogue::thread::LinearCombinationRelu::kCount
static int const kCount
Definition: linear_combination_relu.h:65

cutlass.h
Basic include for CUTLASS.

cutlass::epilogue::thread::LinearCombinationRelu::Params::alpha_ptr
ElementCompute const * alpha_ptr
pointer to accumulator scalar - if not null, loads it from memory
Definition: linear_combination_relu.h:79

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::FragmentAccumulator
Array< ElementAccumulator, kCount > FragmentAccumulator
Definition: linear_combination_relu.h:206

functional.h
Define basic numeric operators with specializations for Array<T, N>. SIMD-ize where possible...

cutlass::epilogue::thread::LinearCombinationRelu::Params
Host-constructable parameters structure.
Definition: linear_combination_relu.h:74

cutlass::epilogue::thread::LinearCombinationRelu::ElementOutput
ElementOutput_ ElementOutput
Definition: linear_combination_relu.h:61