cutlass/functional_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"

 #include "cutlass/complex.h"

 #include "cutlass/array.h"
 #include "cutlass/half.h"

 namespace cutlass {


 /// Binary addition functor built on top of T's `operator+=`.
 template <typename T>
 struct plus {
   /// Adds `rhs` into the by-value copy of `lhs` and returns that copy.
   CUTLASS_HOST_DEVICE
   auto operator()(T lhs, T const &rhs) const -> T {
     lhs += rhs;
     return lhs;
   }
 };

 /// Binary subtraction functor built on top of T's `operator-=`.
 template <typename T>
 struct minus {
   /// Subtracts `rhs` from the by-value copy of `lhs` and returns that copy.
   CUTLASS_HOST_DEVICE
   auto operator()(T lhs, T const &rhs) const -> T {
     lhs -= rhs;
     return lhs;
   }
 };

 /// Binary multiplication functor built on top of T's `operator*=`.
 template <typename T>
 struct multiplies {
   /// Scales the by-value copy of `lhs` by `rhs` and returns that copy.
   CUTLASS_HOST_DEVICE
   auto operator()(T lhs, T const &rhs) const -> T {
     lhs *= rhs;
     return lhs;
   }
 };

 /// Binary division functor built on top of T's `operator/=`.
 template <typename T>
 struct divides {
   /// Divides the by-value copy of `lhs` by `rhs` and returns that copy.
   CUTLASS_HOST_DEVICE
   auto operator()(T lhs, T const &rhs) const -> T {
     lhs /= rhs;
     return lhs;
   }
 };


 /// Unary negation functor.
 template <typename T>
 struct negate {
   /// Returns the result of applying unary minus to `lhs`.
   CUTLASS_HOST_DEVICE
   auto operator()(T lhs) const -> T {
     return -lhs;
   }
 };

 /// Multiply-add functor: evaluates `a * b + c` in the accumulator type C.
 ///
 /// B and C default to A, so `multiply_add<T>` operates on a single type.
 template <typename A, typename B = A, typename C = A>
 struct multiply_add {
   /// Converts both multiplicands to C, multiplies them and adds `c`.
   CUTLASS_HOST_DEVICE
   auto operator()(A const &a, B const &b, C const &c) const -> C {
     // Kept as one expression so the product and the sum are evaluated together.
     return C(a) * C(b) + c;
   }
 };

 /// XOR-add functor: combines `a` and `b` with bitwise XOR, then adds `c`.
 template <typename T>
 struct xor_add {
   /// Returns `(a ^ b) + c`.
   CUTLASS_HOST_DEVICE
   auto operator()(T const &a, T const &b, T const &c) const -> T {
     return (a ^ b) + c;
   }
 };

 //
 // Partial specialization for complex<T> to target four scalar fused multiply-adds.
 //

 /// Multiply-add for three complex operands: returns `a * b + c`.
 ///
 /// The complex product is expanded into four real multiply-accumulate statements.
 template <typename T>
 struct multiply_add<complex<T>, complex<T>, complex<T>> {
   CUTLASS_HOST_DEVICE
   complex<T> operator()(
     complex<T> const &a,
     complex<T> const &b,
     complex<T> const &c) const {

     // Real part: c.re + a.re * b.re - a.im * b.im
     T re = c.real();
     re += a.real() * b.real();
     re += -a.imag() * b.imag();

     // Imaginary part: c.im + a.re * b.im + a.im * b.re
     T im = c.imag();
     im += a.real() * b.imag();
     im += a.imag() * b.real();

     return complex<T>{re, im};
   }
 };

 /// Multiply-add for a complex `a`, a real scalar `b` and a complex accumulator `c`.
 ///
 /// Each component of `a` is scaled by `b` and accumulated onto the matching component of `c`.
 template <typename T>
 struct multiply_add<complex<T>, T, complex<T>> {
   CUTLASS_HOST_DEVICE
   complex<T> operator()(
     complex<T> const &a,
     T const &b,
     complex<T> const &c) const {

     // Real part: c.re + a.re * b
     T re = c.real();
     re += a.real() * b;

     // Imaginary part: c.im + a.im * b
     T im = c.imag();
     im += a.imag() * b;

     return complex<T>{re, im};
   }
 };

 /// Multiply-add for a real scalar `a`, a complex `b` and a complex accumulator `c`.
 ///
 /// Each component of `b` is scaled by `a` and accumulated onto the matching component of `c`.
 template <typename T>
 struct multiply_add<T, complex<T>, complex<T>> {
   CUTLASS_HOST_DEVICE
   complex<T> operator()(
     T const &a,
     complex<T> const &b,
     complex<T> const &c) const {

     // Real part: c.re + a * b.re
     T re = c.real();
     re += a * b.real();

     // Imaginary part: c.im + a * b.im
     T im = c.imag();
     im += a * b.imag();

     return complex<T>{re, im};
   }
 };

 //
 // Partial specializations for Array<T, N>
 //

 /// Element-wise addition for Array<T, N>; every element goes through plus<T>.
 template <typename T, int N>
 struct plus<Array<T, N>> {
   /// Array + array: element i of the result is `lhs[i] + rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {

     plus<T> add;
     Array<T, N> sum;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       sum[idx] = add(lhs[idx], rhs[idx]);
     }

     return sum;
   }

   /// Array + scalar: element i of the result is `lhs[i] + scalar`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {

     plus<T> add;
     Array<T, N> sum;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       sum[idx] = add(lhs[idx], scalar);
     }

     return sum;
   }

   /// Scalar + array: element i of the result is `scalar + rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {

     plus<T> add;
     Array<T, N> sum;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       sum[idx] = add(scalar, rhs[idx]);
     }

     return sum;
   }
 };


 /// Functor returning the larger of two values, compared with `operator<`.
 template <typename T>
 struct maximum {

   /// Returns `rhs` when `lhs < rhs`; otherwise (including ties) returns `lhs`.
   CUTLASS_HOST_DEVICE
   T operator()(T const &lhs, T const &rhs) const {
     if (lhs < rhs) {
       return rhs;
     }
     return lhs;
   }
 };

 /// Single-precision specialization of maximum that defers to `fmaxf`.
 template <>
 struct maximum<float> {
   /// Returns `fmaxf(lhs, rhs)`.
   CUTLASS_HOST_DEVICE
   auto operator()(float const &lhs, float const &rhs) const -> float {
     return fmaxf(lhs, rhs);
   }
 };

 /// Element-wise maximum for Array<T, N>; every element goes through maximum<T>.
 template <typename T, int N>
 struct maximum<Array<T, N>> {

   /// Array vs. array: element i of the result is the maximum of `lhs[i]` and `rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {

     maximum<T> pick_max;
     Array<T, N> out;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       out[idx] = pick_max(lhs[idx], rhs[idx]);
     }

     return out;
   }

   /// Array vs. scalar: element i of the result is the maximum of `lhs[i]` and `scalar`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {

     maximum<T> pick_max;
     Array<T, N> out;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       out[idx] = pick_max(lhs[idx], scalar);
     }

     return out;
   }

   /// Scalar vs. array: element i of the result is the maximum of `scalar` and `rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {

     maximum<T> pick_max;
     Array<T, N> out;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       out[idx] = pick_max(scalar, rhs[idx]);
     }

     return out;
   }
 };

 /// Functor returning the smaller of two values, compared with `operator<`.
 template <typename T>
 struct minimum {

   /// Returns `rhs` when `rhs < lhs`; otherwise (including ties) returns `lhs`.
   CUTLASS_HOST_DEVICE
   T operator()(T const &lhs, T const &rhs) const {
     if (rhs < lhs) {
       return rhs;
     }
     return lhs;
   }
 };

 /// Single-precision specialization of minimum that defers to `fminf`.
 template <>
 struct minimum<float> {
   /// Returns `fminf(lhs, rhs)`.
   CUTLASS_HOST_DEVICE
   auto operator()(float const &lhs, float const &rhs) const -> float {
     return fminf(lhs, rhs);
   }
 };

 /// Element-wise minimum for Array<T, N>; every element goes through minimum<T>.
 template <typename T, int N>
 struct minimum<Array<T, N>> {

   /// Scalar helper comparing with `operator<`: returns `rhs` when `rhs < lhs`, else `lhs`.
   /// The call operators below use a minimum<T> functor instead of this helper.
   CUTLASS_HOST_DEVICE
   static T scalar_op(T const &lhs, T const &rhs) {
     if (rhs < lhs) {
       return rhs;
     }
     return lhs;
   }

   /// Array vs. array: element i of the result is the minimum of `lhs[i]` and `rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {

     minimum<T> pick_min;
     Array<T, N> out;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       out[idx] = pick_min(lhs[idx], rhs[idx]);
     }

     return out;
   }

   /// Array vs. scalar: element i of the result is the minimum of `lhs[i]` and `scalar`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {

     minimum<T> pick_min;
     Array<T, N> out;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       out[idx] = pick_min(lhs[idx], scalar);
     }

     return out;
   }

   /// Scalar vs. array: element i of the result is the minimum of `scalar` and `rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {

     minimum<T> pick_min;
     Array<T, N> out;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       out[idx] = pick_min(scalar, rhs[idx]);
     }

     return out;
   }
 };

 /// Element-wise subtraction for Array<T, N>; every element goes through minus<T>.
 template <typename T, int N>
 struct minus<Array<T, N>> {

   /// Array - array: element i of the result is `lhs[i] - rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {

     minus<T> subtract;
     Array<T, N> diff;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       diff[idx] = subtract(lhs[idx], rhs[idx]);
     }

     return diff;
   }

   /// Array - scalar: element i of the result is `lhs[i] - scalar`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {

     minus<T> subtract;
     Array<T, N> diff;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       diff[idx] = subtract(lhs[idx], scalar);
     }

     return diff;
   }

   /// Scalar - array: element i of the result is `scalar - rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {

     minus<T> subtract;
     Array<T, N> diff;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       diff[idx] = subtract(scalar, rhs[idx]);
     }

     return diff;
   }
 };

 /// Element-wise multiplication for Array<T, N>; every element goes through multiplies<T>.
 template <typename T, int N>
 struct multiplies<Array<T, N>> {

   /// Array * array: element i of the result is `lhs[i] * rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {

     multiplies<T> multiply;
     Array<T, N> product;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       product[idx] = multiply(lhs[idx], rhs[idx]);
     }

     return product;
   }

   /// Array * scalar: element i of the result is `lhs[i] * scalar`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {

     multiplies<T> multiply;
     Array<T, N> product;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       product[idx] = multiply(lhs[idx], scalar);
     }

     return product;
   }

   /// Scalar * array: element i of the result is `scalar * rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {

     multiplies<T> multiply;
     Array<T, N> product;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       product[idx] = multiply(scalar, rhs[idx]);
     }

     return product;
   }
 };

 /// Element-wise division for Array<T, N>; every element goes through divides<T>.
 template <typename T, int N>
 struct divides<Array<T, N>> {

   /// Array / array: element i of the result is `lhs[i] / rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {

     divides<T> divide;
     Array<T, N> quotient;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       quotient[idx] = divide(lhs[idx], rhs[idx]);
     }

     return quotient;
   }

   /// Array / scalar: element i of the result is `lhs[i] / scalar`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {

     divides<T> divide;
     Array<T, N> quotient;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       quotient[idx] = divide(lhs[idx], scalar);
     }

     return quotient;
   }

   /// Scalar / array: element i of the result is `scalar / rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {

     divides<T> divide;
     Array<T, N> quotient;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       quotient[idx] = divide(scalar, rhs[idx]);
     }

     return quotient;
   }
 };


 /// Element-wise negation for Array<T, N>; every element goes through negate<T>.
 template <typename T, int N>
 struct negate<Array<T, N>> {

   /// Returns an array whose element i is the negation of `lhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &lhs) const {

     negate<T> flip_sign;
     Array<T, N> negated;

     CUTLASS_PRAGMA_UNROLL
     for (int idx = 0; idx < N; ++idx) {
       negated[idx] = flip_sign(lhs[idx]);
     }

     return negated;
   }
 };

 /// Element-wise multiply-add for Array<T, N>; every element goes through multiply_add<T>.
 ///
 /// Overloads accept a scalar in place of any one of the three operands, mirroring the
 /// overload set offered by the Array<half_t, N> specialization.
 template <typename T, int N>
 struct multiply_add<Array<T, N>, Array<T, N>, Array<T, N>> {

   /// Element i of the result is `a[i] * b[i] + c[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) const {

     Array<T, N> result;
     multiply_add<T> scalar_op;

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = scalar_op(a[i], b[i], c[i]);
     }

     return result;
   }

   /// Scalar second multiplicand: element i of the result is `a[i] * scalar + c[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &a, T const &scalar, Array<T, N> const &c) const {

     Array<T, N> result;
     multiply_add<T> scalar_op;

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = scalar_op(a[i], scalar, c[i]);
     }

     return result;
   }

   /// Scalar first multiplicand: element i of the result is `scalar * b[i] + c[i]`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(T const &scalar, Array<T, N> const &b, Array<T, N> const &c) const {

     Array<T, N> result;
     multiply_add<T> scalar_op;

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = scalar_op(scalar, b[i], c[i]);
     }

     return result;
   }

   /// Scalar addend: element i of the result is `a[i] * b[i] + scalar`.
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const &a, Array<T, N> const &b, T const &scalar) const {

     Array<T, N> result;
     multiply_add<T> scalar_op;

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = scalar_op(a[i], b[i], scalar);
     }

     return result;
   }
 };

 //
 // Partial specializations for Array<half_t, N> targeting SIMD instructions in device code.
 //

 /// Addition specialized for Array<half_t, N>.
 ///
 /// When compiled for device code with __CUDA_ARCH__ >= 530, elements are processed in pairs
 /// through __hadd2 and an odd trailing element is handled with scalar __hadd. In every other
 /// compilation the work falls back to an element-wise loop using half_t addition.
 ///
 /// NOTE(review): the paired path reinterprets the arrays as sequences of __half2, which
 /// presumably requires Array<half_t, N> storage to be aligned for __half2 -- confirm for odd N.
 template <int N>
 struct plus<Array<half_t, N>> {
   /// Array + array: element i of the result is `lhs[i] + rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     // View operands and result as N / 2 __half2 pairs.
     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
     __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hadd2(lhs_ptr[i], rhs_ptr[i]);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
       __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
       __half d_residual = __hadd(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one half_t addition per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = lhs[i] + rhs[i];
     }
     #endif

     return result;
   }

   /// Scalar + array: element i of the result is `lhs + rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     // Build a __half2 from the scalar once so it can be reused by every pair.
     __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
     __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hadd2(lhs_pair, rhs_ptr[i]);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
       __half d_residual = __hadd(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one half_t addition per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = lhs + rhs[i];
     }
     #endif

     return result;
   }

   /// Array + scalar: element i of the result is `lhs[i] + rhs`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
     // Build a __half2 from the scalar once so it can be reused by every pair.
     __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hadd2(lhs_ptr[i], rhs_pair);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
       __half d_residual = __hadd(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one half_t addition per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = lhs[i] + rhs;
     }
     #endif

     return result;
   }
 };

 /// Subtraction specialized for Array<half_t, N>.
 ///
 /// When compiled for device code with __CUDA_ARCH__ >= 530, elements are processed in pairs
 /// through __hsub2 and an odd trailing element is handled with scalar __hsub. In every other
 /// compilation the work falls back to an element-wise loop using half_t subtraction.
 ///
 /// NOTE(review): the paired path reinterprets the arrays as sequences of __half2, which
 /// presumably requires Array<half_t, N> storage to be aligned for __half2 -- confirm for odd N.
 template <int N>
 struct minus<Array<half_t, N>> {
   /// Array - array: element i of the result is `lhs[i] - rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     // View operands and result as N / 2 __half2 pairs.
     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
     __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hsub2(lhs_ptr[i], rhs_ptr[i]);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
       __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
       __half d_residual = __hsub(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one half_t subtraction per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = lhs[i] - rhs[i];
     }
     #endif

     return result;
   }

   /// Scalar - array: element i of the result is `lhs - rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     // Build a __half2 from the scalar once so it can be reused by every pair.
     __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
     __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hsub2(lhs_pair, rhs_ptr[i]);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
       __half d_residual = __hsub(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one half_t subtraction per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = lhs - rhs[i];
     }
     #endif

     return result;
   }

   /// Array - scalar: element i of the result is `lhs[i] - rhs`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
     // Build a __half2 from the scalar once so it can be reused by every pair.
     __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hsub2(lhs_ptr[i], rhs_pair);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
       __half d_residual = __hsub(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one half_t subtraction per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = lhs[i] - rhs;
     }
     #endif

     return result;
   }
 };

 /// Multiplication specialized for Array<half_t, N>.
 ///
 /// When compiled for device code with __CUDA_ARCH__ >= 530, elements are processed in pairs
 /// through __hmul2 and an odd trailing element is handled with scalar __hmul. In every other
 /// compilation the work falls back to an element-wise loop using half_t multiplication.
 ///
 /// NOTE(review): the paired path reinterprets the arrays as sequences of __half2, which
 /// presumably requires Array<half_t, N> storage to be aligned for __half2 -- confirm for odd N.
 template <int N>
 struct multiplies<Array<half_t, N>> {
   /// Array * array: element i of the result is `lhs[i] * rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     // View operands and result as N / 2 __half2 pairs.
     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
     __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hmul2(lhs_ptr[i], rhs_ptr[i]);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
       __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
       __half d_residual = __hmul(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one half_t multiplication per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = lhs[i] * rhs[i];
     }
     #endif

     return result;
   }

   /// Scalar * array: element i of the result is `lhs * rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     // Build a __half2 from the scalar once so it can be reused by every pair.
     __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
     __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hmul2(lhs_pair, rhs_ptr[i]);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);

       __half d_residual = __hmul(
         reinterpret_cast<__half const &>(lhs),
         b_residual_ptr[N - 1]);

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one half_t multiplication per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = lhs * rhs[i];
     }
     #endif

     return result;
   }

   /// Array * scalar: element i of the result is `lhs[i] * rhs`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
     // Build a __half2 from the scalar once so it can be reused by every pair.
     __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hmul2(lhs_ptr[i], rhs_pair);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);

       __half d_residual = __hmul(
         a_residual_ptr[N - 1],
         reinterpret_cast<__half const &>(rhs));

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one half_t multiplication per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = lhs[i] * rhs;
     }
     #endif

     return result;
   }
 };

 /// Division specialized for Array<half_t, N>.
 ///
 /// When compiled for device code with __CUDA_ARCH__ >= 530, elements are processed in pairs
 /// through __h2div and an odd trailing element is handled with scalar __hdiv. In every other
 /// compilation the work falls back to an element-wise loop using half_t division.
 ///
 /// NOTE(review): the paired path reinterprets the arrays as sequences of __half2, which
 /// presumably requires Array<half_t, N> storage to be aligned for __half2 -- confirm for odd N.
 template <int N>
 struct divides<Array<half_t, N>> {
   /// Array / array: element i of the result is `lhs[i] / rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     // View operands and result as N / 2 __half2 pairs.
     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
     __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __h2div(lhs_ptr[i], rhs_ptr[i]);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
       __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);

       __half d_residual = __hdiv(
         a_residual_ptr[N - 1],
         b_residual_ptr[N - 1]);

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one half_t division per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = lhs[i] / rhs[i];
     }
     #endif

     return result;
   }

   /// Scalar / array: element i of the result is `lhs / rhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     // Build a __half2 from the scalar once so it can be reused by every pair.
     __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
     __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __h2div(lhs_pair, rhs_ptr[i]);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);

       __half d_residual = __hdiv(
         reinterpret_cast<__half const &>(lhs),
         b_residual_ptr[N - 1]);

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one half_t division per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = lhs / rhs[i];
     }
     #endif

     return result;
   }

   /// Array / scalar: element i of the result is `lhs[i] / rhs`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
     // Build a __half2 from the scalar once so it can be reused by every pair.
     __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __h2div(lhs_ptr[i], rhs_pair);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);

       __half d_residual = __hdiv(
         a_residual_ptr[N - 1],
         reinterpret_cast<__half const &>(rhs));

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one half_t division per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = lhs[i] / rhs;
     }
     #endif

     return result;
   }
 };

 /// Negation specialized for Array<half_t, N>.
 ///
 /// When compiled for device code with __CUDA_ARCH__ >= 530, elements are negated in pairs
 /// through __hneg2 and an odd trailing element is handled with scalar __hneg. In every other
 /// compilation the work falls back to an element-wise loop using half_t unary minus.
 ///
 /// NOTE(review): the paired path reinterprets the arrays as sequences of __half2, which
 /// presumably requires Array<half_t, N> storage to be aligned for __half2 -- confirm for odd N.
 template <int N>
 struct negate<Array<half_t, N>> {
   /// Returns an array whose element i is `-lhs[i]`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(Array<half_t, N> const & lhs) const {
     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     // View source and result as N / 2 __half2 pairs.
     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     __half2 const *source_ptr = reinterpret_cast<__half2 const *>(&lhs);

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hneg2(source_ptr[i]);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {
       half_t x = lhs[N - 1];
       // Call the intrinsic directly, as the other residual paths in this file do, so this
       // does not depend on the overloaded unary minus for __half being available.
       __half lhs_val = __hneg(reinterpret_cast<__half const &>(x));
       result[N - 1] = reinterpret_cast<half_t const &>(lhs_val);
     }

     #else

     // Fallback: one half_t negation per element.
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = -lhs[i];
     }
     #endif

     return result;
   }
 };

 /// Multiply-add specialized for Array<half_t, N>.
 ///
 /// When compiled for device code with __CUDA_ARCH__ >= 530, elements are processed in pairs
 /// through __hfma2 and an odd trailing element is handled with scalar __hfma. In every other
 /// compilation the work falls back to an element-wise loop using multiply_add<half_t>.
 /// Overloads accept a scalar in place of any one of the three operands.
 ///
 /// NOTE(review): the paired path reinterprets the arrays as sequences of __half2, which
 /// presumably requires Array<half_t, N> storage to be aligned for __half2 -- confirm for odd N.
 template <int N>
 struct multiply_add<Array<half_t, N>, Array<half_t, N>, Array<half_t, N>> {

   /// All-array form: element i of the result is `a[i] * b[i] + c[i]`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(
     Array<half_t, N> const &a,
     Array<half_t, N> const &b,
     Array<half_t, N> const &c) const {

     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     // View operands and result as N / 2 __half2 pairs.
     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
     __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
     __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hfma2(a_ptr[i], b_ptr[i], c_ptr[i]);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {

       __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
       __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
       __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);

       __half d_residual = __hfma(
         a_residual_ptr[N - 1],
         b_residual_ptr[N - 1],
         c_residual_ptr[N - 1]);

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one multiply_add<half_t> call per element.
     multiply_add<half_t> op;

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = op(a[i], b[i], c[i]);
     }
     #endif

     return result;
   }

   /// Scalar first multiplicand: element i of the result is `a * b[i] + c[i]`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(
     half_t const &a,
     Array<half_t, N> const &b,
     Array<half_t, N> const &c) const {

     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     // Build a __half2 from the scalar once so it can be reused by every pair.
     __half2 a_pair = __half2half2(reinterpret_cast<__half const &>(a));
     __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
     __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hfma2(a_pair, b_ptr[i], c_ptr[i]);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {

       __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
       __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
       __half d_residual = __hfma(
         reinterpret_cast<__half const &>(a),
         b_residual_ptr[N - 1],
         c_residual_ptr[N - 1]);

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one multiply_add<half_t> call per element.
     multiply_add<half_t> op;

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = op(a, b[i], c[i]);
     }
     #endif

     return result;
   }

   /// Scalar second multiplicand: element i of the result is `a[i] * b + c[i]`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(
     Array<half_t, N> const &a,
     half_t const &b,
     Array<half_t, N> const &c) const {

     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
     // Build a __half2 from the scalar once so it can be reused by every pair.
     __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b));
     __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hfma2(a_ptr[i], b_pair, c_ptr[i]);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {

       __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
       __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);

       __half d_residual = __hfma(
         a_residual_ptr[N - 1],
         reinterpret_cast<__half const &>(b),
         c_residual_ptr[N - 1]);

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one multiply_add<half_t> call per element.
     multiply_add<half_t> op;

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = op(a[i], b, c[i]);
     }
     #endif

     return result;
   }

   /// Scalar addend: element i of the result is `a[i] * b[i] + c`.
   CUTLASS_HOST_DEVICE
   Array<half_t, N> operator()(
     Array<half_t, N> const &a,
     Array<half_t, N> const &b,
     half_t const &c) const {

     Array<half_t, N> result;
     #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

     __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
     __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
     __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
     // Build a __half2 from the scalar once so it can be reused by every pair.
     __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c));

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N / 2; ++i) {
       result_ptr[i] = __hfma2(a_ptr[i], b_ptr[i], c_pair);
     }

     // Odd N: the last element is not covered by the pair loop above.
     if (N % 2) {

       __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
       __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);

       __half d_residual = __hfma(
         a_residual_ptr[N - 1],
         b_residual_ptr[N - 1],
         reinterpret_cast<__half const &>(c));

       result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
     }

     #else

     // Fallback: one multiply_add<half_t> call per element.
     multiply_add<half_t> op;

     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
       result[i] = op(a[i], b[i], c);
     }
     #endif

     return result;
   }
 };


 } // namespace cutlass
cutlass::multiply_add
Fused multiply-add.
Definition: functional.h:92

cutlass::minimum< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(T const &scalar, Array< T, N > const &rhs) const
Definition: functional.h:351

cutlass::plus< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(Array< half_t, N > const &lhs, Array< half_t, N > const &rhs) const
Definition: functional.h:578

cutlass
Definition: aligned_buffer.h:35

cutlass::xor_add::operator()
CUTLASS_HOST_DEVICE T operator()(T const &a, T const &b, T const &c) const
Definition: functional.h:103

cutlass::divides< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(Array< half_t, N > const &lhs, half_t const &rhs) const
Definition: functional.h:955

complex.h

cutlass::minus< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs, T const &scalar) const
Definition: functional.h:383

cutlass::imag
CUTLASS_HOST_DEVICE float const & imag(cuFloatComplex const &z)
Returns the imaginary part of the complex number.
Definition: complex.h:72

cutlass::minus< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(Array< half_t, N > const &lhs, Array< half_t, N > const &rhs) const
Definition: functional.h:678

cutlass::plus::operator()
CUTLASS_HOST_DEVICE T operator()(T lhs, T const &rhs) const
Definition: functional.h:48

cutlass::maximum< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs, T const &scalar) const
Definition: functional.h:269

cutlass::minimum< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs, Array< T, N > const &rhs) const
Definition: functional.h:323

cutlass::plus< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(half_t const &lhs, Array< half_t, N > const &rhs) const
Definition: functional.h:611

half.h
Defines a class for using IEEE half-precision floating-point types in host or device code...

cutlass::divides< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(half_t const &lhs, Array< half_t, N > const &rhs) const
Definition: functional.h:920

cutlass::multiplies< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(Array< half_t, N > const &lhs, Array< half_t, N > const &rhs) const
Definition: functional.h:778

cutlass::plus< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs, Array< T, N > const &rhs) const
Definition: functional.h:191

cutlass::minimum
Definition: functional.h:298

cutlass::maximum< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs, Array< T, N > const &rhs) const
Definition: functional.h:255

cutlass::maximum
Definition: functional.h:235

cutlass::half_t
IEEE half-precision floating-point type.
Definition: half.h:126

cutlass::negate::operator()
CUTLASS_HOST_DEVICE T operator()(T lhs) const
Definition: functional.h:85

cutlass::multiplies< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(half_t const &lhs, Array< half_t, N > const &rhs) const
Definition: functional.h:811

cutlass::plus< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs, T const &scalar) const
Definition: functional.h:205

cutlass::real
CUTLASS_HOST_DEVICE float const & real(cuFloatComplex const &z)
Returns the real part of the complex number.
Definition: complex.h:56

cutlass::minus< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(Array< half_t, N > const &lhs, half_t const &rhs) const
Definition: functional.h:743

cutlass::minimum< float >::operator()
CUTLASS_HOST_DEVICE float operator()(float const &lhs, float const &rhs) const
Definition: functional.h:309

cutlass::minimum< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs, T const &scalar) const
Definition: functional.h:337

cutlass::complex::imag
CUTLASS_HOST_DEVICE T const & imag() const
Accesses the imaginary part of the complex number.
Definition: complex.h:240

cutlass::multiply_add::operator()
CUTLASS_HOST_DEVICE C operator()(A const &a, B const &b, C const &c) const
Definition: functional.h:94

cutlass::multiply_add< Array< half_t, N >, Array< half_t, N >, Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(Array< half_t, N > const &a, Array< half_t, N > const &b, half_t const &c) const
Definition: functional.h:1163

cutlass::plus
Definition: functional.h:46

cutlass::minus< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs, Array< T, N > const &rhs) const
Definition: functional.h:369

cutlass::maximum::operator()
CUTLASS_HOST_DEVICE T operator()(T const &lhs, T const &rhs) const
Definition: functional.h:238

cutlass::multiplies< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs, Array< T, N > const &rhs) const
Definition: functional.h:415

cutlass::minus< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(T const &scalar, Array< T, N > const &rhs) const
Definition: functional.h:397

cutlass::multiplies::operator()
CUTLASS_HOST_DEVICE T operator()(T lhs, T const &rhs) const
Definition: functional.h:66

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

cutlass::maximum< float >::operator()
CUTLASS_HOST_DEVICE float operator()(float const &lhs, float const &rhs) const
Definition: functional.h:246

cutlass::negate
Definition: functional.h:83

cutlass::divides::operator()
CUTLASS_HOST_DEVICE T operator()(T lhs, T const &rhs) const
Definition: functional.h:75

cutlass::multiply_add< Array< T, N >, Array< T, N >, Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &a, T const &scalar, Array< T, N > const &c) const
Definition: functional.h:541

cutlass::multiply_add< Array< T, N >, Array< T, N >, Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &a, Array< T, N > const &b, Array< T, N > const &c) const
Definition: functional.h:527

cutlass::multiplies< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(T const &scalar, Array< T, N > const &rhs) const
Definition: functional.h:443

cutlass::multiplies
Definition: functional.h:64

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::multiplies< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs, T const &scalar) const
Definition: functional.h:429

cutlass::divides< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(T const &scalar, Array< T, N > const &rhs) const
Definition: functional.h:489

cutlass::divides< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(Array< half_t, N > const &lhs, Array< half_t, N > const &rhs) const
Definition: functional.h:884

cutlass::divides
Definition: functional.h:73

cutlass::complex::real
CUTLASS_HOST_DEVICE T const & real() const
Accesses the real part of the complex number.
Definition: complex.h:232

cutlass::plus< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(Array< half_t, N > const &lhs, half_t const &rhs) const
Definition: functional.h:643

cutlass::complex
Definition: complex.h:92

cutlass::minimum::operator()
CUTLASS_HOST_DEVICE T operator()(T const &lhs, T const &rhs) const
Definition: functional.h:301

cutlass::minimum< Array< T, N > >::scalar_op
static CUTLASS_HOST_DEVICE T scalar_op(T const &lhs, T const &rhs)
Definition: functional.h:318

cutlass::multiply_add< T, complex< T >, complex< T > >::operator()
CUTLASS_HOST_DEVICE complex< T > operator()(T const &a, complex< T > const &b, complex< T > const &c) const
Definition: functional.h:164

cutlass::multiply_add< Array< half_t, N >, Array< half_t, N >, Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(half_t const &a, Array< half_t, N > const &b, Array< half_t, N > const &c) const
Definition: functional.h:1074

cutlass::multiply_add< complex< T >, T, complex< T > >::operator()
CUTLASS_HOST_DEVICE complex< T > operator()(complex< T > const &a, T const &b, complex< T > const &c) const
Definition: functional.h:142

cutlass::negate< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs) const
Definition: functional.h:508

cutlass::xor_add
Fused XOR-add.
Definition: functional.h:101

cutlass::maximum< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(T const &scalar, Array< T, N > const &rhs) const
Definition: functional.h:283

cutlass::divides< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs, T const &scalar) const
Definition: functional.h:475

cutlass::minus::operator()
CUTLASS_HOST_DEVICE T operator()(T lhs, T const &rhs) const
Definition: functional.h:57

cutlass::multiply_add< Array< half_t, N >, Array< half_t, N >, Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(Array< half_t, N > const &a, half_t const &b, Array< half_t, N > const &c) const
Definition: functional.h:1118

cutlass::plus< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(T const &scalar, Array< T, N > const &rhs) const
Definition: functional.h:219

cutlass::minus< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(half_t const &lhs, Array< half_t, N > const &rhs) const
Definition: functional.h:711

cutlass::multiply_add< Array< T, N >, Array< T, N >, Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(T const &scalar, Array< T, N > const &b, Array< T, N > const &c) const
Definition: functional.h:555

cutlass::divides< Array< T, N > >::operator()
CUTLASS_HOST_DEVICE Array< T, N > operator()(Array< T, N > const &lhs, Array< T, N > const &rhs) const
Definition: functional.h:461

cutlass::negate< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(Array< half_t, N > const &lhs) const
Definition: functional.h:993

cutlass::minus
Definition: functional.h:55

cutlass::multiply_add< complex< T >, complex< T >, complex< T > >::operator()
CUTLASS_HOST_DEVICE complex< T > operator()(complex< T > const &a, complex< T > const &b, complex< T > const &c) const
Definition: functional.h:118

cutlass::multiplies< Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(Array< half_t, N > const &lhs, half_t const &rhs) const
Definition: functional.h:846

cutlass.h
Basic include for CUTLASS.

cutlass::multiply_add< Array< half_t, N >, Array< half_t, N >, Array< half_t, N > >::operator()
CUTLASS_HOST_DEVICE Array< half_t, N > operator()(Array< half_t, N > const &a, Array< half_t, N > const &b, Array< half_t, N > const &c) const
Definition: functional.h:1028