CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
mma_sm70.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Matrix multiply
*/
#pragma once

#include <assert.h>

#include "mma.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/numeric_types.h"

#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))
#define CUTLASS_ARCH_MMA_SM70_SUPPORTED
#endif

#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700))

#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))
#define CUTLASS_ARCH_MMA_SM70_ENABLED
#endif

#endif
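// CUTLASS_ARCH_MMA_SM70_SUPPORTED means the toolchain (CUDA 10.1 or newer) can emit the
// mma.sync.m8n8k4 instruction; CUTLASS_ARCH_MMA_SM70_ENABLED additionally requires that
// the current compilation pass targets sm_70 or newer, gating the inline PTX below.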

////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace arch {

////////////////////////////////////////////////////////////////////////////////
//
// Matrix multiply accumulate 884 - FP16 accumulation
//
////////////////////////////////////////////////////////////////////////////////
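// Each 8-by-8-by-4 specialization below is a quad-pair operation: the lane count of 8
// indicates that eight threads cooperate to issue one mma.sync. Per thread, A and B each
// contribute four half_t elements (two packed per 32-bit register) and the accumulator
// fragment holds eight elements.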
/// Matrix multiply-add operation: F16 = F16 * F16 + F16
template <>
struct Mma<
  gemm::GemmShape<8, 8, 4>,
  8,
  half_t,
  layout::ColumnMajor,
  half_t,
  layout::ColumnMajor,
  half_t,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8, 8, 4>;

  using ElementA = half_t;
  using LayoutA = layout::ColumnMajor;
  using FragmentA = Array<half_t, 4>;

  using ElementB = half_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<half_t, 4>;

  using ElementC = half_t;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<half_t, 8>;

  using Operator = OpMultiplyAdd;

  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) {

#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)

    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
    unsigned *D = reinterpret_cast<unsigned *>(&d);

    asm volatile("mma.sync.aligned.m8n8k4.col.col.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
    );

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: F16 = F16 * F16 + F16
template <>
struct Mma<
  gemm::GemmShape<8, 8, 4>,
  8,
  half_t,
  layout::ColumnMajor,
  half_t,
  layout::RowMajor,
  half_t,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8, 8, 4>;

  using ElementA = half_t;
  using LayoutA = layout::ColumnMajor;
  using FragmentA = Array<half_t, 4>;

  using ElementB = half_t;
  using LayoutB = layout::RowMajor;
  using FragmentB = Array<half_t, 4>;

  using ElementC = half_t;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<half_t, 8>;

  using Operator = OpMultiplyAdd;

  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) {

#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)

    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
    unsigned *D = reinterpret_cast<unsigned *>(&d);

    asm volatile("mma.sync.aligned.m8n8k4.col.row.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
    );

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: F16 = F16 * F16 + F16
template <>
struct Mma<
  gemm::GemmShape<8, 8, 4>,
  8,
  half_t,
  layout::RowMajor,
  half_t,
  layout::ColumnMajor,
  half_t,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8, 8, 4>;

  using ElementA = half_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<half_t, 4>;

  using ElementB = half_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<half_t, 4>;

  using ElementC = half_t;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<half_t, 8>;

  using Operator = OpMultiplyAdd;

  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) {

#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)

    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
    unsigned *D = reinterpret_cast<unsigned *>(&d);

    asm volatile("mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
    );

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: F16 = F16 * F16 + F16
template <>
struct Mma<
  gemm::GemmShape<8, 8, 4>,
  8,
  half_t,
  layout::RowMajor,
  half_t,
  layout::RowMajor,
  half_t,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8, 8, 4>;

  using ElementA = half_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<half_t, 4>;

  using ElementB = half_t;
  using LayoutB = layout::RowMajor;
  using FragmentB = Array<half_t, 4>;

  using ElementC = half_t;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<half_t, 8>;

  using Operator = OpMultiplyAdd;

  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) {

#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)

    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
    unsigned *D = reinterpret_cast<unsigned *>(&d);

    asm volatile("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
    );

#else
    assert(0);
#endif
  }
};

////////////////////////////////////////////////////////////////////////////////
//
// Matrix multiply accumulate 884 - FP32 accumulation
//
////////////////////////////////////////////////////////////////////////////////
/// Matrix multiply-add operation: F32 = F16 * F16 + F32
template <>
struct Mma<
  gemm::GemmShape<8, 8, 4>,
  8,
  half_t,
  layout::ColumnMajor,
  half_t,
  layout::ColumnMajor,
  float,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8, 8, 4>;

  using ElementA = half_t;
  using LayoutA = layout::ColumnMajor;
  using FragmentA = Array<half_t, 4>;

  using ElementB = half_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<half_t, 4>;

  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<float, 8>;

  using Operator = OpMultiplyAdd;

  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) {

#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)

    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
    float const *C = reinterpret_cast<float const *>(&c);
    float *D = reinterpret_cast<float *>(&d);

    asm volatile(
        "mma.sync.aligned.m8n8k4.col.col.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
        "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]),
          "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7])
        : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]),
          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
          "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7])
    );

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: F32 = F16 * F16 + F32
template <>
struct Mma<
  gemm::GemmShape<8, 8, 4>,
  8,
  half_t,
  layout::ColumnMajor,
  half_t,
  layout::RowMajor,
  float,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8, 8, 4>;

  using ElementA = half_t;
  using LayoutA = layout::ColumnMajor;
  using FragmentA = Array<half_t, 4>;

  using ElementB = half_t;
  using LayoutB = layout::RowMajor;
  using FragmentB = Array<half_t, 4>;

  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<float, 8>;

  using Operator = OpMultiplyAdd;

  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) {

#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)

    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
    float const *C = reinterpret_cast<float const *>(&c);
    float *D = reinterpret_cast<float *>(&d);

    asm volatile(
        "mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
        "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]),
          "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7])
        : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]),
          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
          "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7])
    );

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: F32 = F16 * F16 + F32
template <>
struct Mma<
  gemm::GemmShape<8, 8, 4>,
  8,
  half_t,
  layout::RowMajor,
  half_t,
  layout::ColumnMajor,
  float,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8, 8, 4>;

  using ElementA = half_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<half_t, 4>;

  using ElementB = half_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<half_t, 4>;

  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<float, 8>;

  using Operator = OpMultiplyAdd;

  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) {

#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)

    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
    float const *C = reinterpret_cast<float const *>(&c);
    float *D = reinterpret_cast<float *>(&d);

    asm volatile(
        "mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
        "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]),
          "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7])
        : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]),
          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
          "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7])
    );

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: F32 = F16 * F16 + F32
template <>
struct Mma<
  gemm::GemmShape<8, 8, 4>,
  8,
  half_t,
  layout::RowMajor,
  half_t,
  layout::RowMajor,
  float,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8, 8, 4>;

  using ElementA = half_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<half_t, 4>;

  using ElementB = half_t;
  using LayoutB = layout::RowMajor;
  using FragmentB = Array<half_t, 4>;

  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<float, 8>;

  using Operator = OpMultiplyAdd;

  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) {

#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)

    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
    float const *C = reinterpret_cast<float const *>(&c);
    float *D = reinterpret_cast<float *>(&d);

    asm volatile(
        "mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
        "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]),
          "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7])
        : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]),
          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
          "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7])
    );

#else
    assert(0);
#endif
  }
};

////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation specialized for the entire warp: inherits the
/// per-quad-pair 8x8x4 implementation above unchanged.
template <
  typename LayoutA,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename Operator
>
struct Mma<
  gemm::GemmShape<16, 16, 4>,
  32,
  half_t,
  LayoutA,
  half_t,
  LayoutB,
  ElementC,
  LayoutC,
  Operator
> :
  public Mma<
    gemm::GemmShape<8, 8, 4>,
    8,
    half_t,
    LayoutA,
    half_t,
    LayoutB,
    ElementC,
    LayoutC,
    Operator> {

  using Shape = gemm::GemmShape<16, 16, 4>;
};

////////////////////////////////////////////////////////////////////////////////

} // namespace arch
} // namespace cutlass
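Usage sketch (an illustrative addition, not part of the header): assuming the translation unit is compiled for sm_70 or newer so that CUTLASS_ARCH_MMA_SM70_ENABLED is defined, a kernel can invoke one of the specializations above as follows. The kernel name and fill values are hypothetical; a real kernel loads fragments from memory according to the Volta mma.sync thread mapping.

#include <cuda_runtime.h>

#include "cutlass/arch/mma_sm70.h"
#include "cutlass/array.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/layout/matrix.h"

// Hypothetical demo kernel: every quad-pair of the warp issues one 8x8x4
// FP32-accumulate mma.sync using the col.row specialization above.
__global__ void mma_sm70_demo() {
  using Mma = cutlass::arch::Mma<
      cutlass::gemm::GemmShape<8, 8, 4>,
      8,
      cutlass::half_t, cutlass::layout::ColumnMajor,   // A
      cutlass::half_t, cutlass::layout::RowMajor,      // B
      float, cutlass::layout::RowMajor,                // C/D
      cutlass::arch::OpMultiplyAdd>;

  Mma::FragmentA a;
  Mma::FragmentB b;
  Mma::FragmentC c;
  Mma::FragmentC d;

  a.fill(cutlass::half_t(1.0f));   // placeholder operand values
  b.fill(cutlass::half_t(1.0f));
  c.clear();                       // accumulate into zeros

  Mma mma;
  mma(d, a, b, c);                 // d = a * b + c for this thread's fragment slice
}

// Launch with at least one full warp, e.g.: mma_sm70_demo<<<1, 32>>>();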