CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
mma_sm75.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Matrix multiply for SM75
*/
#pragma once

#include <assert.h>

#include "cutlass/arch/wmma.h"

#if defined(CUTLASS_ARCH_WMMA_ENABLED)
// CUDA Toolkit includes for nvcuda::wmma needed for binarized matrix multiply.
#include <mma.h>
#include "cutlass/wmma_array.h"
#endif

// CUTLASS includes
#include "cutlass/arch/mma.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/numeric_types.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))

#define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1

#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
#define CUTLASS_ARCH_MMA_SM75_ENABLED
#endif
#endif

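// Illustrative note (not part of the original header): CUTLASS_ARCH_MMA_SM75_SUPPORTED means the
// toolkit can compile these PTX instructions (CUDA 10.2 or newer), while CUTLASS_ARCH_MMA_SM75_ENABLED
// additionally requires the device compilation pass to target sm_75 or newer. A hypothetical caller
// might guard its own Turing-specific code the same way:
//
//   #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
//   // safe to name the cutlass::arch::Mma<...> SM75 specializations in host code
//   #endif
//
//   #if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
//   // safe to execute mma.sync.aligned.* PTX in device code
//   #endif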
/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace arch {

/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Matrix Multiply 1688 - FP16 accumulation
//
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation: F16 = F16 * F16 + F16
template <>
struct Mma<
  gemm::GemmShape<16, 8, 8>,
  32,
  half_t,
  layout::RowMajor,
  half_t,
  layout::ColumnMajor,
  half_t,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 8>;

  using ElementA = half_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<half_t, 4>;

  using ElementB = half_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<half_t, 2>;

  using ElementC = half_t;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<half_t, 4>;

  using Operator = OpMultiplyAdd;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
    unsigned const *C = reinterpret_cast<unsigned const *>(&c);
    unsigned *D = reinterpret_cast<unsigned *>(&d);

    asm volatile(
        "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 {%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};
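
// Illustrative usage sketch (not part of the original header): one warp cooperatively computes a
// 16x8x8 FP16 product. Fragment contents are distributed across the 32 threads of the warp per the
// PTX mma.m16n8k8 register mapping, so each fragment below holds only the calling thread's share.
// The kernel name is hypothetical and the loads of frag_a/frag_b are omitted.
#if 0
__global__ void example_mma_1688_f16() {
  using Mma1688 = cutlass::arch::Mma<
      cutlass::gemm::GemmShape<16, 8, 8>, 32,
      cutlass::half_t, cutlass::layout::RowMajor,
      cutlass::half_t, cutlass::layout::ColumnMajor,
      cutlass::half_t, cutlass::layout::RowMajor,
      cutlass::arch::OpMultiplyAdd>;

  Mma1688::FragmentA frag_a;  // 4 halves/thread: this thread's share of the 16x8 A tile
  Mma1688::FragmentB frag_b;  // 2 halves/thread: this thread's share of the 8x8 B tile
  Mma1688::FragmentC acc;     // 4 halves/thread: this thread's share of the 16x8 accumulator

  acc.clear();                // zero the accumulator fragment

  Mma1688 mma;
  mma(acc, frag_a, frag_b, acc);  // acc = A * B + acc, issued once per warp
}
#endif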

/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Matrix Multiply 1688 - FP32 accumulation
//
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation: F32 = F16 * F16 + F32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 8>,
  32,
  half_t,
  layout::RowMajor,
  half_t,
  layout::ColumnMajor,
  float,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 8>;

  using ElementA = half_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<half_t, 4>;

  using ElementB = half_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<half_t, 2>;

  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<float, 4>;

  using Operator = OpMultiplyAdd;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
                  FragmentC const &c) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const *A = reinterpret_cast<unsigned const *>(&a);
    unsigned const *B = reinterpret_cast<unsigned const *>(&b);
    float const *C = reinterpret_cast<float const *>(&c);
    float *D = reinterpret_cast<float *>(&d);

    asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
        : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
        :
          "r"(A[0]), "r"(A[1]),
          "r"(B[0]),
          "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
    );

#else
    assert(0);
#endif
  }
};
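
// Illustrative check (not part of the original header): the per-thread fragment sizes above, times
// the 32 threads of a warp, must cover each 16x8x8 operand tile exactly. These static_asserts are
// an added sanity sketch, not original code.
static_assert(32 * 4 == 16 * 8, "FragmentA: 4 halves/thread x 32 threads covers the 16x8 A tile");
static_assert(32 * 2 ==  8 * 8, "FragmentB: 2 halves/thread x 32 threads covers the 8x8 B tile");
static_assert(32 * 4 == 16 * 8, "FragmentC: 4 floats/thread x 32 threads covers the 16x8 C tile");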

/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Integer matrix multiply .8816 (8b)
//
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation: S32 = S8 * S8 + S32
template <>
struct Mma<
  gemm::GemmShape<8, 8, 16>,
  32,
  int8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8, 8, 16>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 4>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAdd;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U8 * S8 + S32
template <>
struct Mma<
  gemm::GemmShape<8, 8, 16>,
  32,
  uint8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8, 8, 16>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 4>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAdd;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.u8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = S8 * U8 + S32
template <>
struct Mma<
  gemm::GemmShape<8, 8, 16>,
  32,
  int8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8, 8, 16>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 4>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAdd;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.s8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U8 * U8 + S32
template <>
struct Mma<
  gemm::GemmShape<8, 8, 16>,
  32,
  uint8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8, 8, 16>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 4>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAdd;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.u8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};
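
// Illustrative scalar reference (not part of the original header): semantically, each of the 8x8
// s32 outputs of the m8n8k16 instruction is a 16-term dot product of 8-bit operands accumulated
// into 32 bits. A hypothetical host-side reference for one output element:
#if 0
inline int mma_8816_reference_element(int8_t const a_row[16], int8_t const b_col[16], int c) {
  int d = c;
  for (int k = 0; k < 16; ++k) {
    d += int(a_row[k]) * int(b_col[k]);  // widen to 32-bit before accumulating
  }
  return d;
}
#endif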

/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Integer matrix multiply (8b) with SATURATE
//
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation: S32 = S8 * S8 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,16>,
  32,
  int8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<8,8,16>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 4>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U8 * S8 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,16>,
  32,
  uint8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<8,8,16>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 4>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = S8 * U8 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,16>,
  32,
  int8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<8,8,16>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 4>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U8 * U8 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,16>,
  32,
  uint8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<8,8,16>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 4>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

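// Illustrative note (not part of the original header): the .satfinite qualifier in the variants
// above clamps each s32 result to [INT32_MIN, INT32_MAX] instead of letting it wrap on overflow.
// A hypothetical scalar sketch of the saturating accumulate, using 64-bit math to detect overflow
// before narrowing:
#if 0
#include <algorithm>
#include <cstdint>

inline int32_t saturating_accumulate(int64_t dot_product, int32_t c) {
  int64_t d = dot_product + int64_t(c);
  d = std::min<int64_t>(d, INT32_MAX);  // clamp overflow to the largest s32
  d = std::max<int64_t>(d, INT32_MIN);  // clamp underflow to the smallest s32
  return int32_t(d);
}
#endif
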
/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Integer matrix multiply (4b)
//
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation: S32 = S4 * S4 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,32>,
  32,
  int4b_t,
  layout::RowMajor,
  int4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8,8,32>;

  using ElementA = int4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int4b_t, 8>;

  using ElementB = int4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int4b_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAdd;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U4 * S4 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,32>,
  32,
  uint4b_t,
  layout::RowMajor,
  int4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8,8,32>;

  using ElementA = uint4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint4b_t, 8>;

  using ElementB = int4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int4b_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAdd;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.u4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = S4 * U4 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,32>,
  32,
  int4b_t,
  layout::RowMajor,
  uint4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8,8,32>;

  using ElementA = int4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int4b_t, 8>;

  using ElementB = uint4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint4b_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAdd;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U4 * U4 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,32>,
  32,
  uint4b_t,
  layout::RowMajor,
  uint4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<8,8,32>;

  using ElementA = uint4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint4b_t, 8>;

  using ElementB = uint4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint4b_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAdd;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k32.row.col.s32.u4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};
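
// Illustrative packing sketch (not part of the original header): FragmentA/FragmentB above hold
// eight 4-bit values per thread, which operator() reinterprets as one 32-bit register for the
// instruction. A hypothetical helper showing the nibble packing (element 0 in the least
// significant bits) that cutlass::Array of int4b_t implies:
#if 0
#include <cstdint>

inline uint32_t pack_eight_s4(int const (&v)[8]) {
  uint32_t packed = 0;
  for (int i = 0; i < 8; ++i) {
    packed |= (uint32_t(v[i]) & 0xF) << (4 * i);  // element i occupies bits [4i, 4i+3]
  }
  return packed;
}
#endif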

/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Integer matrix multiply (4b) - SATURATE
//
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation: S32 = S4 * S4 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,32>,
  32,
  int4b_t,
  layout::RowMajor,
  int4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<8,8,32>;

  using ElementA = int4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int4b_t, 8>;

  using ElementB = int4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int4b_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U4 * S4 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,32>,
  32,
  uint4b_t,
  layout::RowMajor,
  int4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<8,8,32>;

  using ElementA = uint4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint4b_t, 8>;

  using ElementB = int4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int4b_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = S4 * U4 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,32>,
  32,
  int4b_t,
  layout::RowMajor,
  uint4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<8,8,32>;

  using ElementA = int4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int4b_t, 8>;

  using ElementB = uint4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint4b_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U4 * U4 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,32>,
  32,
  uint4b_t,
  layout::RowMajor,
  uint4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<8,8,32>;

  using ElementA = uint4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint4b_t, 8>;

  using ElementB = uint4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint4b_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

    unsigned const & A = reinterpret_cast<unsigned const &>(a);
    unsigned const & B = reinterpret_cast<unsigned const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
        : "=r"(D[0]), "=r"(D[1])
        : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));

#else
    assert(0);
#endif
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////
//
// b1 ^ b1 + s32 => s32
//
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation: S32 = B1 ^ B1 + S32
template <>
struct Mma<
  gemm::GemmShape<8,8,128>,
  32,
  uint1b_t,
  layout::RowMajor,
  uint1b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpXorPopc> {

  using Shape = gemm::GemmShape<8,8,128>;

  using ElementA = uint1b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint1b_t, 32>;

  using ElementB = uint1b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint1b_t, 32>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpXorPopc;

  /// Computes multiply-add
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

#if defined(CUTLASS_ARCH_WMMA_ENABLED)
    using WmmaFragmentA = nvcuda::wmma::fragment<
        nvcuda::wmma::matrix_a,
        Shape::kM,
        Shape::kN,
        Shape::kK,
        nvcuda::wmma::experimental::precision::b1,
        nvcuda::wmma::row_major>;

    using WmmaFragmentB = nvcuda::wmma::fragment<
        nvcuda::wmma::matrix_b,
        Shape::kM,
        Shape::kN,
        Shape::kK,
        nvcuda::wmma::experimental::precision::b1,
        nvcuda::wmma::col_major>;

    using WmmaFragmentC = nvcuda::wmma::fragment<
        nvcuda::wmma::accumulator,
        Shape::kM,
        Shape::kN,
        Shape::kK,
        int>;

    WmmaFragmentA const & A = reinterpret_cast<WmmaFragmentA const &>(a);
    WmmaFragmentB const & B = reinterpret_cast<WmmaFragmentB const &>(b);

    WmmaFragmentC const & C = reinterpret_cast<WmmaFragmentC const &>(c);
    WmmaFragmentC & D = reinterpret_cast<WmmaFragmentC &>(d);

    nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR,
                            nvcuda::wmma::experimental::bmmaAccumulateOpPOPC);
#else

    assert(0); // WMMA must be supported to issue binary matrix multiply-accumulate instructions.

#endif // defined(CUTLASS_ARCH_WMMA_ENABLED)

#else
    assert(0);
#endif

  }
};
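
// Illustrative scalar reference (not part of the original header): each s32 output of the
// binarized m8n8k128 operation is d = popcount(a_row XOR b_col) + c over a 128-bit row/column
// pair. A hypothetical sketch of one output element using four 32-bit words per operand:
#if 0
__device__ int bmma_xor_popc_reference_element(unsigned const a_row[4],
                                               unsigned const b_col[4], int c) {
  int d = c;
  for (int w = 0; w < 4; ++w) {
    d += __popc(a_row[w] ^ b_col[w]);  // count mismatching bits in this 32-bit word
  }
  return d;
}
#endif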

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace arch
} // namespace cutlass