cutlass/gemm_2thread_2mma__sm61_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/tensor_ref.h"
 #include "cutlass/layout/matrix.h"
 #include "cutlass/gemm/gemm.h"
 #include "cutlass/gemm/thread/mma.h"


 namespace cutlass {
 namespace gemm {
 namespace thread {


 /// Thread-level matrix multiply-accumulate specialized for int8_t operands with int32_t
 /// accumulators, where A is row-major and B is column-major. This specialization is the one
 /// matched when the trailing template argument is `bool`.
 ///
 /// The product is assembled from 1-by-1-by-4 arch::Mma operations; the A and B fragments are
 /// reinterpreted as arrays of four-element int8_t groups for that purpose.
 template <
   /// Size of the Gemm problem - concept: gemm::GemmShape<>
   typename Shape_,
   /// Layout of C matrix (concept: layout::MapFunc)
   typename LayoutC_
 >
 struct Mma<
   Shape_,
   int8_t,
   layout::RowMajor,
   int8_t,
   layout::ColumnMajor,
   int32_t,
   LayoutC_,
   arch::OpMultiplyAdd,
   bool> {

   /// Size of the Gemm problem - concept: gemm::GemmShape<>
   using Shape = Shape_;

   /// Data type of operand A
   using ElementA = int8_t;

   /// Layout of operand A
   using LayoutA = layout::RowMajor;

   /// Data type of operand B
   using ElementB = int8_t;

   /// Layout of operand B
   using LayoutB = layout::ColumnMajor;

   /// Element type of operand C
   using ElementC = int32_t;

   /// Layout of C matrix (concept: layout::MapFunc)
   using LayoutC = LayoutC_;

   /// Underlying mathematical operator
   using Operator = arch::OpMultiplyAdd;

   /// A operand storage
   using FragmentA = Array<ElementA, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<ElementB, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<ElementC, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C
   ///
   /// \param D  destination accumulators; overwritten with C and then updated in place
   /// \param A  A operand fragment, read as groups of four int8_t values
   /// \param B  B operand fragment, read as groups of four int8_t values
   /// \param C  source accumulators
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     // 1-by-1-by-4 architecture-level multiply-add that performs every step of the product
     using ArchMma = arch::Mma<
       gemm::GemmShape<1,1,4>,
       1,
       ElementA,
       LayoutA,
       ElementB,
       LayoutB,
       ElementC,
       LayoutC,
       arch::OpMultiplyAdd>;

     // Operand group and single-element accumulator types handed to ArchMma
     using PackedOperand = Array<int8_t, 4>;
     using Accumulator = Array<int32_t, 1>;

     // Seed the destination with the source accumulators
     D = C;

     // View of D as a packed Shape::kM-by-Shape::kN matrix in LayoutC
     TensorRef<ElementC, LayoutC> accum_ref(
       reinterpret_cast<ElementC *>(&D), LayoutC::packed({ Shape::kM, Shape::kN }));

     // The fragments reinterpreted as arrays of four-element groups; neither pointer depends
     // on the loop indices below.
     PackedOperand const *packed_A = reinterpret_cast<PackedOperand const *>(&A);
     PackedOperand const *packed_B = reinterpret_cast<PackedOperand const *>(&B);

     ArchMma arch_mma;

     // Compute matrix product.
     // NOTE(review): the trip count and the group indices use truncating integer division by
     // ArchMma::Shape::kK - presumably Shape::kK is a multiple of it; confirm against callers.
     CUTLASS_PRAGMA_UNROLL
     for (int k_group = 0; k_group < Shape::kK / ArchMma::Shape::kK; ++k_group) {

       CUTLASS_PRAGMA_UNROLL
       for (int col = 0; col < Shape::kN; ++col) {

         CUTLASS_PRAGMA_UNROLL
         for (int row = 0; row < Shape::kM; ++row) {
           MatrixCoord coord(row, col);

           // Copy the current accumulator into a one-element array, as ArchMma expects
           Accumulator acc = reinterpret_cast<Accumulator &>(accum_ref.at(coord));

           // Groups are indexed by (row, k_group) for A and by (col, k_group) for B
           arch_mma(
             acc,
             packed_A[row * Shape::kK / ArchMma::Shape::kK + k_group],
             packed_B[col * Shape::kK / ArchMma::Shape::kK + k_group],
             acc);

           // Store the updated accumulator back into D
           accum_ref.at(coord) = reinterpret_cast<int32_t &>(acc);
         }
       }
     }
   }
 };

 /// Thread-level matrix multiply-accumulate specialized for int8_t operands with int32_t
 /// accumulators, where A is column-major and B is row-major. This specialization is the one
 /// matched when the trailing template argument is `int8_t`.
 ///
 /// The product is assembled from 1-by-1-by-4 arch::Mma operations; the A and B fragments are
 /// reinterpreted as arrays of four-element int8_t groups for that purpose.
 template <
   /// Size of the Gemm problem - concept: gemm::GemmShape<>
   typename Shape_,
   /// Layout of C matrix (concept: layout::MapFunc)
   typename LayoutC_
 >
 struct Mma<
   Shape_,
   int8_t,
   layout::ColumnMajor,
   int8_t,
   layout::RowMajor,
   int32_t,
   LayoutC_,
   arch::OpMultiplyAdd,
   int8_t> {

   /// Size of the Gemm problem - concept: gemm::GemmShape<>
   using Shape = Shape_;

   /// Data type of operand A
   using ElementA = int8_t;

   /// Layout of operand A
   using LayoutA = layout::ColumnMajor;

   /// Data type of operand B
   using ElementB = int8_t;

   /// Layout of operand B
   using LayoutB = layout::RowMajor;

   /// Element type of operand C
   using ElementC = int32_t;

   /// Layout of C matrix (concept: layout::MapFunc)
   using LayoutC = LayoutC_;

   /// Underlying mathematical operator
   using Operator = arch::OpMultiplyAdd;

   /// A operand storage
   using FragmentA = Array<ElementA, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<ElementB, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<ElementC, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C
   ///
   /// \param D  destination accumulators; overwritten with C and then updated in place
   /// \param A  A operand fragment, read as groups of four int8_t values
   /// \param B  B operand fragment, read as groups of four int8_t values
   /// \param C  source accumulators
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     // 1-by-1-by-4 architecture-level multiply-add that performs every step of the product
     using ArchMma = arch::Mma<
       gemm::GemmShape<1,1,4>,
       1,
       ElementA,
       LayoutA,
       ElementB,
       LayoutB,
       ElementC,
       LayoutC,
       arch::OpMultiplyAdd>;

     // Operand group and single-element accumulator types handed to ArchMma
     using PackedOperand = Array<int8_t, 4>;
     using Accumulator = Array<int32_t, 1>;

     // Seed the destination with the source accumulators
     D = C;

     // View of D as a packed Shape::kM-by-Shape::kN matrix in LayoutC
     TensorRef<ElementC, LayoutC> accum_ref(
       reinterpret_cast<ElementC *>(&D), LayoutC::packed({ Shape::kM, Shape::kN }));

     // The fragments reinterpreted as arrays of four-element groups
     PackedOperand const *packed_A = reinterpret_cast<PackedOperand const *>(&A);
     PackedOperand const *packed_B = reinterpret_cast<PackedOperand const *>(&B);

     ArchMma arch_mma;

     // Compute matrix product.
     // NOTE(review): the trip count uses truncating integer division by ArchMma::Shape::kK -
     // presumably Shape::kK is a multiple of it; confirm against callers.
     CUTLASS_PRAGMA_UNROLL
     for (int k_group = 0; k_group < Shape::kK / ArchMma::Shape::kK; ++k_group) {

       CUTLASS_PRAGMA_UNROLL
       for (int col = 0; col < Shape::kN; ++col) {

         CUTLASS_PRAGMA_UNROLL
         for (int row = 0; row < Shape::kM; ++row) {
           MatrixCoord coord(row, col);

           // Copy the current accumulator into a one-element array, as ArchMma expects
           Accumulator acc = reinterpret_cast<Accumulator &>(accum_ref.at(coord));

           // Within one k_group, A groups are indexed by row and B groups by column;
           // successive k_groups are Shape::kM (A) and Shape::kN (B) groups apart.
           arch_mma(
             acc,
             packed_A[row + k_group * Shape::kM],
             packed_B[col + k_group * Shape::kN],
             acc);

           // Store the updated accumulator back into D
           accum_ref.at(coord) = reinterpret_cast<int32_t &>(acc);
         }
       }
     }
   }
 };

 } // namespace thread
 } // namespace gemm
 } // namespace cutlass

cutlass
Definition: aligned_buffer.h:35

tensor_ref.h
Defines a structure containing strides, bounds, and a pointer to tensor data.

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::RowMajor, int8_t, layout::ColumnMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, bool >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm61.h:102

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::ColumnMajor, int8_t, layout::RowMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, int8_t >::FragmentB
Array< ElementB, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm61.h:204

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::RowMajor, int8_t, layout::ColumnMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, bool >::Shape
Shape_ Shape
Size of the Gemm problem - concept: gemm::GemmShape<>
Definition: gemm/thread/mma_sm61.h:64

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::ColumnMajor, int8_t, layout::RowMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, int8_t >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm61.h:215

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::RowMajor, int8_t, layout::ColumnMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, bool >::Operator
arch::OpMultiplyAdd Operator
Underlying mathematical operator.
Definition: gemm/thread/mma_sm61.h:85

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::ColumnMajor, int8_t, layout::RowMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, int8_t >::FragmentC
Array< ElementC, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm61.h:207

gemm.h
Defines common types used for all GEMM-like operators.

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::ColumnMajor, int8_t, layout::RowMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, int8_t >::Operator
arch::OpMultiplyAdd Operator
Underlying mathematical operator.
Definition: gemm/thread/mma_sm61.h:198

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::RowMajor, int8_t, layout::ColumnMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, bool >::ElementB
int8_t ElementB
Data type of operand B.
Definition: gemm/thread/mma_sm61.h:73

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::RowMajor, int8_t, layout::ColumnMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, bool >::FragmentA
Array< ElementA, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm61.h:88

cutlass::layout::ColumnMajor
Mapping function for column-major matrices.
Definition: layout/matrix.h:142

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

cutlass::TensorRef< ElementC, LayoutC >

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::ColumnMajor, int8_t, layout::RowMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, int8_t >::ElementB
int8_t ElementB
Data type of operand B.
Definition: gemm/thread/mma_sm61.h:186

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

mma.h
Templates exposing architecture support for warp-level multiply-add operations.

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::ColumnMajor, int8_t, layout::RowMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, int8_t >::FragmentA
Array< ElementA, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm61.h:201

cutlass::gemm::GemmShape
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::RowMajor, int8_t, layout::ColumnMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, bool >::ElementA
int8_t ElementA
Data type of operand A.
Definition: gemm/thread/mma_sm61.h:67

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::ColumnMajor, int8_t, layout::RowMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, int8_t >::ElementC
int32_t ElementC
Element type of operand C.
Definition: gemm/thread/mma_sm61.h:192

cutlass::layout::RowMajor
Mapping function for row-major matrices.
Definition: layout/matrix.h:50

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::ColumnMajor, int8_t, layout::RowMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, int8_t >::LayoutC
LayoutC_ LayoutC
Layout of C matrix (concept: layout::MapFunc)
Definition: gemm/thread/mma_sm61.h:195

cutlass::TensorRef::at
CUTLASS_HOST_DEVICE Reference at(TensorCoord const &coord) const
Returns a reference to the element at a given Coord.
Definition: tensor_ref.h:307

cutlass::gemm::thread::Mma
Structure to compute the matrix product.
Definition: gemm/thread/mma.h:66

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::RowMajor, int8_t, layout::ColumnMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, bool >::LayoutC
LayoutC_ LayoutC
Layout of C matrix (concept: layout::MapFunc)
Definition: gemm/thread/mma_sm61.h:82

matrix.h
Defines layout functions used by TensorRef and derived classes.

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::RowMajor, int8_t, layout::ColumnMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, bool >::ElementC
int32_t ElementC
Element type of operand C.
Definition: gemm/thread/mma_sm61.h:79

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::RowMajor, int8_t, layout::ColumnMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, bool >::FragmentB
Array< ElementB, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm61.h:91

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::RowMajor, int8_t, layout::ColumnMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, bool >::FragmentC
Array< ElementC, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm61.h:94

cutlass::arch::Mma
Matrix multiply-add operation.
Definition: arch/mma.h:92

cutlass.h
Basic include for CUTLASS.

cutlass::MatrixCoord
Definition: matrix_coord.h:39

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::ColumnMajor, int8_t, layout::RowMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, int8_t >::Shape
Shape_ Shape
Size of the Gemm problem - concept: gemm::GemmShape<>
Definition: gemm/thread/mma_sm61.h:177

cutlass::gemm::thread::Mma< Shape_, int8_t, layout::ColumnMajor, int8_t, layout::RowMajor, int32_t, LayoutC_, arch::OpMultiplyAdd, int8_t >::ElementA
int8_t ElementA
Data type of operand A.
Definition: gemm/thread/mma_sm61.h:180