cutlass/gemm_2thread_2mma__sm60_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/tensor_ref.h"
 #include "cutlass/layout/matrix.h"
 #include "cutlass/gemm/gemm.h"
 #include "cutlass/gemm/thread/mma.h"
 #include "cutlass/functional.h"
 #include "cutlass/reduction/thread/reduce.h"


 namespace cutlass {
 namespace gemm {
 namespace thread {


 namespace detail {

 /// Thread-level half-precision matrix multiply-accumulate built on packed half_t pairs.
 /// Only declared here; every usable variant is a partial specialization on the operand
 /// layouts and on the trailing boolean selector.
 template <
   typename Shape,    ///< Problem size; specializations read kM, kN, kK, kMK, kKN and kMN from it
   typename LayoutA,  ///< Layout of the A operand
   typename LayoutB,  ///< Layout of the B operand
   typename LayoutC,  ///< Layout of the C and D accumulators
   bool               ///< true selects the outer-product kernels, false the inner-product kernels
 >
 struct Mma_HFMA2;


 // Specialization for NNN  //

 template <typename Shape>
 struct Mma_HFMA2<
   Shape,
   layout::ColumnMajor,
   layout::ColumnMajor,
   layout::ColumnMajor,
   true> {

   static_assert(
     (Shape::kM % 2) == 0,
     "Mma_HFMA2 requires the M dimension to be divisible by 2."
   );

   /// A operand storage
   using FragmentA = Array<half_t, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<half_t, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<half_t, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     // Two half_t values that are adjacent along M; one 2x1x1 multiply-add updates both.
     using HalfPair = Array<half_t, 2>;

     using Mma = arch::Mma<
       gemm::GemmShape<2,1,1>,
       1,
       half_t, layout::ColumnMajor,
       half_t, layout::ColumnMajor,
       half_t, layout::ColumnMajor,
       arch::OpMultiplyAdd>;

     // Start from C and accumulate the products into D in place.
     D = C;

     HalfPair *pairs_D = reinterpret_cast<HalfPair *>(&D);
     HalfPair const *pairs_A = reinterpret_cast<HalfPair const *>(&A);
     Array<half_t, 1> const *elems_B = reinterpret_cast<Array<half_t, 1> const *>(&B);

     Mma mma;

     CUTLASS_PRAGMA_UNROLL
     for (int k = 0; k < Shape::kK / Mma::Shape::kK; ++k) {

       CUTLASS_PRAGMA_UNROLL
       for (int m = 0; m < Shape::kM / Mma::Shape::kM; ++m) {

         CUTLASS_PRAGMA_UNROLL
         for (int n = 0; n < Shape::kN / Mma::Shape::kN; ++n) {

           // Column n of D holds kM/2 pairs; m picks the pair inside that column.
           int const d_idx = n * Shape::kM / 2 + m;

           HalfPair accum = pairs_D[d_idx];

           mma(
             accum,
             pairs_A[k * Shape::kM / 2 + m],
             elems_B[n * Shape::kK + k],
             accum);

           pairs_D[d_idx] = accum;
         }
       }
     }
   }
 };

 // Specialization for NNT  //

 template <typename Shape>
 struct Mma_HFMA2<
   Shape,
   layout::ColumnMajor,
   layout::ColumnMajor,
   layout::RowMajor,
   true
   > {

   static_assert(
     !(Shape::kN % 2),
     "Mma_HFMA2 requires the N dimension to be divisible by 2."
   );

   /// A operand storage
   using FragmentA = Array<half_t, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<half_t, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<half_t, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     // Start from C and accumulate the products into D in place.
     D = C;

     using Mma = arch::Mma<
       gemm::GemmShape<1,2,1>,
       1,
       half_t,
       layout::ColumnMajor,
       half_t,
       layout::ColumnMajor,
       half_t,
       layout::RowMajor,
       arch::OpMultiplyAdd>;

     // D is row-major, so it is walked as pairs of elements adjacent along N.
     Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
     Array<half_t, 1> const *ptr_A = reinterpret_cast<Array<half_t, 1> const *>(&A);

     Mma mma;

     CUTLASS_PRAGMA_UNROLL
     for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){

         CUTLASS_PRAGMA_UNROLL
         for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){

           // B is column-major, so the two values of row k in columns 2n and 2n+1 are
           // kK elements apart and must be gathered. They are read through the full
           // fragment (not through a 2-element view, which would be indexed out of
           // bounds) and gathered once per (k, n) because they do not depend on m.
           Array<half_t, 2> tmp_B;
           tmp_B[0] = B.at(2*n*Shape::kK + k);
           tmp_B[1] = B.at((2*n+1)*Shape::kK + k);

           CUTLASS_PRAGMA_UNROLL
           for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){

             Array<half_t, 2> tmp = ptr_D[m*Shape::kN/2 + n];

             mma(
                 tmp,
                 ptr_A[k*Shape::kM + m],
                 tmp_B,
                 tmp);

             ptr_D[m*Shape::kN/2 + n] = tmp;
         }
       }
     }
   }
 };


 // Specialization for NTN  //

 template <typename Shape>
 struct Mma_HFMA2<
   Shape,
   layout::ColumnMajor,
   layout::RowMajor,
   layout::ColumnMajor,
   true> {

   static_assert(
     (Shape::kM % 2) == 0,
     "Mma_HFMA2 requires the GEMM M dimension to be divisible by 2."
   );

   /// A operand storage
   using FragmentA = Array<half_t, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<half_t, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<half_t, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     // Two half_t values that are adjacent along M; one 2x1x1 multiply-add updates both.
     using HalfPair = Array<half_t, 2>;

     using Mma = arch::Mma<
       gemm::GemmShape<2,1,1>,
       1,
       half_t, layout::ColumnMajor,
       half_t, layout::RowMajor,
       half_t, layout::ColumnMajor,
       arch::OpMultiplyAdd>;

     // Start from C and accumulate the products into D in place.
     D = C;

     HalfPair *pairs_D = reinterpret_cast<HalfPair *>(&D);
     HalfPair const *pairs_A = reinterpret_cast<HalfPair const *>(&A);
     Array<half_t, 1> const *elems_B = reinterpret_cast<Array<half_t, 1> const *>(&B);

     Mma mma;

     CUTLASS_PRAGMA_UNROLL
     for (auto k_idx = 0; k_idx < Shape::kK / Mma::Shape::kK; k_idx++) {

       CUTLASS_PRAGMA_UNROLL
       for (auto m_idx = 0; m_idx < Shape::kM / Mma::Shape::kM; m_idx++) {

         CUTLASS_PRAGMA_UNROLL
         for (auto n_idx = 0; n_idx < Shape::kN / Mma::Shape::kN; n_idx++) {

           // Column n of D holds kM/2 pairs; m picks the pair inside that column.
           int const d_idx = m_idx + n_idx * Shape::kM / 2;

           HalfPair accum = pairs_D[d_idx];

           mma(
             accum,
             pairs_A[m_idx + k_idx * Shape::kM / 2],
             elems_B[k_idx * Shape::kN + n_idx],   // B is row-major: row k, column n
             accum);

           pairs_D[d_idx] = accum;
         }
       }
     }
   }
 };

 // Specialization for NTT  //

 template <typename Shape>
 struct Mma_HFMA2<
   Shape,
   layout::ColumnMajor,
   layout::RowMajor,
   layout::RowMajor,
   true> {

   static_assert(
     (Shape::kN % 2) == 0,
     "Mma_HFMA2 requires the N dimension to be divisible by 2."
   );

   /// A operand storage
   using FragmentA = Array<half_t, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<half_t, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<half_t, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     // Two half_t values that are adjacent along N; one 1x2x1 multiply-add updates both.
     using HalfPair = Array<half_t, 2>;

     using Mma = arch::Mma<
       gemm::GemmShape<1,2,1>,
       1,
       half_t, layout::ColumnMajor,
       half_t, layout::RowMajor,
       half_t, layout::RowMajor,
       arch::OpMultiplyAdd>;

     // Start from C and accumulate the products into D in place.
     D = C;

     HalfPair *pairs_D = reinterpret_cast<HalfPair *>(&D);
     Array<half_t, 1> const *elems_A = reinterpret_cast<Array<half_t, 1> const *>(&A);
     HalfPair const *pairs_B = reinterpret_cast<HalfPair const *>(&B);

     Mma mma;

     CUTLASS_PRAGMA_UNROLL
     for (int k = 0; k < Shape::kK / Mma::Shape::kK; ++k) {

       CUTLASS_PRAGMA_UNROLL
       for (int n = 0; n < Shape::kN / Mma::Shape::kN; ++n) {

         CUTLASS_PRAGMA_UNROLL
         for (int m = 0; m < Shape::kM / Mma::Shape::kM; ++m) {

           // Row m of D holds kN/2 pairs; n picks the pair inside that row.
           int const d_idx = m * Shape::kN / 2 + n;

           HalfPair accum = pairs_D[d_idx];

           mma(
             accum,
             elems_A[k * Shape::kM + m],
             pairs_B[k * Shape::kN / 2 + n],   // B is row-major, so the pair is contiguous
             accum);

           pairs_D[d_idx] = accum;
         }
       }
     }
   }
 };


 // Specialization for TNN  //

 template <typename Shape>
 struct Mma_HFMA2 <
   Shape,
   layout::RowMajor,
   layout::ColumnMajor,
   layout::ColumnMajor,
   true
   > {

   static_assert(
     !(Shape::kM % 2),
     "Mma_HFMA2 requires the M dimension to be divisible by 2."
   );

   /// A operand storage
   using FragmentA = Array<half_t, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<half_t, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<half_t, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     // Start from C and accumulate the products into D in place.
     D = C;

     using Mma = arch::Mma<
       gemm::GemmShape<2,1,1>,
       1,
       half_t,
       layout::RowMajor,
       half_t,
       layout::ColumnMajor,
       half_t,
       layout::ColumnMajor,
       arch::OpMultiplyAdd>;

     // D is column-major, so it is walked as pairs of elements adjacent along M.
     Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
     Array<half_t, 1> const *ptr_B = reinterpret_cast<Array<half_t, 1> const *>(&B);

     Mma mma;

     CUTLASS_PRAGMA_UNROLL
     for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){

       CUTLASS_PRAGMA_UNROLL
       for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){

         // A is row-major, so the two values of column k in rows 2m and 2m+1 are
         // kK elements apart and must be gathered. They are read through the full
         // fragment (not through a 2-element view, which would be indexed out of
         // bounds) and gathered once per (k, m) because they do not depend on n.
         Array<half_t, 2> tmp_A;
         tmp_A[0] = A.at(2*m*Shape::kK + k);
         tmp_A[1] = A.at((2*m+1)*Shape::kK + k);

         CUTLASS_PRAGMA_UNROLL
         for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){

             Array<half_t, 2> tmp = ptr_D[n*Shape::kM/2 + m];

             mma(
                 tmp,
                 tmp_A,
                 ptr_B[n*Shape::kK + k],
                 tmp);

             ptr_D[n*Shape::kM/2 + m] = tmp;
         }
       }
     }
   }
 };

 // Specialization for TNT  //

 template <typename Shape>
 struct Mma_HFMA2 <
   Shape,
   layout::RowMajor,
   layout::ColumnMajor,
   layout::RowMajor,
   true
   > {

   static_assert(
     !(Shape::kN % 2),
     "Mma_HFMA2 requires the N dimension to be divisible by 2."
   );

   /// A operand storage
   using FragmentA = Array<half_t, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<half_t, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<half_t, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     // Start from C and accumulate the products into D in place.
     D = C;

     using Mma = arch::Mma<
       gemm::GemmShape<1,2,1>,
       1,
       half_t,
       layout::RowMajor,
       half_t,
       layout::ColumnMajor,
       half_t,
       layout::RowMajor,
       arch::OpMultiplyAdd>;

     // D is row-major, so it is walked as pairs of elements adjacent along N.
     Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
     Array<half_t, 1> const *ptr_A = reinterpret_cast<Array<half_t, 1> const *>(&A);

     Mma mma;

     CUTLASS_PRAGMA_UNROLL
     for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){

         CUTLASS_PRAGMA_UNROLL
         for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){

           // B is column-major, so the two values of row k in columns 2n and 2n+1 are
           // kK elements apart and must be gathered. They are read through the full
           // fragment (not through a 2-element view, which would be indexed out of
           // bounds) and gathered once per (k, n) because they do not depend on m.
           Array<half_t, 2> tmp_B;
           tmp_B[0] = B.at(2*n*Shape::kK + k);
           tmp_B[1] = B.at((2*n+1)*Shape::kK + k);

           CUTLASS_PRAGMA_UNROLL
           for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){

             Array<half_t, 2> tmp = ptr_D[m*Shape::kN/2 + n];

             mma(
                 tmp,
                 ptr_A[m*Shape::kK + k],
                 tmp_B,
                 tmp);

             ptr_D[m*Shape::kN/2 + n] = tmp;
         }
       }
     }
   }
 };

 // Specialization for TTN  //

 template <typename Shape>
 struct Mma_HFMA2 <
   Shape,
   layout::RowMajor,
   layout::RowMajor,
   layout::ColumnMajor,
   true
   > {

   static_assert(
     !(Shape::kM % 2),
     "Mma_HFMA2 requires the M dimension to be divisible by 2."
   );

   /// A operand storage
   using FragmentA = Array<half_t, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<half_t, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<half_t, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     // Start from C and accumulate the products into D in place.
     D = C;

     using Mma = arch::Mma<
       gemm::GemmShape<2,1,1>,
       1,
       half_t,
       layout::RowMajor,
       half_t,
       layout::RowMajor,
       half_t,
       layout::ColumnMajor,
       arch::OpMultiplyAdd>;

     // D is column-major, so it is walked as pairs of elements adjacent along M.
     Array<half_t, 2> *ptr_D = reinterpret_cast<Array<half_t, 2> *>(&D);
     Array<half_t, 1> const *ptr_B = reinterpret_cast<Array<half_t, 1> const *>(&B);

     Mma mma;

     CUTLASS_PRAGMA_UNROLL
     for(auto k=0; k <  Shape::kK / Mma::Shape::kK; k++){

       CUTLASS_PRAGMA_UNROLL
       for(auto m=0; m < Shape::kM / Mma::Shape::kM; m++){

         // A is row-major, so the two values of column k in rows 2m and 2m+1 are
         // kK elements apart and must be gathered. They are read through the full
         // fragment (not through a 2-element view, which would be indexed out of
         // bounds) and gathered once per (k, m) because they do not depend on n.
         Array<half_t, 2> tmp_A;
         tmp_A[0] = A.at(2*m*Shape::kK + k);
         tmp_A[1] = A.at((2*m+1)*Shape::kK + k);

         CUTLASS_PRAGMA_UNROLL
         for(auto n=0; n < Shape::kN / Mma::Shape::kN; n++){

             Array<half_t, 2> tmp = ptr_D[n*Shape::kM/2 + m];

             mma(
                 tmp,
                 tmp_A,
                 ptr_B[k*Shape::kN + n],
                 tmp);

             ptr_D[n*Shape::kM/2 + m] = tmp;
         }
       }
     }
   }
 };


 // Specialization for TTT  //

 template <typename Shape>
 struct Mma_HFMA2<
   Shape,
   layout::RowMajor,
   layout::RowMajor,
   layout::RowMajor,
   true> {

   static_assert(
     (Shape::kN % 2) == 0,
     "Mma_HFMA2 requires the N dimension to be divisible by 2."
   );

   /// A operand storage
   using FragmentA = Array<half_t, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<half_t, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<half_t, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     // Two half_t values that are adjacent along N; one 1x2x1 multiply-add updates both.
     using HalfPair = Array<half_t, 2>;

     using Mma = arch::Mma<
       gemm::GemmShape<1,2,1>,
       1,
       half_t, layout::RowMajor,
       half_t, layout::RowMajor,
       half_t, layout::RowMajor,
       arch::OpMultiplyAdd>;

     // Start from C and accumulate the products into D in place.
     D = C;

     HalfPair *pairs_D = reinterpret_cast<HalfPair *>(&D);
     Array<half_t, 1> const *elems_A = reinterpret_cast<Array<half_t, 1> const *>(&A);
     HalfPair const *pairs_B = reinterpret_cast<HalfPair const *>(&B);

     Mma mma;

     CUTLASS_PRAGMA_UNROLL
     for (int k = 0; k < Shape::kK / Mma::Shape::kK; ++k) {

       CUTLASS_PRAGMA_UNROLL
       for (int n = 0; n < Shape::kN / Mma::Shape::kN; ++n) {

         CUTLASS_PRAGMA_UNROLL
         for (int m = 0; m < Shape::kM / Mma::Shape::kM; ++m) {

           // Row m of D holds kN/2 pairs; n picks the pair inside that row.
           int const d_idx = m * Shape::kN / 2 + n;

           HalfPair accum = pairs_D[d_idx];

           mma(
             accum,
             elems_A[m * Shape::kK + k],       // A is row-major: row m, column k
             pairs_B[k * Shape::kN / 2 + n],   // B is row-major, so the pair is contiguous
             accum);

           pairs_D[d_idx] = accum;
         }
       }
     }
   }
 };

 // Specialization for TNT + Inner Product  or 1x1x2K + LayoutC = T //

 template <typename Shape, typename LayoutA, typename LayoutB>
 struct Mma_HFMA2<
   Shape,
   LayoutA,
   LayoutB,
   layout::RowMajor,
   false
   > {

   static_assert(
     !(Shape::kK % 2),
     "Mma_HFMA2 requires the K dimension to be divisible by 2."
   );

   /// A operand storage
   using FragmentA = Array<half_t, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<half_t, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<half_t, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C, with C and D stored row-major.
   ///
   /// A is indexed as kM runs of kK contiguous values and B as kN runs of kK
   /// contiguous values; each output element is the dot product of one run of A
   /// with one run of B, added to the matching element of C.
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     // Start from C and update D in place, one output element at a time.
     D = C;

     using GemmShape = gemm::GemmShape<1,1,2>;

     Array<half_t, 1> *ptr_D = reinterpret_cast<Array<half_t, 1> *>(&D);
     Array<half_t, 2> const *ptr_A = reinterpret_cast<Array<half_t, 2> const *>(&A);
     Array<half_t, 2> const *ptr_B = reinterpret_cast<Array<half_t, 2> const *>(&B);

     // Inner product is calculated using MACs, followed by final reduction
     multiply_add<Array<half_t, 2>> mac;
     cutlass::reduction::thread::Reduce< plus<half_t>, Array<half_t, 2> > reduce;

     CUTLASS_PRAGMA_UNROLL
     for(auto n=0; n < Shape::kN / GemmShape::kN; n++){

       CUTLASS_PRAGMA_UNROLL
       for(auto m=0; m < Shape::kM / GemmShape::kM; m++){

         // Row-major offset of element (m, n). The accumulator must be read from
         // and written back to this same offset: reading the column-major offset
         // n*kM + m would seed the sum with a different element of C and could
         // pick up a value already overwritten earlier in this loop nest.
         int const idx_D = m*Shape::kN + n;

         // Lane 0 carries the incoming accumulator, lane 1 starts at zero.
         Array<half_t, 2> tmp_C;
         tmp_C.clear();
         Array<half_t, 1> *ptr_tmp_C = reinterpret_cast<Array<half_t, 1> *>(&tmp_C);
         ptr_tmp_C[0] = ptr_D[idx_D];

         CUTLASS_PRAGMA_UNROLL
         for(auto k=0; k <  Shape::kK / GemmShape::kK; k++){
           tmp_C = mac(ptr_A[m*Shape::kK/2 + k], ptr_B[n*Shape::kK/2 + k], tmp_C);
         }

         // Fold the two lanes into the single result for element (m, n).
         Array<half_t, 1> res;
         res = reduce(tmp_C);

         ptr_D[idx_D] = res;
       }
     }
   }
 };

 // Specialization for TNN + Inner Product  or 1x1x2K + LayoutC = N //

 template <typename Shape, typename LayoutA, typename LayoutB>
 struct Mma_HFMA2<
   Shape,
   LayoutA,
   LayoutB,
   layout::ColumnMajor,
   false> {

   static_assert(
     (Shape::kK % 2) == 0,
     "Mma_HFMA2 requires the K dimension to be divisible by 2."
   );

   /// A operand storage
   using FragmentA = Array<half_t, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<half_t, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<half_t, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C, with C and D stored column-major.
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     using GemmShape = gemm::GemmShape<1,1,2>;
     using HalfPair = Array<half_t, 2>;
     using HalfOne = Array<half_t, 1>;

     // Start from C and update D in place, one output element at a time.
     D = C;

     HalfOne *elems_D = reinterpret_cast<HalfOne *>(&D);
     HalfPair const *pairs_A = reinterpret_cast<HalfPair const *>(&A);
     HalfPair const *pairs_B = reinterpret_cast<HalfPair const *>(&B);

     // Each dot product is accumulated two values of K at a time, then the two
     // lanes are folded together by the reduction.
     multiply_add<HalfPair> mac;
     cutlass::reduction::thread::Reduce< plus<half_t>, HalfPair > reduce;

     CUTLASS_PRAGMA_UNROLL
     for (int n = 0; n < Shape::kN / GemmShape::kN; ++n) {

       CUTLASS_PRAGMA_UNROLL
       for (int m = 0; m < Shape::kM / GemmShape::kM; ++m) {

         // Column-major offset of element (m, n); used for both the read and the write.
         int const d_idx = n * Shape::kM + m;

         // Lane 0 carries the incoming accumulator, lane 1 starts at zero.
         HalfPair partial;
         partial.clear();
         reinterpret_cast<HalfOne *>(&partial)[0] = elems_D[d_idx];

         CUTLASS_PRAGMA_UNROLL
         for (int k = 0; k < Shape::kK / GemmShape::kK; ++k) {
           partial = mac(
             pairs_A[m * Shape::kK / 2 + k],
             pairs_B[n * Shape::kK / 2 + k],
             partial);
         }

         HalfOne sum;
         sum = reduce(partial);

         elems_D[d_idx] = sum;
       }
     }
   }
 };

 } // namespace detail


 template <
   typename Shape_, typename LayoutA, typename LayoutB, typename LayoutC
 >
 struct Mma<
   Shape_,
   half_t, LayoutA,
   half_t, LayoutB,
   half_t, LayoutC,
   arch::OpMultiplyAdd> {

   /// Size of the Gemm problem
   using Shape = Shape_;

   /// Data type of operand A
   using ElementA = half_t;

   /// Data type of operand B
   using ElementB = half_t;

   /// Element type of operand C
   using ElementC = half_t;

   /// Underlying mathematical operator
   using Operator = arch::OpMultiplyAdd;

   /// A operand storage
   using FragmentA = Array<ElementA, Shape::kMK>;

   /// B operand storage
   using FragmentB = Array<ElementB, Shape::kKN>;

   /// C operand storage
   using FragmentC = Array<ElementC, Shape::kMN>;

   //
   // Methods
   //

   /// Computes a matrix product D = A * B + C by forwarding to the kernel chosen
   /// at compile time from the shape and the operand layouts.
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     // Layout predicates
     constexpr bool kRowMajorA = platform::is_same< LayoutA, layout::RowMajor>::value;
     constexpr bool kColumnMajorB = platform::is_same< LayoutB, layout::ColumnMajor>::value;
     constexpr bool kRowMajorC = platform::is_same< LayoutC, layout::RowMajor>::value;
     constexpr bool kColumnMajorC = platform::is_same< LayoutC, layout::ColumnMajor>::value;

     // Evenness of each problem dimension
     constexpr bool kEvenM = (Shape::kM % 2) == 0;
     constexpr bool kEvenN = (Shape::kN % 2) == 0;
     constexpr bool kEvenK = (Shape::kK % 2) == 0;

     // Outer product: the dimension that is contiguous in C must be even.
     constexpr bool kOuterProduct =
       (kColumnMajorC && kEvenM) || (kRowMajorC && kEvenN);

     // Inner product: K must be even, and either A is row-major with B column-major
     // or the problem is a single 1x1 output element.
     constexpr bool kInnerProduct =
       kEvenK && ((kRowMajorA && kColumnMajorB) || (Shape::kM == 1 && Shape::kN == 1));

     // The outer product takes precedence when both apply, because its flag is
     // what gets passed as the Mma_HFMA2 selector.
     using OptimizedMma = detail::Mma_HFMA2<Shape, LayoutA, LayoutB, LayoutC, kOuterProduct>;

     // Fallback when neither optimized form applies.
     using GenericMma =
       MmaGeneric <Shape, ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, Operator>;

     using SelectedMma = typename platform::conditional<
       (kOuterProduct || kInnerProduct),
       OptimizedMma,
       GenericMma
     >::type;

     SelectedMma mma;

     mma(D, A, B, C);
   }
 };


 namespace detail {

   /// Compile-time switch for the row-major-C half_t Mma specialization:
   /// `value` is true only when both LayoutA and LayoutB are one of
   /// layout::RowMajor or layout::ColumnMajor.
   template <
     typename LayoutA,
     typename LayoutB>
   struct EnableMma_Crow_SM60 {

     /// True when both operand layouts are plain row-major or column-major
     static bool const kIsConventionalLayout =
       (platform::is_same<LayoutA, layout::RowMajor>::value ||
         platform::is_same<LayoutA, layout::ColumnMajor>::value) &&
       (platform::is_same<LayoutB, layout::RowMajor>::value ||
         platform::is_same<LayoutB, layout::ColumnMajor>::value);

     /// Result of the trait, in the form expected by platform::enable_if
     static bool const value = kIsConventionalLayout;
   };

 } // namespace detail


 template <typename Shape_, typename LayoutA_, typename LayoutB_>
 struct Mma<
   Shape_,
   half_t, LayoutA_,
   half_t, LayoutB_,
   half_t, layout::RowMajor,
   arch::OpMultiplyAdd,
   typename platform::enable_if<
     detail::EnableMma_Crow_SM60<LayoutA_, LayoutB_>::value
   >::type> {

   using Shape = Shape_;

   using ElementA = half_t;
   using ElementB = half_t;
   using ElementC = half_t;

   using LayoutA = LayoutA_;
   using LayoutB = LayoutB_;
   using LayoutC = layout::RowMajor;

   using Operator = arch::OpMultiplyAdd;

   /// The same problem with shape and operand layouts transposed, A and B swapped,
   /// and a column-major accumulator.
   using TransposeMma = Mma<
     GemmShapeTranspose<Shape>,
     half_t, typename layout::LayoutTranspose<LayoutB>::type,
     half_t, typename layout::LayoutTranspose<LayoutA>::type,
     half_t, layout::ColumnMajor,
     arch::OpMultiplyAdd,
     bool>;

   using FragmentA = Array<ElementA, Shape::kMK>;
   using FragmentB = Array<ElementB, Shape::kKN>;
   using FragmentC = Array<ElementC, Shape::kMN>;

   /// Computes D = A * B + C for row-major C by running the transposed problem,
   /// passing B where the transposed operator expects A and A where it expects B.
   CUTLASS_HOST_DEVICE
   void operator()(
     FragmentC & D,
     FragmentA const & A,
     FragmentB const & B,
     FragmentC const & C) {

     TransposeMma transposed_mma;

     transposed_mma(D, B, A, C);
   }
 };


 } // namespace thread
 } // namespace gemm
 } // namespace cutlass

cutlass::multiply_add
Fused multiply-add.
Definition: functional.h:92

cutlass::gemm::thread::detail::EnableMma_Crow_SM60
Determines whether to enable thread::Mma<> specializations compatible with SM60. ...
Definition: gemm/thread/mma_sm60.h:1030

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, LayoutA, LayoutB, layout::RowMajor, false >::FragmentC
Array< half_t, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm60.h:801

cutlass::gemm::GemmShape::kM
static int const kM
Definition: include/cutlass/gemm/gemm.h:58

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::RowMajor, layout::ColumnMajor, true >::FragmentC
Array< half_t, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm60.h:271

cutlass
Definition: aligned_buffer.h:35

constexpr
#define constexpr
Definition: platform.h:137

tensor_ref.h
Defines a structure containing strides, bounds, and a pointer to tensor data.

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::ColumnMajor, layout::ColumnMajor, true >::FragmentC
Array< half_t, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm60.h:94

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::ColumnMajor, layout::RowMajor, true >::FragmentA
Array< half_t, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm60.h:528

cutlass::platform::is_same
std::is_same (false specialization)
Definition: platform.h:394

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, LayoutA, LayoutB, layout::RowMajor, false >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm60.h:809

cutlass::gemm::thread::detail::Mma_HFMA2
Structure to compute the matrix product for HFMA.
Definition: gemm/thread/mma_sm60.h:66

cutlass::gemm::thread::Mma< Shape_, half_t, LayoutA_, half_t, LayoutB_, half_t, layout::RowMajor, arch::OpMultiplyAdd, typename platform::enable_if< detail::EnableMma_Crow_SM60< LayoutA_, LayoutB_ >::value >::type >::FragmentC
Array< ElementC, Shape::kMN > FragmentC
Definition: gemm/thread/mma_sm60.h:1087

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::ColumnMajor, layout::ColumnMajor, true >::FragmentB
Array< half_t, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm60.h:441

cutlass::half_t
IEEE half-precision floating-point type.
Definition: half.h:126

gemm.h
Defines common types used for all GEMM-like operators.

cutlass::gemm::thread::Mma< Shape_, half_t, LayoutA_, half_t, LayoutB_, half_t, layout::RowMajor, arch::OpMultiplyAdd, typename platform::enable_if< detail::EnableMma_Crow_SM60< LayoutA_, LayoutB_ >::value >::type >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Definition: gemm/thread/mma_sm60.h:1090

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::ColumnMajor, layout::ColumnMajor, true >::FragmentC
Array< half_t, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm60.h:444

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::ColumnMajor, layout::ColumnMajor, true >::FragmentA
Array< half_t, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm60.h:438

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::RowMajor, layout::RowMajor, true >::FragmentC
Array< half_t, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm60.h:357

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::ColumnMajor, layout::ColumnMajor, true >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm60.h:102

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::RowMajor, layout::RowMajor, true >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm60.h:723

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::RowMajor, layout::ColumnMajor, true >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm60.h:632

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::ColumnMajor, layout::RowMajor, true >::FragmentA
Array< half_t, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm60.h:174

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::RowMajor, layout::RowMajor, true >::FragmentB
Array< half_t, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm60.h:712

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::RowMajor, layout::ColumnMajor, true >::FragmentC
Array< half_t, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm60.h:624

cutlass::layout::ColumnMajor
Mapping function for column-major matrices.
Definition: layout/matrix.h:142

cutlass::gemm::thread::Mma< Shape_, half_t, LayoutA_, half_t, LayoutB_, half_t, layout::RowMajor, arch::OpMultiplyAdd, typename platform::enable_if< detail::EnableMma_Crow_SM60< LayoutA_, LayoutB_ >::value >::type >::Operator
arch::OpMultiplyAdd Operator
Definition: gemm/thread/mma_sm60.h:1072

cutlass::gemm::GemmShape::kK
static int const kK
Definition: include/cutlass/gemm/gemm.h:60

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::ColumnMajor, layout::RowMajor, true >::FragmentB
Array< half_t, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm60.h:177

cutlass::gemm::thread::Mma< Shape_, half_t, LayoutA, half_t, LayoutB, half_t, LayoutC, arch::OpMultiplyAdd >::FragmentB
Array< ElementB, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm60.h:975

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::ColumnMajor, layout::ColumnMajor, true >::FragmentB
Array< half_t, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm60.h:91

cutlass::gemm::thread::Mma< Shape_, half_t, LayoutA_, half_t, LayoutB_, half_t, layout::RowMajor, arch::OpMultiplyAdd, typename platform::enable_if< detail::EnableMma_Crow_SM60< LayoutA_, LayoutB_ >::value >::type >::FragmentB
Array< ElementB, Shape::kKN > FragmentB
Definition: gemm/thread/mma_sm60.h:1086

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::RowMajor, layout::RowMajor, true >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm60.h:365

cutlass::gemm::thread::Mma< Shape_, half_t, LayoutA, half_t, LayoutB, half_t, LayoutC, arch::OpMultiplyAdd >::Operator
arch::OpMultiplyAdd Operator
Underlying mathematical operator.
Definition: gemm/thread/mma_sm60.h:969

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::RowMajor, layout::RowMajor, true >::FragmentA
Array< half_t, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm60.h:709

cutlass::layout::LayoutTranspose
Defines transposes of matrix layouts.
Definition: layout/matrix.h:921

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::ColumnMajor, layout::RowMajor, true >::FragmentB
Array< half_t, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm60.h:531

cutlass::gemm::thread::MmaGeneric
Template that handles all packed matrix layouts.
Definition: gemm/thread/mma_sm50.h:65

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, LayoutA, LayoutB, layout::ColumnMajor, false >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm60.h:888

reduce.h
Defines basic thread level reduction with specializations for Array<T, N>.

cutlass::gemm::thread::Mma< Shape_, half_t, LayoutA, half_t, LayoutB, half_t, LayoutC, arch::OpMultiplyAdd >::FragmentC
Array< ElementC, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm60.h:978

cutlass::platform::enable_if
std::enable_if (true specialization)
Definition: platform.h:315

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::ColumnMajor, layout::RowMajor, true >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm60.h:188

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

mma.h
Templates exposing architecture support for warp-level multiply-add operations.

cutlass::gemm::GemmShape
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57

cutlass::platform::conditional
std::conditional (true specialization)
Definition: platform.h:325

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::RowMajor, layout::ColumnMajor, true >::FragmentA
Array< half_t, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm60.h:265

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::ColumnMajor, layout::ColumnMajor, true >::FragmentA
Array< half_t, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm60.h:88

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, LayoutA, LayoutB, layout::ColumnMajor, false >::FragmentC
Array< half_t, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm60.h:880

cutlass::gemm::thread::Mma< Shape_, half_t, LayoutA, half_t, LayoutB, half_t, LayoutC, arch::OpMultiplyAdd >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm60.h:986

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, LayoutA, LayoutB, layout::RowMajor, false >::FragmentA
Array< half_t, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm60.h:795

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::RowMajor, layout::RowMajor, true >::FragmentA
Array< half_t, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm60.h:351

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::ColumnMajor, layout::ColumnMajor, true >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm60.h:452

cutlass::layout::RowMajor
Mapping function for row-major matrices.
Definition: layout/matrix.h:50

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::RowMajor, layout::ColumnMajor, true >::FragmentB
Array< half_t, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm60.h:621

cutlass::gemm::thread::Mma< Shape_, half_t, LayoutA, half_t, LayoutB, half_t, LayoutC, arch::OpMultiplyAdd >::Shape
Shape_ Shape
Size of the Gemm problem - concept: gemm::GemmShape<>
Definition: gemm/thread/mma_sm60.h:957

cutlass::gemm::thread::Mma
Structure to compute the matrix product.
Definition: gemm/thread/mma.h:66

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, LayoutA, LayoutB, layout::ColumnMajor, false >::FragmentB
Array< half_t, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm60.h:877

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::RowMajor, layout::RowMajor, true >::FragmentC
Array< half_t, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm60.h:715

matrix.h
Defines layout functions used by TensorRef and derived classes.

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::ColumnMajor, layout::RowMajor, true >::FragmentC
Array< half_t, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm60.h:534

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::ColumnMajor, layout::RowMajor, true >::FragmentC
Array< half_t, Shape::kMN > FragmentC
C operand storage.
Definition: gemm/thread/mma_sm60.h:180

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, LayoutA, LayoutB, layout::RowMajor, false >::FragmentB
Array< half_t, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm60.h:798

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::RowMajor, layout::ColumnMajor, true >::FragmentA
Array< half_t, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm60.h:618

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::RowMajor, layout::ColumnMajor, true >::FragmentB
Array< half_t, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm60.h:268

cutlass::arch::Mma
Matrix multiply-add operation.
Definition: arch/mma.h:92

cutlass::gemm::thread::Mma< Shape_, half_t, LayoutA, half_t, LayoutB, half_t, LayoutC, arch::OpMultiplyAdd >::FragmentA
Array< ElementA, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm60.h:972

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::RowMajor, layout::RowMajor, true >::FragmentB
Array< half_t, Shape::kKN > FragmentB
B operand storage.
Definition: gemm/thread/mma_sm60.h:354

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::RowMajor, layout::ColumnMajor, layout::RowMajor, true >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm60.h:542

cutlass.h
Basic include for CUTLASS.

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, layout::ColumnMajor, layout::RowMajor, layout::ColumnMajor, true >::operator()
CUTLASS_HOST_DEVICE void operator()(FragmentC &D, FragmentA const &A, FragmentB const &B, FragmentC const &C)
Computes a matrix product D = A * B + C.
Definition: gemm/thread/mma_sm60.h:279

functional.h
Defines basic numeric operators with specializations for Array<T, N>. SIMD-ize where possible...

cutlass::gemm::thread::Mma< Shape_, half_t, LayoutA_, half_t, LayoutB_, half_t, layout::RowMajor, arch::OpMultiplyAdd, typename platform::enable_if< detail::EnableMma_Crow_SM60< LayoutA_, LayoutB_ >::value >::type >::FragmentA
Array< ElementA, Shape::kMK > FragmentA
Definition: gemm/thread/mma_sm60.h:1085

cutlass::reduction::thread::Reduce
Structure to compute the thread level reduction.
Definition: reduce.h:43

cutlass::arch::mac
CUTLASS_HOST_DEVICE Array< T, N > mac(Array< T, N > const &a, Array< T, N > const &b, Array< T, N > const &c)
Definition: simd.h:84

cutlass::gemm::thread::Mma< Shape_, half_t, LayoutA_, half_t, LayoutB_, half_t, layout::RowMajor, arch::OpMultiplyAdd, typename platform::enable_if< detail::EnableMma_Crow_SM60< LayoutA_, LayoutB_ >::value >::type >::Shape
Shape_ Shape
Definition: gemm/thread/mma_sm60.h:1065

cutlass::gemm::thread::detail::Mma_HFMA2< Shape, LayoutA, LayoutB, layout::ColumnMajor, false >::FragmentA
Array< half_t, Shape::kMK > FragmentA
A operand storage.
Definition: gemm/thread/mma_sm60.h:874

cutlass::gemm::GemmShape::kN
static int const kN
Definition: include/cutlass/gemm/gemm.h:59