CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
mma_complex_tensor_op.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Templates implementing warp-level matrix multiply-accumulate operations targeting
      Tensor Cores.
*/

#pragma once

#include "cutlass/cutlass.h"

#include "cutlass/array.h"
#include "cutlass/complex.h"
#include "cutlass/numeric_types.h"
#include "cutlass/matrix_shape.h"

#include "cutlass/arch/memory_sm75.h"
#include "cutlass/arch/mma_sm75.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/warp/mma.h"

#include "cutlass/gemm/warp/mma_tensor_op_policy.h"

#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"

/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace warp {

/////////////////////////////////////////////////////////////////////////////////////////////////
/// Warp-level matrix multiply-accumulate operating on complex-valued matrices
template <
  /// Size of the GEMM problem (concept: gemm::GemmShape)
  typename Shape_,
  /// Data type of real-valued A elements
  typename RealElementA,
  /// Layout of A matrix (concept: MatrixLayout)
  typename LayoutA_,
  /// Data type of real-valued B elements
  typename RealElementB,
  /// Layout of B matrix (concept: MatrixLayout)
  typename LayoutB_,
  /// Data type of real-valued C elements
  typename RealElementC,
  /// Layout of C matrix (concept: MatrixLayout)
  typename LayoutC_,
  /// Policy describing the underlying real-valued warp-level operation
  typename Policy_,
  /// Complex transformation applied to operand A
  ComplexTransform TransformA = ComplexTransform::kNone,
  /// Complex transformation applied to operand B
  ComplexTransform TransformB = ComplexTransform::kNone,
  /// Used for partial specialization
  typename Enable = bool
>
class MmaComplexTensorOp;
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization for complex*complex + complex => complex using real-valued
/// Tensor Core operations
template <
  typename Shape_,
  typename RealElementA,
  typename LayoutA_,
  typename RealElementB,
  typename LayoutB_,
  typename RealElementC,
  typename LayoutC_,
  typename Policy_,
  ComplexTransform TransformA,
  ComplexTransform TransformB,
  typename Enable
>
class MmaComplexTensorOp<
  Shape_,
  complex<RealElementA>,
  LayoutA_,
  complex<RealElementB>,
  LayoutB_,
  complex<RealElementC>,
  LayoutC_,
  Policy_,
  TransformA,
  TransformB,
  Enable> {
public:
  /// Shape of warp-level matrix operation (concept: GemmShape)
  using Shape = Shape_;

  /// Data type of multiplicand A
  using ElementA = complex<RealElementA>;

  /// Layout of multiplicand A
  using LayoutA = LayoutA_;

  /// Data type of multiplicand B
  using ElementB = complex<RealElementB>;

  /// Layout of multiplicand B
  using LayoutB = LayoutB_;

  /// Data type of accumulator matrix C
  using ElementC = complex<RealElementC>;

  /// Layout of accumulator matrix C
  using LayoutC = LayoutC_;

  /// Policy describing the underlying real-valued warp-level operation
  using Policy = Policy_;

  /// Complex transform applied to the A operand
  static ComplexTransform const kTransformA = TransformA;

  /// Complex transform applied to the B operand
  static ComplexTransform const kTransformB = TransformB;

  /// Indicates class of matrix operator
  using OperatorClass = arch::OpClassTensorOp;

  /// Number of threads participating in the warp-level matrix operation
  static int const kThreadCount = 32;

public:

  /// Iterates over the A operand in memory
  using IteratorA = MmaTensorOpMultiplicandTileIterator<
    MatrixShape<Shape::kM, Shape::kK>,
    Operand::kA,
    ElementA,
    LayoutA,
    MatrixShape<Policy::Operator::Shape::kM, Policy::Operator::Shape::kK>,
    Policy::OpDelta::kRow,
    32,
    1
  >;

  /// Storage for the A tile
  using FragmentA = typename IteratorA::Fragment;

  /// Iterates over the B operand in memory
  using IteratorB = MmaTensorOpMultiplicandTileIterator<
    MatrixShape<Shape::kK, Shape::kN>,
    Operand::kB,
    ElementB,
    LayoutB,
    MatrixShape<Policy::Operator::Shape::kK, Policy::Operator::Shape::kN>,
    Policy::OpDelta::kColumn,
    32,
    1
  >;

  /// Storage for the B tile
  using FragmentB = typename IteratorB::Fragment;

  static_assert(
    !(Shape::kM % Policy::Operator::Shape::kM) &&
    !(Shape::kN % Policy::Operator::Shape::kN),
    "Shape of warp-level Mma must be divisible by operator shape.");

  /// Number of underlying real-valued matrix multiply operations performed
  using MmaIterations = MatrixShape<
    Shape::kM / Policy::Operator::Shape::kM,
    Shape::kN / Policy::Operator::Shape::kN
  >;

  /// Iterates over the C operand in memory
  using IteratorC = MmaTensorOpAccumulatorTileIterator<
    MatrixShape<Shape::kM, Shape::kN>,
    ElementC,
    LayoutC,
    typename Policy::Operator::Shape,
    typename Policy::OpDelta>;

  /// Storage for the C tile: a 'planar complex' arrangement in which all real-valued
  /// parts are stored consecutively, followed by all imaginary parts. This matches the
  /// real-valued structure of the underlying Tensor Core operations.
  using FragmentC = typename IteratorC::Fragment;
215 
217  FragmentC::kElements == 2 * MmaIterations::kCount * Policy::Operator::FragmentC::kElements,
218  "Unexpected planar complex fragment length.");
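
  // For example, if the warp tile spans a 2x2 grid of underlying MMA tiles
  // (MmaIterations::kCount == 4) and each real-valued accumulator fragment holds
  // 8 elements, FragmentC holds 2 * 4 * 8 = 64 elements: 32 real parts followed by
  // 32 imaginary parts. (Illustrative numbers; actual sizes depend on Shape and Policy.)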

private:

  //
  // Data members
  //

  /// Underlying real-valued matrix multiply operator (concept: arch::Mma)
  typename Policy::Operator mma;

public:

  //
  // Methods
  //

  /// Ctor
  CUTLASS_DEVICE
  MmaComplexTensorOp() {}

  /// Performs a warp-level matrix multiply-accumulate operation
  CUTLASS_DEVICE
  void operator()(
    FragmentC &D,
    FragmentA const &A,
    FragmentB const &B,
    FragmentC const &C) const {

    // Alias types for underlying real-valued matrix multiply operator
    using MmaOperandA = typename Policy::Operator::FragmentA;
    using MmaOperandB = typename Policy::Operator::FragmentB;
    using MmaOperandC = typename Policy::Operator::FragmentC;

    static_assert(MmaOperandA::kElements == 1,
      "This implementation only supports math instructions in which exactly one element is needed for the A operand. "
      "We can generalize later.");

    static_assert(MmaOperandB::kElements == 1,
      "This implementation only supports math instructions in which exactly one element is needed for the B operand. "
      "We can generalize later.");

    D = C;

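    // The four passes below implement the 'planar complex' multiply-accumulate:
    //
    //   D.real += A.real * B.real - A.imag * B.imag
    //   D.imag += A.real * B.imag + A.imag * B.real
    //
    // with the sign of an imaginary term flipped when kTransformA or kTransformB is
    // ComplexTransform::kConjugate. Real parts occupy the first MmaIterations::kCount
    // accumulator fragments of D; imaginary parts occupy the remainder.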
    CUTLASS_PRAGMA_UNROLL
    for (int m = 0; m < MmaIterations::kRow; ++m) {

      // mma(accum.real(), a.real(), b.real(), accum.real());
      CUTLASS_PRAGMA_UNROLL
      for (int n = 0; n < MmaIterations::kColumn; ++n) {

        // Pack operands together. This may result in actual MOVs
        MmaOperandA operand_A;
        MmaOperandB operand_B;

        operand_A[0] = A[m].real();
        operand_B[0] = B[n].real();

        // Real part of the accumulator
        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) +
          (m + n * MmaIterations::kRow);

        mma(*accum, operand_A, operand_B, *accum);
      }

      // mma(accum.imag(), a.real(), b.imag(), accum.imag());
      CUTLASS_PRAGMA_UNROLL
      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {

        // Pack operands together. This may result in actual MOVs
        MmaOperandA operand_A;
        MmaOperandB operand_B;

        operand_A[0] = A[m].real();
        operand_B[0] = (kTransformB == ComplexTransform::kConjugate ? -B[n].imag() : B[n].imag());

        // Imaginary part of the accumulator, offset by MmaIterations::kCount
        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) +
          (m + n * MmaIterations::kRow) + MmaIterations::kCount;

        mma(*accum, operand_A, operand_B, *accum);
      }

      // mma(accum.real(), -a.imag(), b.imag(), accum.real())
      CUTLASS_PRAGMA_UNROLL
      for (int n = 0; n < MmaIterations::kColumn; ++n) {

        // Pack operands together. This may result in actual MOVs
        MmaOperandA operand_A;
        MmaOperandB operand_B;

        // A's imaginary part is intentionally negated; conjugating A cancels the negation
        operand_A[0] = (kTransformA == ComplexTransform::kConjugate ? A[m].imag() : -A[m].imag());
        operand_B[0] = (kTransformB == ComplexTransform::kConjugate ? -B[n].imag() : B[n].imag());

        // Real part of the accumulator
        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) +
          (m + n * MmaIterations::kRow);

        mma(*accum, operand_A, operand_B, *accum);
      }

      // mma(accum.imag(), a.imag(), b.real(), accum.imag())
      CUTLASS_PRAGMA_UNROLL
      for (int n = MmaIterations::kColumn - 1; n >= 0; --n) {

        // Pack operands together. This may result in actual MOVs
        MmaOperandA operand_A;
        MmaOperandB operand_B;

        operand_A[0] = (kTransformA == ComplexTransform::kConjugate ? -A[m].imag() : A[m].imag());
        operand_B[0] = B[n].real();

        // Imaginary part of the accumulator, offset by MmaIterations::kCount
        MmaOperandC *accum = reinterpret_cast<MmaOperandC *>(&D) +
          (m + n * MmaIterations::kRow) + MmaIterations::kCount;

        mma(*accum, operand_A, operand_B, *accum);
      }
    }
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

// TODO - partial specializations of real*complex and complex*real

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace warp
} // namespace gemm
} // namespace cutlass
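Example: a minimal sketch of how this warp-level operator might be instantiated and invoked. The instruction shape, warp tile shape, layouts, and policy below are illustrative assumptions, not definitions from this header; whichever real-valued arch::Mma the policy wraps must satisfy the one-element operand static_asserts in operator().

#include "cutlass/gemm/warp/mma_complex_tensor_op.h"
#include "cutlass/gemm/warp/mma_tensor_op_policy.h"
#include "cutlass/layout/matrix.h"

// Hypothetical real-valued instruction for the policy to wrap. Whether a matching
// arch::Mma specialization exists depends on the target architecture; its FragmentA
// and FragmentB must each hold exactly one element.
using RealMma = cutlass::arch::Mma<
    cutlass::gemm::GemmShape<8, 8, 4>,     // instruction shape (assumed)
    32,                                    // threads per warp
    float, cutlass::layout::ColumnMajor,   // A
    float, cutlass::layout::RowMajor,      // B
    float, cutlass::layout::ColumnMajor,   // C
    cutlass::arch::OpMultiplyAdd>;

// Interleaving between adjacent instructions (assumed contiguous here)
using RealMmaPolicy = cutlass::gemm::warp::MmaTensorOpPolicy<
    RealMma, cutlass::MatrixShape<1, 1>>;

// Warp-level complex MMA computing D = A * conj(B) + C over a 32x32x4 warp tile
using WarpMma = cutlass::gemm::warp::MmaComplexTensorOp<
    cutlass::gemm::GemmShape<32, 32, 4>,
    cutlass::complex<float>, cutlass::layout::ColumnMajor,
    cutlass::complex<float>, cutlass::layout::RowMajor,
    cutlass::complex<float>, cutlass::layout::ColumnMajor,
    RealMmaPolicy,
    cutlass::ComplexTransform::kNone,        // use A as-is
    cutlass::ComplexTransform::kConjugate>;  // conjugate B

// Fragments would typically be filled by WarpMma::IteratorA / IteratorB, and the
// accumulator written back through WarpMma::IteratorC.
__device__ void warp_mma_example(
    typename WarpMma::FragmentC &D,
    typename WarpMma::FragmentA const &A,
    typename WarpMma::FragmentB const &B,
    typename WarpMma::FragmentC const &C) {

  WarpMma mma_op;
  mma_op(D, A, B, C);  // accumulates in planar complex form
}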