#if defined(CUTLASS_ARCH_WMMA_ENABLED)

/// Tile iterator that loads WMMA multiplicand operands held by a warp. The primary
/// template is declared here and specialized below for the A and B operands.
template <
  typename Shape_,    ///< size of the warp-level matrix tile (concept: MatrixShape)
  Operand Operand_,   ///< identity of the operand (A or B)
  typename Element_,  ///< data type of elements
  typename Layout_,   ///< memory layout of the operand
  int OpDelta_,       ///< interval between adjacent WMMA instructions
  int Threads,        ///< number of participating threads
  typename Policy_    ///< policy defining the underlying warp-level WMMA operator
>
class MmaTensorOpWmmaMultiplicandTileIterator;
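These iterators wrap the nvcuda::wmma fragment API. For orientation, a minimal standalone kernel written directly against that API looks like the sketch below (a sketch only, not part of this header; the 16x16x16 half/float tile shape and the kernel name are assumptions):

#include <mma.h>
using namespace nvcuda;

// One warp computes a single 16x16x16 tile of C = A * B + C.
__global__ void wmma_tile_kernel(half const *A, half const *B, float *C,
                                 int lda, int ldb, int ldc) {
  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> frag_A;
  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> frag_B;
  wmma::fragment<wmma::accumulator, 16, 16, 16, float> frag_C;

  wmma::fill_fragment(frag_C, 0.0f);        // zero the accumulator
  wmma::load_matrix_sync(frag_A, A, lda);   // collective load by the whole warp
  wmma::load_matrix_sync(frag_B, B, ldb);
  wmma::mma_sync(frag_C, frag_A, frag_B, frag_C);
  wmma::store_matrix_sync(C, frag_C, ldc, wmma::mem_row_major);
}

The iterators below perform the pointer arithmetic needed to issue these load/store calls for every WMMA-sized sub-tile of a larger warp-level tile.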
/// Specialization for the A operand: loads the warp's A multiplicand into nvcuda::wmma
/// fragments. This iterator is specialized for the 32-thread WMMA operation.
template <typename Shape_, typename Element_, typename Layout_, int OpDelta_, typename Policy_>
class MmaTensorOpWmmaMultiplicandTileIterator<
    Shape_, Operand::kA, Element_, Layout_,
    OpDelta_, 32, Policy_> {
public:

  using Shape = Shape_;                         ///< shape of the warp-level tile (concept: MatrixShape)
  static Operand const kOperand = Operand::kA;  ///< operand tag
  using Element = Element_;                     ///< data type of elements
  using Layout = Layout_;                       ///< memory layout of the operand
  static int const kOpDelta = OpDelta_;         ///< interval between adjacent WMMA instructions
  using Policy = Policy_;                       ///< policy defining the underlying WMMA operator

  /// TensorRef and associated index/coordinate types used to address the operand
  using TensorRef = TensorRef<Element, Layout>;
  using Index = typename TensorRef::Index;
  using LongIndex = typename TensorRef::LongIndex;
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Shape of a single underlying WMMA instruction for the A operand (M-by-K)
  using WmmaShape = MatrixShape<
    Policy::Operator::Shape::kM,
    Policy::Operator::Shape::kK
  >;

  /// Data type expected by the nvcuda::wmma API for this element type
  using WmmaDataType = typename cutlass::arch::CutlassToWmmaDataType<Element>::Type;

  /// Number of WMMA instructions needed to cover the warp-level tile
  using Iterations = MatrixShape<
    Shape::kRow / WmmaShape::kRow,
    1
  >;

  /// Fragment object: one WMMA fragment per iteration
  using Fragment = WmmaFragmentArray<typename Policy::Operator::FragmentA, Iterations::kCount>;

  static_assert(kOperand == Operand::kA,
    "MmaTensorOpWmmaMultiplicandTileIterator may only be instantiated for A operands to warp-level Mma.");

  static_assert(platform::is_same<Layout, cutlass::layout::RowMajor>::value ||
                platform::is_same<Layout, cutlass::layout::ColumnMajor>::value,
    "Supported list of memory layouts for WMMA are: RowMajor, ColumnMajor");

  static_assert(kOpDelta == 1,
    "Alternative arrangements not supported at present.");
private:

  /// Pointer to the start of the operand tile
  char const *pointer_;

  /// Byte offset advanced as the iterator moves
  Index byte_offset_;

  /// Stride in units of elements
  Index stride_;

  /// Layout object mapping logical coordinates to linear offsets
  Layout layout_;

public:

  /// Default constructor
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator() { }

  /// Constructor from a TensorRef and the calling thread's lane id
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ): pointer_(reinterpret_cast<char const*>(ref.data())), byte_offset_(0),
     stride_(ref.stride(0)), layout_(ref.stride(0)) {
  }
  /// Adds a pointer offset, in units of elements
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    byte_offset_ += offset * sizeof_bits<Element>::value / 8;
    return *this;
  }

  /// Advances the iterator by a whole number of tiles
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    Index elements_offset = layout_({tile_offset.row() * Shape::kRow, tile_offset.column() * WmmaShape::kColumn});
    byte_offset_ += elements_offset * sizeof_bits<Element>::value / 8;
    return *this;
  }

  /// Advances the iterator along the K dimension by one WMMA instruction shape
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator &operator++() {
    Index elements_offset = layout_({0, WmmaShape::kColumn});
    byte_offset_ += elements_offset * sizeof_bits<Element>::value / 8;
    return *this;
  }

  /// Steps the iterator back along the K dimension by one WMMA instruction shape
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator &operator--() {
    Index elements_offset = layout_({0, WmmaShape::kColumn});
    byte_offset_ -= elements_offset * sizeof_bits<Element>::value / 8;
    return *this;
  }

  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }
  /// Loads a fragment from memory at the location pointed to by the iterator, with an
  /// additional offset in bytes
  CUTLASS_HOST_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {

    CUTLASS_PRAGMA_UNROLL
    for (int k = 0; k < Iterations::kColumn; ++k) {
      CUTLASS_PRAGMA_UNROLL
      for (int m = 0; m < Iterations::kRow; ++m) {

        // Byte offset of the (m, k)-th WMMA tile within the warp-level tile
        Index load_byte_offset = layout_({m * WmmaShape::kRow, k * WmmaShape::kColumn}) * sizeof_bits<Element>::value / 8;

        const WmmaDataType *ptr = reinterpret_cast<const WmmaDataType *>(pointer_ + byte_offset_ + load_byte_offset + byte_offset);

        // Warp-collective load of one WMMA fragment
        nvcuda::wmma::load_matrix_sync(frag[m], ptr, stride_);
      }
    }
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_byte_offset(frag, 0);
  }
  /// Stores a fragment to memory at the location pointed to by the iterator, with an
  /// additional offset in bytes
  CUTLASS_HOST_DEVICE
  void store_with_byte_offset(Fragment const &frag, Index byte_offset) const {

    CUTLASS_PRAGMA_UNROLL
    for (int k = 0; k < Iterations::kColumn; ++k) {
      CUTLASS_PRAGMA_UNROLL
      for (int m = 0; m < Iterations::kRow; ++m) {

        // Byte offset of the (m, k)-th WMMA tile within the warp-level tile
        Index store_byte_offset = layout_({m * WmmaShape::kRow, k * WmmaShape::kColumn}) * sizeof_bits<Element>::value / 8;

        WmmaDataType *ptr = reinterpret_cast<WmmaDataType *>(pointer_ + byte_offset_ + store_byte_offset + byte_offset);

        // Warp-collective store of one WMMA fragment
        nvcuda::wmma::store_matrix_sync(ptr, frag[m], stride_);
      }
    }
  }

  /// Stores a fragment to memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) const {
    store_with_byte_offset(frag, 0);
  }

  /// Notifies the iterator of the current k-group index
  CUTLASS_HOST_DEVICE
  void set_kgroup_index(int k_group) {
  }
};
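A minimal usage sketch for the A-operand iterator (hypothetical names: WarpIteratorA stands for a concrete instantiation of this iterator, lane_id for the calling thread's lane within its warp): construct the iterator over the warp's tile, then alternately load a fragment and advance one WMMA step along K.

// Sketch only: loads a warp's A operand along the K dimension, one Fragment per k-group.
template <typename WarpIteratorA>
__device__ void load_A_along_k(
    typename WarpIteratorA::TensorRef ref_A,   // refers to the warp's A tile (e.g. in shared memory)
    typename WarpIteratorA::Fragment *frags,   // one fragment per k-group
    int k_groups,
    int lane_id) {

  WarpIteratorA iter_A(ref_A, lane_id);

  for (int k = 0; k < k_groups; ++k) {
    iter_A.load(frags[k]);   // issues one nvcuda::wmma::load_matrix_sync per entry of Iterations
    ++iter_A;                // advances by WmmaShape::kColumn along K
  }
}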
/// Specialization for the B operand: loads the warp's B multiplicand into nvcuda::wmma
/// fragments. This iterator is specialized for the 32-thread WMMA operation.
template <typename Shape_, typename Element_, typename Layout_, int OpDelta_, typename Policy_>
class MmaTensorOpWmmaMultiplicandTileIterator<
    Shape_, Operand::kB, Element_, Layout_,
    OpDelta_, 32, Policy_> {
public:

  using Shape = Shape_;                         ///< shape of the warp-level tile (concept: MatrixShape)
  static Operand const kOperand = Operand::kB;  ///< operand tag
  using Element = Element_;                     ///< data type of elements
  using Layout = Layout_;                       ///< memory layout of the operand
  static int const kOpDelta = OpDelta_;         ///< interval between adjacent WMMA instructions
  using Policy = Policy_;                       ///< policy defining the underlying WMMA operator

  /// TensorRef and associated index/coordinate types used to address the operand
  using TensorRef = TensorRef<Element, Layout>;
  using Index = typename TensorRef::Index;
  using LongIndex = typename TensorRef::LongIndex;
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Shape of a single underlying WMMA instruction for the B operand (K-by-N)
  using WmmaShape = MatrixShape<
    Policy::Operator::Shape::kK,
    Policy::Operator::Shape::kN
  >;

  /// Data type expected by the nvcuda::wmma API for this element type
  using WmmaDataType = typename cutlass::arch::CutlassToWmmaDataType<Element>::Type;

  /// Number of WMMA instructions needed to cover the warp-level tile
  using Iterations = MatrixShape<
    1,
    Shape::kColumn / WmmaShape::kColumn
  >;

  /// Fragment object: one WMMA fragment per iteration
  using Fragment = WmmaFragmentArray<typename Policy::Operator::FragmentB, Iterations::kCount>;

  static_assert(kOperand == Operand::kB,
    "MmaTensorOpWmmaMultiplicandTileIterator may only be instantiated for B operands to warp-level Mma.");

  static_assert(platform::is_same<Layout, cutlass::layout::RowMajor>::value ||
                platform::is_same<Layout, cutlass::layout::ColumnMajor>::value,
    "Supported list of memory layouts for WMMA are: RowMajor, ColumnMajor");

  static_assert(kOpDelta == 1,
    "Alternative arrangements not supported at present.");
private:

  /// Pointer to the start of the operand tile
  char const *pointer_;

  /// Byte offset advanced as the iterator moves
  Index byte_offset_;

  /// Stride in units of elements
  Index stride_;

  /// Layout object mapping logical coordinates to linear offsets
  Layout layout_;

public:

  /// Default constructor
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator() { }

  /// Constructor from a TensorRef and the calling thread's lane id
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ): pointer_(reinterpret_cast<char const*>(ref.data())), byte_offset_(0),
     stride_(ref.stride(0)), layout_(ref.stride(0)) {
  }
  /// Adds a pointer offset, in units of elements
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    byte_offset_ += offset * sizeof_bits<Element>::value / 8;
    return *this;
  }

  /// Advances the iterator by a whole number of tiles
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    Index elements_offset = layout_({tile_offset.row() * WmmaShape::kRow, tile_offset.column() * Shape::kColumn});
    byte_offset_ += elements_offset * sizeof_bits<Element>::value / 8;
    return *this;
  }

  /// Advances the iterator along the K dimension by one WMMA instruction shape
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator &operator++() {
    Index elements_offset = layout_({WmmaShape::kRow, 0});
    byte_offset_ += elements_offset * sizeof_bits<Element>::value / 8;
    return *this;
  }

  /// Steps the iterator back along the K dimension by one WMMA instruction shape
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator &operator--() {
    Index elements_offset = layout_({WmmaShape::kRow, 0});
    byte_offset_ -= elements_offset * sizeof_bits<Element>::value / 8;
    return *this;
  }

  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }
  /// Loads a fragment from memory at the location pointed to by the iterator, with an
  /// additional offset in bytes
  CUTLASS_HOST_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {

    CUTLASS_PRAGMA_UNROLL
    for (int k = 0; k < Iterations::kRow; ++k) {
      CUTLASS_PRAGMA_UNROLL
      for (int n = 0; n < Iterations::kColumn; ++n) {

        // Byte offset of the (k, n)-th WMMA tile within the warp-level tile
        Index load_byte_offset = layout_({k * WmmaShape::kRow, n * WmmaShape::kColumn}) * sizeof_bits<Element>::value / 8;

        const WmmaDataType *ptr = reinterpret_cast<const WmmaDataType *>(pointer_ + byte_offset_ + load_byte_offset + byte_offset);

        // Warp-collective load of one WMMA fragment
        nvcuda::wmma::load_matrix_sync(frag[n], ptr, stride_);
      }
    }
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_byte_offset(frag, 0);
  }
  /// Stores a fragment to memory at the location pointed to by the iterator, with an
  /// additional offset in bytes
  CUTLASS_HOST_DEVICE
  void store_with_byte_offset(Fragment const &frag, Index byte_offset) const {

    CUTLASS_PRAGMA_UNROLL
    for (int k = 0; k < Iterations::kRow; ++k) {
      CUTLASS_PRAGMA_UNROLL
      for (int n = 0; n < Iterations::kColumn; ++n) {

        // Byte offset of the (k, n)-th WMMA tile within the warp-level tile
        Index store_byte_offset = layout_({k * WmmaShape::kRow, n * WmmaShape::kColumn}) * sizeof_bits<Element>::value / 8;

        WmmaDataType *ptr = reinterpret_cast<WmmaDataType *>(pointer_ + byte_offset_ + store_byte_offset + byte_offset);

        // Warp-collective store of one WMMA fragment
        nvcuda::wmma::store_matrix_sync(ptr, frag[n], stride_);
      }
    }
  }

  /// Stores a fragment to memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) const {
    store_with_byte_offset(frag, 0);
  }

  /// Notifies the iterator of the current k-group index
  CUTLASS_HOST_DEVICE
  void set_kgroup_index(int k_group) {
  }
};
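The B-operand iterator mirrors the A-operand iterator, advancing along K by WMMA rows instead of columns. A hedged sketch of the warp-level mainloop that pairs the two iterators (WarpMma and its nested IteratorA/IteratorB/FragmentA/FragmentB/FragmentC names are assumptions about the enclosing warp-level operator, not part of this header):

// Sketch only: the canonical pattern in which a warp-level mma consumes these iterators.
template <typename WarpMma>
__device__ void warp_mainloop(
    typename WarpMma::IteratorA iter_A,       // built from the warp's A tile
    typename WarpMma::IteratorB iter_B,       // built from the warp's B tile
    typename WarpMma::FragmentC &accum,       // accumulator fragments
    int k_groups) {

  WarpMma warp_mma;
  typename WarpMma::FragmentA frag_A;
  typename WarpMma::FragmentB frag_B;

  for (int k = 0; k < k_groups; ++k) {
    iter_A.load(frag_A);                      // wmma::load_matrix_sync for each A sub-tile
    iter_B.load(frag_B);                      // wmma::load_matrix_sync for each B sub-tile
    ++iter_A;
    ++iter_B;
    warp_mma(accum, frag_A, frag_B, accum);   // wmma::mma_sync under the hood
  }
}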
/// Tile iterator that loads and stores the accumulator (C) operand of a warp-level
/// WMMA operation using nvcuda::wmma fragments.
template <
  typename Shape_,    ///< size of the warp-level accumulator tile (concept: MatrixShape)
  typename Element_,  ///< data type of accumulator elements
  typename Layout_,   ///< memory layout of the accumulator tile
  typename OpDelta_,  ///< interval between adjacent WMMA instructions (concept: MatrixShape)
  typename Policy_    ///< policy defining the underlying warp-level WMMA operator
>
class MmaTensorOpWmmaAccumulatorTileIterator;
/// Definition of the accumulator tile iterator. This iterator is specialized for the
/// 32-thread WMMA operation and accesses the C operand through nvcuda::wmma fragments.
template <typename Shape_, typename Element_, typename Layout_, typename OpDelta_, typename Policy_>
class MmaTensorOpWmmaAccumulatorTileIterator {
public:

  using Shape = Shape_;            ///< shape of the warp-level accumulator tile (concept: MatrixShape)
  using Element = Element_;        ///< data type of accumulator elements
  using Layout = Layout_;          ///< memory layout of the accumulator tile
  using OpDelta = OpDelta_;        ///< interval between adjacent WMMA instructions (concept: MatrixShape)
  static int const kThreads = 32;  ///< number of participating threads
  using Policy = Policy_;          ///< policy defining the underlying WMMA operator

  /// TensorRef and associated index/coordinate types used to address the accumulator tile
  using TensorRef = TensorRef<Element, Layout>;
  using Index = typename TensorRef::Index;
  using LongIndex = typename TensorRef::LongIndex;
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Shape of a single underlying WMMA instruction for the accumulator (M-by-N)
  using WmmaShape = MatrixShape<
    Policy::Operator::Shape::kM,
    Policy::Operator::Shape::kN
  >;

  /// Data type expected by the nvcuda::wmma API for this element type
  using WmmaDataType = typename cutlass::arch::CutlassToWmmaDataType<Element>::Type;

  /// nvcuda::wmma memory layout tag corresponding to the CUTLASS layout
  static nvcuda::wmma::layout_t const WmmaLayout = cutlass::arch::CutlassToWmmaLayout<Layout>::value;

  /// Number of WMMA instructions needed to cover the warp-level accumulator tile
  using Iterations = MatrixShape<
    Shape::kRow / WmmaShape::kRow,
    Shape::kColumn / WmmaShape::kColumn
  >;

  /// Fragment object: one WMMA accumulator fragment per iteration
  using Fragment = WmmaFragmentArray<typename Policy::Operator::FragmentC, Iterations::kCount>;

  static_assert(platform::is_same<Layout, cutlass::layout::RowMajor>::value ||
                platform::is_same<Layout, cutlass::layout::ColumnMajor>::value,
    "Supported list of memory layouts for WMMA are: RowMajor, ColumnMajor");
private:

  /// TensorRef addressing the accumulator tile; advanced as the iterator moves
  TensorRef ref_;

public:

  /// Default constructor
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaAccumulatorTileIterator() { }

  /// Constructor from a TensorRef and the calling thread's lane id
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaAccumulatorTileIterator(
    TensorRef const &ref,
    int lane_id
  ): ref_(ref) {
  }
  /// Adds a pointer offset, in units of elements
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
    ref_.add_pointer_offset(offset);
    return *this;
  }

  /// Advances the iterator by a whole number of accumulator tiles
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
    ref_.add_coord_offset({tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn});
    return *this;
  }
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaAccumulatorTileIterator &operator++() {
    // ...
    return *this;
  }

  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaAccumulatorTileIterator &operator--() {
    // ...
    return *this;
  }
  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaAccumulatorTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_HOST_DEVICE
  MmaTensorOpWmmaAccumulatorTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }
  /// Loads the accumulator fragments from memory at the location referenced by the
  /// iterator, with an additional offset in units of elements
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {

    CUTLASS_PRAGMA_UNROLL
    for (int m = 0; m < Iterations::kRow; ++m) {
      CUTLASS_PRAGMA_UNROLL
      for (int n = 0; n < Iterations::kColumn; ++n) {

        const WmmaDataType *ptr = reinterpret_cast<const WmmaDataType *>(
            ref_.data() + ref_.offset({m * WmmaShape::kRow, n * WmmaShape::kColumn}) + pointer_offset);

        // Warp-collective load of one WMMA accumulator fragment
        nvcuda::wmma::load_matrix_sync(frag[m * Iterations::kColumn + n], ptr, ref_.stride()[0], WmmaLayout);
      }
    }
  }

  /// Loads the accumulator fragments from memory at the location referenced by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_pointer_offset(frag, 0);
  }
  /// Stores the accumulator fragments to memory at the location referenced by the
  /// iterator, with an additional offset in units of elements
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {

    CUTLASS_PRAGMA_UNROLL
    for (int m = 0; m < Iterations::kRow; ++m) {
      CUTLASS_PRAGMA_UNROLL
      for (int n = 0; n < Iterations::kColumn; ++n) {

        WmmaDataType *ptr = reinterpret_cast<WmmaDataType *>(
            ref_.data() + ref_.offset({m * WmmaShape::kRow, n * WmmaShape::kColumn}) + pointer_offset);

        // Warp-collective store of one WMMA accumulator fragment
        nvcuda::wmma::store_matrix_sync(ptr, frag[m * Iterations::kColumn + n], ref_.stride()[0], WmmaLayout);
      }
    }
  }

  /// Stores the accumulator fragments to memory at the location referenced by the iterator
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) const {
    store_with_pointer_offset(frag, 0);
  }

  /// Notifies the iterator of the current k-group index
  CUTLASS_HOST_DEVICE
  void set_kgroup_index(int k_group) {
  }
};
#endif // if defined(CUTLASS_ARCH_WMMA_ENABLED)
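After the mainloop, the accumulator iterator writes each warp's results back through nvcuda::wmma::store_matrix_sync. A hedged sketch (WarpIteratorC and the function name are assumptions; WarpIteratorC stands for a concrete MmaTensorOpWmmaAccumulatorTileIterator instantiation):

// Sketch only: writes a warp's accumulator tile to memory through the accumulator iterator.
template <typename WarpIteratorC>
__device__ void store_accumulators(
    typename WarpIteratorC::TensorRef ref_C,        // destination tile (e.g. global or shared memory)
    typename WarpIteratorC::Fragment const &accum,  // accumulator fragments produced by the mainloop
    int lane_id) {

  WarpIteratorC iter_C(ref_C, lane_id);
  iter_C.store(accum);   // one nvcuda::wmma::store_matrix_sync per (m, n) entry of Iterations
}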