CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
tensor_op_multiplicand_sm70.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/coord.h"
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/matrix_coord.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace layout {

// template <
//   int ElementSize,
//   gemm::Operand Operand
// >
// struct VoltaTensorOpMultiplicandCongruous;

// template <
//   int ElementSize,
//   gemm::Operand Operand
// >
// struct ColumnMajorVoltaTensorOpMultiplicandCongruous;

// template <
//   int ElementSize,
//   gemm::Operand Operand
// >
// struct RowMajorVoltaTensorOpMultiplicandCongruous;

/////////////////////////////////////////////////////////////////////////////////////////////////
/// Template based on element size (in bits) - defined in terms of pitch-linear memory.
template <int ElementSize>
struct VoltaTensorOpMultiplicandCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = PitchLinearCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = 128;

  /// Fundamental tile shape in units of vector accesses
  using TileShape = PitchLinearShape<8, 4>;

  /// Fundamental partition shape in units of vector accesses
  using PartitionShape = PitchLinearShape<4, 4>;

  //
  // Static constants
  //

  static int const kElementSize = ElementSize;
  static int const kElementsPerAccess = kAccessSize / kElementSize;

  using PartitionCount = PitchLinearShape<
    TileShape::kContiguous / PartitionShape::kContiguous,
    TileShape::kStrided / PartitionShape::kStrided
  >;

  using AccessCount = PitchLinearShape<
    PartitionShape::kContiguous,
    PartitionShape::kStrided
  >;

private:

  //
  // Data members
  //

  /// Stride data member
  Stride stride_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  VoltaTensorOpMultiplicandCongruous(Index ldm = 0): stride_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  VoltaTensorOpMultiplicandCongruous(Stride stride): stride_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static VoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
    return VoltaTensorOpMultiplicandCongruous(extent[0]);
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (contiguous, strided)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {

    // First, compute c and s of vector within source (in units of vector accesses)
    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
    int vec_strided_idx = coord.strided();

    // Compute the fundamental tile being accessed
    int tile_contiguous_idx = vec_contiguous_idx / TileShape::kContiguous;
    int tile_strided_idx = vec_strided_idx / TileShape::kStrided;

    int tile_contiguous_residual = vec_contiguous_idx % TileShape::kContiguous;
    int tile_strided_residual = vec_strided_idx % TileShape::kStrided;

    // Then swizzle within the tile.
    // Swizzle pattern is (tid[2:0] << 2)|(tid[4:3] ^ tid[2:1])
    int permuted_strided_within_tile = (tile_contiguous_residual >> 1);
    int permuted_contiguous_within_tile =
      (tile_strided_residual ^ permuted_strided_within_tile) |
      ((tile_contiguous_residual & 1) << 2);

    // Compute final element location
    int element_contiguous =
      (tile_contiguous_idx * TileShape::kContiguous + permuted_contiguous_within_tile) *
        kElementsPerAccess +
      (coord.contiguous() % kElementsPerAccess);

    int element_strided = tile_strided_idx * TileShape::kStrided + permuted_strided_within_tile;

    return element_contiguous + element_strided * stride_[0];
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return stride_;
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return stride_;
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return extent[1] * stride_[0];
  }
};
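
// Illustrative sketch (not part of the original header): tracing operator() above for 16-bit
// elements, where kElementsPerAccess = 128 / 16 = 8 and TileShape = <8, 4>. The numbers below
// follow directly from the arithmetic in operator().
//
//   using Layout = cutlass::layout::VoltaTensorOpMultiplicandCongruous<16>;
//
//   // Packed layout for a (contiguous, strided) extent of (64, 8): stride_[0] == 64
//   Layout layout = Layout::packed(cutlass::layout::PitchLinearCoord(64, 8));
//
//   // Coordinate (17, 3): vector index (17 / 8, 3) = (2, 3), inside tile (0, 0);
//   //   permuted_strided_within_tile    = 2 >> 1 = 1
//   //   permuted_contiguous_within_tile = (3 ^ 1) | ((2 & 1) << 2) = 2
//   //   element_contiguous = 2 * 8 + (17 % 8) = 17;  element_strided = 1
//   Layout::LongIndex offset = layout(cutlass::layout::PitchLinearCoord(17, 3));  // 17 + 1 * 64 = 81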

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous
template <int ElementSize>
struct ColumnMajorVoltaTensorOpMultiplicandCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// Underlying pitch-linear layout
  using Base = VoltaTensorOpMultiplicandCongruous<ElementSize>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

private:

  //
  // Data members
  //

  Base layout_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorVoltaTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorVoltaTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static ColumnMajorVoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
    return ColumnMajorVoltaTensorOpMultiplicandCongruous(extent.row());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.row(), coord.column()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.contiguous(), coord.strided());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return layout_.stride();
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return layout_.stride();
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous
template <int ElementSize>
struct RowMajorVoltaTensorOpMultiplicandCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// Underlying pitch-linear layout
  using Base = VoltaTensorOpMultiplicandCongruous<ElementSize>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

private:

  //
  // Data members
  //

  Base layout_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorVoltaTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorVoltaTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static RowMajorVoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
    return RowMajorVoltaTensorOpMultiplicandCongruous(extent.column());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.column(), coord.row()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.strided(), coord.contiguous());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return layout_.stride();
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return layout_.stride();
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
  }
};
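
// Illustrative sketch (not part of the original header): the matrix adapters above only reorder
// coordinates before delegating to the pitch-linear layout, so a column-major view treats rows
// as the contiguous dimension and a row-major view treats columns as the contiguous dimension.
//
//   using ColumnMajor = cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<16>;
//   using RowMajor    = cutlass::layout::RowMajorVoltaTensorOpMultiplicandCongruous<16>;
//
//   // 64 rows x 8 columns, column-major: packed() picks up the row extent as the stride.
//   ColumnMajor cm = ColumnMajor::packed(cutlass::MatrixCoord(64, 8));
//   // cm(coord) forwards to the pitch-linear layout as (contiguous = row, strided = column).
//
//   // 8 rows x 64 columns, row-major: packed() picks up the column extent as the stride.
//   RowMajor rm = RowMajor::packed(cutlass::MatrixCoord(8, 64));
//   // rm(coord) forwards to the pitch-linear layout as (contiguous = column, strided = row).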

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template based on element size (in bits) - defined in terms of pitch-linear memory.
// template <int ElementSize, Operand Operand>
template <int ElementSize>
struct VoltaTensorOpMultiplicandBCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = PitchLinearCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = 128;

  /// Fundamental tile shape in units of vector accesses
  using TileShape = PitchLinearShape<8, 4>;

  /// Fundamental partition shape in units of vector accesses
  using PartitionShape = PitchLinearShape<4, 4>;

  //
  // Static constants
  //

  static int const kElementSize = ElementSize;
  static int const kElementsPerAccess = kAccessSize / kElementSize;

  using PartitionCount = PitchLinearShape<
    TileShape::kContiguous / PartitionShape::kContiguous,
    TileShape::kStrided / PartitionShape::kStrided
  >;

  using AccessCount = PitchLinearShape<
    PartitionShape::kContiguous,
    PartitionShape::kStrided
  >;

private:

  //
  // Data members
  //

  /// Stride data member
  Stride stride_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  VoltaTensorOpMultiplicandBCongruous(Index ldm = 0): stride_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  VoltaTensorOpMultiplicandBCongruous(Stride stride): stride_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static VoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent) {
    return VoltaTensorOpMultiplicandBCongruous(extent[0]);
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (contiguous, strided)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {

    // First, compute c and s of vector within source (in units of vector accesses)
    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
    int vec_strided_idx = coord.strided();

    // Compute the fundamental tile being accessed
    int tile_contiguous_idx = vec_contiguous_idx / TileShape::kContiguous;
    int tile_strided_idx = vec_strided_idx / TileShape::kStrided;

    int tile_contiguous_residual = vec_contiguous_idx % TileShape::kContiguous;
    int tile_strided_residual = vec_strided_idx % TileShape::kStrided;

    // Then swizzle within the tile.
    // Swizzle pattern is (tid[1:0] << 3)|(tid & 0x4)|(tid[1:0])
    int permuted_strided_within_tile = (tile_contiguous_residual & 0x3);
    int permuted_contiguous_within_tile =
      (tile_strided_residual ^ permuted_strided_within_tile) |
      (tile_contiguous_residual & 0x4);

    // Compute final element location
    int element_contiguous =
      (tile_contiguous_idx * TileShape::kContiguous + permuted_contiguous_within_tile) *
        kElementsPerAccess +
      (coord.contiguous() % kElementsPerAccess);

    int element_strided = tile_strided_idx * TileShape::kStrided + permuted_strided_within_tile;

    return element_contiguous + element_strided * stride_[0];
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return stride_;
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return stride_;
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return extent[1] * stride_[0];
  }
};
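
// Illustrative note (not part of the original header): the A-operand and B-operand congruous
// layouts differ only in the intra-tile permutation. Written as pure functions of the tile
// residuals c in [0, 8) and s in [0, 4), in units of vector accesses:
//
//   VoltaTensorOpMultiplicandCongruous:   strided'    = c >> 1
//                                         contiguous' = (s ^ (c >> 1)) | ((c & 1) << 2)
//
//   VoltaTensorOpMultiplicandBCongruous:  strided'    = c & 0x3
//                                         contiguous' = (s ^ (c & 0x3)) | (c & 0x4)
//
// Both permutations are bijections on the 8 x 4 tile, so each swizzled tile stores exactly the
// same set of vectors, just in a different order.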

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandBCongruous
template <int ElementSize>
struct ColumnMajorVoltaTensorOpMultiplicandBCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// Underlying pitch-linear layout
  using Base = VoltaTensorOpMultiplicandBCongruous<ElementSize>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

private:

  //
  // Data members
  //

  Base layout_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorVoltaTensorOpMultiplicandBCongruous(Index ldm = 0): layout_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorVoltaTensorOpMultiplicandBCongruous(Stride stride): layout_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static ColumnMajorVoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent) {
    return ColumnMajorVoltaTensorOpMultiplicandBCongruous(extent.row());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.row(), coord.column()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.contiguous(), coord.strided());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return layout_.stride();
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return layout_.stride();
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandBCongruous
template <int ElementSize>
struct RowMajorVoltaTensorOpMultiplicandBCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// Underlying pitch-linear layout
  using Base = VoltaTensorOpMultiplicandBCongruous<ElementSize>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

private:

  //
  // Data members
  //

  Base layout_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorVoltaTensorOpMultiplicandBCongruous(Index ldm = 0): layout_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorVoltaTensorOpMultiplicandBCongruous(Stride stride): layout_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static RowMajorVoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent) {
    return RowMajorVoltaTensorOpMultiplicandBCongruous(extent.column());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.column(), coord.row()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.strided(), coord.contiguous());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return layout_.stride();
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return layout_.stride();
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template based on element size (in bits) and KBlock size - defined in terms of pitch-linear memory.
template <int ElementSize, int KBlock>
struct VoltaTensorOpMultiplicandCrosswise {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = PitchLinearCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// This layout is optimized for 64b accesses
  static int const kAccessSize = 64;

  //
  // Static constants
  //

  static int const kElementSize = ElementSize;
  static int const kElementsPerAccess = kAccessSize / kElementSize;
  static int const kKBlock = KBlock;

 private:
  //
  // Data members
  //

  /// Stride data member
  Stride stride_;

 public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  VoltaTensorOpMultiplicandCrosswise(Index ldm = 0) : stride_(ldm) {}

  /// Ctor
  CUTLASS_HOST_DEVICE
  VoltaTensorOpMultiplicandCrosswise(Stride stride) : stride_(stride) {}

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static VoltaTensorOpMultiplicandCrosswise packed(TensorCoord const &extent) {
    return VoltaTensorOpMultiplicandCrosswise(extent[1]);
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (contiguous, strided)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {

    //
    // First, compute c and s of vector within source (in units of vector accesses)
    //

    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
    int vec_strided_idx = coord.strided();

    //
    // Then swizzle. The mapping is like this:
    // id[1:0]|(id[3]^id[4])|id[2]
    //

    int vec_strided_within_tile = vec_contiguous_idx & 0x7;

    int permuted_vec_contiguous =
      (vec_strided_idx & (~0xF)) + (vec_strided_idx & 0x3) * 4 +
      (((vec_strided_idx >> 2) ^ ((vec_strided_idx & 0x10) >> 3)) & 0x3);

    permuted_vec_contiguous ^= ((vec_strided_within_tile >> 1) & 0x3);

    int permuted_vec_strided = vec_contiguous_idx;

    //
    // Compute final element location
    //

    int element_contiguous = permuted_vec_contiguous * kElementsPerAccess +
                             (coord.contiguous() % kElementsPerAccess);

    return element_contiguous + permuted_vec_strided * (stride_[0] * kElementsPerAccess);
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const { return stride_; }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride &stride() { return stride_; }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return extent[0] * stride_[0];
  }
};
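
// Illustrative sketch (not part of the original header): unlike the congruous layouts, the
// crosswise layout advances by stride_[0] * kElementsPerAccess elements per vector step in the
// logical contiguous dimension (see the final line of operator() above), and packed() takes the
// strided extent. Values below assume 16-bit elements, so kElementsPerAccess = 64 / 16 = 4.
//
//   using Layout = cutlass::layout::VoltaTensorOpMultiplicandCrosswise<16, 32>;
//
//   // Packed layout for a (contiguous, strided) extent of (32, 32): stride_[0] == extent[1] == 32
//   Layout layout = Layout::packed(cutlass::layout::PitchLinearCoord(32, 32));
//
//   // capacity() is extent[0] * stride_[0], i.e. the contiguous extent times the stride.
//   Layout::LongIndex size = layout.capacity(cutlass::layout::PitchLinearCoord(32, 32));  // 1024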

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandCrosswise
template <int ElementSize, int KBlock>
struct ColumnMajorVoltaTensorOpMultiplicandCrosswise {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// Underlying pitch-linear layout
  using Base = VoltaTensorOpMultiplicandCrosswise<ElementSize, KBlock>;

  /// This layout is optimized for 64b accesses
  static int const kAccessSize = Base::kAccessSize;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;

 private:
  //
  // Data members
  //

  Base layout_;

 public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorVoltaTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorVoltaTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static ColumnMajorVoltaTensorOpMultiplicandCrosswise packed(
      TensorCoord const &extent) {
    return ColumnMajorVoltaTensorOpMultiplicandCrosswise(extent.column());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.row(), coord.column()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.contiguous(), coord.strided());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const { return layout_.stride(); }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride &stride() { return layout_.stride(); }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandCrosswise
template <int ElementSize, int KBlock>
struct RowMajorVoltaTensorOpMultiplicandCrosswise {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// Underlying pitch-linear layout
  using Base = VoltaTensorOpMultiplicandCrosswise<ElementSize, KBlock>;

  /// This layout is optimized for 64b accesses
  static int const kAccessSize = Base::kAccessSize;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;

 private:
  //
  // Data members
  //

  Base layout_;

 public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorVoltaTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorVoltaTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static RowMajorVoltaTensorOpMultiplicandCrosswise packed(
      TensorCoord const &extent) {
    return RowMajorVoltaTensorOpMultiplicandCrosswise(extent.row());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.column(), coord.row()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.strided(), coord.contiguous());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const { return layout_.stride(); }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride &stride() { return layout_.stride(); }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
  }
};

} // namespace layout
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
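
// Minimal host-side sanity check (illustrative, not part of the original header): a swizzled
// layout must be a bijection over its packed extent, i.e. every logical coordinate maps to a
// distinct offset inside [0, capacity). Assumes this header and its dependencies are on the
// include path.
//
//   #include <cassert>
//   #include <set>
//   #include "cutlass/layout/tensor_op_multiplicand_sm70.h"
//
//   int main() {
//     using Layout = cutlass::layout::VoltaTensorOpMultiplicandCongruous<16>;
//     cutlass::layout::PitchLinearCoord extent(64, 8);
//     Layout layout = Layout::packed(extent);
//
//     std::set<Layout::LongIndex> offsets;
//     for (int s = 0; s < extent.strided(); ++s) {
//       for (int c = 0; c < extent.contiguous(); ++c) {
//         Layout::LongIndex offset = layout(cutlass::layout::PitchLinearCoord(c, s));
//         assert(offset >= 0 && offset < layout.capacity(extent));
//         assert(offsets.insert(offset).second);  // no two coordinates may collide
//       }
//     }
//     return 0;
//   }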