cutlass/tensor__op__multiplicand__sm75_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/coord.h"
 #include "cutlass/matrix_coord.h"
 #include "cutlass/layout/pitch_linear.h"


 namespace cutlass {
 namespace layout {


 /// Layout functor that maps a pitch-linear (contiguous, strided) coordinate
 /// to a linear element offset. Elements are grouped into vector accesses of
 /// kElementsPerAccess elements, and the position of each vector access within
 /// a TileShape-sized tile is permuted with XOR operations.
 ///
 /// NOTE(review): ElementSize and kAccessSize are presumably both expressed in
 /// bits - confirm against callers.
 template <int ElementSize, int Crosswise>
 struct TensorOpMultiplicand {
   /// Logical rank of tensor
   static int const kRank = 2;

   /// Rank of stride vector
   static int const kStrideRank = 1;

   /// Index type used for coordinates
   using Index = int32_t;

   /// Long index type used for offsets
   using LongIndex = int64_t;

   /// Logical coordinate
   using TensorCoord = PitchLinearCoord;

   /// Stride vector
   using Stride = Coord<kStrideRank, Index, LongIndex>;

   //
   // Static constants
   //

   /// Size of one vector access, in the same unit as ElementSize
   static int const kAccessSize = 128;

   /// Size of one element (the ElementSize template argument)
   static int const kElementSize = ElementSize;

   /// Number of elements covered by one vector access
   static int const kElementsPerAccess = kAccessSize / kElementSize;

   /// The Crosswise template argument
   static int const kCrosswise = Crosswise;

   /// Contiguous extent of the fundamental tile, in units of vector accesses
   static int const kTileShapeContiguous = 128 / (kAccessSize / 8);

   /// Number of consecutive strided coordinates that share one strided vector
   /// index: operator() divides the strided coordinate by this value and uses
   /// the remainder to select a position along the tile's contiguous dimension
   static int const kFactor =
       kTileShapeContiguous * kElementsPerAccess / kCrosswise;

   /// Strided extent of the fundamental tile: the larger of
   /// (kTileShapeContiguous / kFactor) and (32 / kTileShapeContiguous)
   static int const kTileShapeStride =
       ((kTileShapeContiguous / kFactor) > (32 / kTileShapeContiguous))
           ? (kTileShapeContiguous / kFactor)
           : (32 / kTileShapeContiguous);

   /// Fundamental tile shape, in units of vector accesses
   using TileShape = PitchLinearShape<kTileShapeContiguous, kTileShapeStride>;

   /// Shape of one partition within the fundamental tile, in units of vector
   /// accesses
   using PartitionShape = PitchLinearShape<4, 4>;

   /// Number of partitions along each dimension of the fundamental tile
   using PartitionCount =
       PitchLinearShape<TileShape::kContiguous / PartitionShape::kContiguous,
                        TileShape::kStrided / PartitionShape::kStrided>;

   /// Number of vector accesses along each dimension of one partition
   using AccessCount =
       PitchLinearShape<PartitionShape::kContiguous, PartitionShape::kStrided>;

  private:
   //
   // Data members
   //

   /// Stride data member; only element 0 is read by this layout
   Stride stride_;

  public:
   //
   // Methods
   //

   /// Ctor. Stores ldm as the single stride value.
   CUTLASS_HOST_DEVICE
   TensorOpMultiplicand(Index ldm = 0) : stride_(ldm) {}

   /// Ctor. Stores the given stride vector.
   CUTLASS_HOST_DEVICE
   TensorOpMultiplicand(Stride stride) : stride_(stride) {}

   /// Helper returns a layout to a tightly packed tensor: the stride is set to
   /// extent[0].
   CUTLASS_HOST_DEVICE
   static TensorOpMultiplicand packed(TensorCoord const &extent) {
     return TensorOpMultiplicand(extent[0]);
   }

   /// Returns the offset of a coordinate in linear memory.
   /// The coordinate is interpreted as (contiguous, strided).
   CUTLASS_HOST_DEVICE
   LongIndex operator()(TensorCoord const &coord) const {
     //
     // First, compute c and s of vector within source (in units of vector
     // accesses)
     //

     int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
     int vec_strided_idx = coord.strided() / kFactor;

     // Compute the fundamental tile being accessed
     int tile_contiguous_idx =
         vec_contiguous_idx / (TileShape::kContiguous / kFactor);

     // Position inside the tile: the kFactor strided coordinates that share a
     // strided vector index are laid out side by side along the contiguous
     // dimension.
     int tile_contiguous_residual =
         vec_contiguous_idx % (TileShape::kContiguous / kFactor) +
         ((coord.strided() % kFactor) * (TileShape::kContiguous / kFactor));
     int tile_strided_residual = vec_strided_idx % TileShape::kStrided;

     // Compute the 'partition' within the fundamental tile
     int partition_contiguous_idx =
         tile_contiguous_residual / PartitionShape::kContiguous;
     int partition_strided_idx =
         tile_strided_residual / PartitionShape::kStrided;

     int partition_contiguous_residual =
         tile_contiguous_residual % PartitionShape::kContiguous;
     int partition_strided_residual =
         tile_strided_residual % PartitionShape::kStrided;

     //
     // Then swizzle
     //

     int permuted_vec_contiguous_within_partition =
         partition_contiguous_residual ^ (partition_strided_residual % 4);

     int permuted_partition_contiguous_within_tile =
         partition_contiguous_idx ^ (partition_strided_idx % 2);

     //
     // Compute final element location
     //

     int element_contiguous = (tile_contiguous_idx * TileShape::kContiguous +
                               permuted_partition_contiguous_within_tile *
                                   PartitionShape::kContiguous +
                               permuted_vec_contiguous_within_partition) *
                                  kElementsPerAccess +
                              (coord.contiguous() % kElementsPerAccess);

     int element_strided = vec_strided_idx;

     // Widen before multiplying by the stride so the product is formed in
     // LongIndex; a 32-bit product could overflow before being converted to
     // the 64-bit return type.
     return LongIndex(element_contiguous) +
            LongIndex(element_strided) * LongIndex(stride_[0]) * kFactor;
   }

   /// Returns the stride of the layout
   CUTLASS_HOST_DEVICE
   Stride stride() const { return stride_; }

   /// Returns the stride of the layout (mutable reference)
   CUTLASS_HOST_DEVICE
   Stride &stride() { return stride_; }

   /// Returns extent[1] * stride, computed in LongIndex so that large extents
   /// do not overflow 32-bit arithmetic.
   CUTLASS_HOST_DEVICE
   LongIndex capacity(TensorCoord const &extent) const {
     return LongIndex(extent[1]) * LongIndex(stride_[0]);
   }
 };


 /// Pitch-linear layout that delegates every operation to an embedded
 /// TensorOpMultiplicand<ElementSize, Crosswise> instance and re-exports that
 /// layout's shape constants.
 template <int ElementSize, int Crosswise>
 struct TensorOpMultiplicandCongruous {
   /// Logical rank of tensor
   static int const kRank = 2;

   /// Rank of stride vector
   static int const kStrideRank = 1;

   /// Index type used for coordinates
   using Index = int32_t;

   /// Long index type used for offsets
   using LongIndex = int64_t;

   /// Logical coordinate
   using TensorCoord = PitchLinearCoord;

   /// Stride vector
   using Stride = Coord<kStrideRank, Index, LongIndex>;

   //
   // Properties re-exported from the wrapped layout
   //

   /// Layout that performs the actual address computation
   using Base = TensorOpMultiplicand<ElementSize, Crosswise>;

   static int const kAccessSize = Base::kAccessSize;
   using TileShape = typename Base::TileShape;
   using PartitionShape = typename Base::PartitionShape;

   static int const kElementSize = Base::kElementSize;
   static int const kElementsPerAccess = Base::kElementsPerAccess;
   using PartitionCount = typename Base::PartitionCount;
   using AccessCount = typename Base::AccessCount;

  private:
   /// Wrapped layout object
   Base layout_;

  public:
   /// Ctor. Forwards ldm to the wrapped layout.
   CUTLASS_HOST_DEVICE
   TensorOpMultiplicandCongruous(Index ldm = 0) : layout_(ldm) {}

   /// Ctor. Forwards the stride vector to the wrapped layout.
   CUTLASS_HOST_DEVICE
   TensorOpMultiplicandCongruous(Stride stride) : layout_(stride) {}

   /// Helper returns a layout to a tightly packed tensor, using extent[0] as
   /// the leading dimension.
   CUTLASS_HOST_DEVICE
   static TensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
     Index const leading_dim = extent[0];
     return TensorOpMultiplicandCongruous(leading_dim);
   }

   /// Returns the linear offset the wrapped layout assigns to coord.
   CUTLASS_HOST_DEVICE
   LongIndex operator()(TensorCoord const &coord) const {
     LongIndex const offset = layout_(coord);
     return offset;
   }

   /// Inverse of layout function, mapping linear offset to logical coordinate.
   /// NOTE(review): the TensorOpMultiplicand template visible in this file
   /// declares no inverse() member, so this function would only compile if
   /// such a member exists elsewhere - confirm before relying on it.
   CUTLASS_HOST_DEVICE
   TensorCoord inverse(LongIndex offset) const {
     return layout_.inverse(offset);
   }

   /// Returns the stride of the layout
   CUTLASS_HOST_DEVICE
   Stride stride() const { return layout_.stride(); }

   /// Returns the stride of the layout (mutable reference)
   CUTLASS_HOST_DEVICE
   Stride &stride() { return layout_.stride(); }

   /// Returns the capacity the wrapped layout reports for extent.
   CUTLASS_HOST_DEVICE
   LongIndex capacity(TensorCoord const &extent) const {
     LongIndex const element_count = layout_.capacity(extent);
     return element_count;
   }
 };


 /// Partial specialization of TensorOpMultiplicandCongruous for an element
 /// size of 32. It computes offsets directly instead of delegating to
 /// TensorOpMultiplicand, and it declares no inverse() member.
 template <int Crosswise>
 struct TensorOpMultiplicandCongruous<32, Crosswise> {
   /// Logical rank of tensor
   static int const kRank = 2;

   /// Rank of stride vector
   static int const kStrideRank = 1;

   /// Index type used for coordinates
   using Index = int32_t;

   /// Long index type used for offsets
   using LongIndex = int64_t;

   /// Logical coordinate
   using TensorCoord = PitchLinearCoord;

   /// Stride vector
   using Stride = Coord<kStrideRank, Index, LongIndex>;

   //
   // Invariants
   //

   /// Size of one vector access, in the same unit as kElementSize
   static int const kAccessSize = 128;

   /// Fundamental tile shape
   using TileShape = PitchLinearShape<8, 4>;

   /// Partition shape; identical to TileShape, so PartitionCount is <1, 1>
   using PartitionShape = PitchLinearShape<8, 4>;

   /// Number of partitions along each dimension of the fundamental tile
   using PartitionCount =
       PitchLinearShape<TileShape::kContiguous / PartitionShape::kContiguous,
                        TileShape::kStrided / PartitionShape::kStrided>;

   /// Number of vector accesses along each dimension of one partition
   using AccessCount =
       PitchLinearShape<PartitionShape::kContiguous, PartitionShape::kStrided>;

   //
   // Static constants
   //

   /// Element size handled by this specialization
   static int const kElementSize = 32;

   /// Number of elements covered by one vector access (128 / 32 = 4)
   static int const kElementsPerAccess = kAccessSize / kElementSize;

  private:
   //
   // Data members
   //

   /// Stride data member; only element 0 is read by this layout
   Stride stride_;

  public:
   //
   // Methods
   //

   /// Ctor. Stores ldm as the single stride value.
   CUTLASS_HOST_DEVICE
   TensorOpMultiplicandCongruous(Index ldm = 0) : stride_(ldm) {}

   /// Ctor. Stores the given stride vector.
   CUTLASS_HOST_DEVICE
   TensorOpMultiplicandCongruous(Stride stride) : stride_(stride) {}

   /// Helper returns a layout to a tightly packed tensor: the stride is set to
   /// extent[0].
   CUTLASS_HOST_DEVICE
   static TensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
     return TensorOpMultiplicandCongruous(extent[0]);
   }

   /// Returns the offset of a coordinate in linear memory.
   /// The coordinate is interpreted as (contiguous, strided).
   CUTLASS_HOST_DEVICE
   LongIndex operator()(TensorCoord const &coord) const {
     // Tile containing the coordinate: a tile spans 32 elements along the
     // contiguous dimension and 4 rows along the strided dimension.
     int tc = coord.contiguous() / 32;
     int ts = coord.strided() / 4;

     // Vector-access column and row inside that tile.
     int c = (coord.contiguous() % 32) / kElementsPerAccess;
     int s = coord.strided() % 4;

     // Widen the stride once so that every product involving it is formed in
     // LongIndex; a 32-bit product could overflow before being converted to
     // the 64-bit result.
     LongIndex const ldm = stride_[0];

     // The vector column is XOR-permuted with twice the row index; the last
     // term restores the element position inside the vector access.
     LongIndex offset = (c ^ (2 * s)) * kElementsPerAccess + s * ldm +
                        tc * 32 + ts * ldm * 4 + coord.contiguous() % 4;

     return offset;
   }

   /// Returns the stride of the layout
   CUTLASS_HOST_DEVICE
   Stride stride() const { return stride_; }

   /// Returns the stride of the layout (mutable reference)
   CUTLASS_HOST_DEVICE
   Stride &stride() { return stride_; }

   /// Returns extent[1] * stride, computed in LongIndex so that large extents
   /// do not overflow 32-bit arithmetic.
   CUTLASS_HOST_DEVICE
   LongIndex capacity(TensorCoord const &extent) const {
     return LongIndex(extent[1]) * LongIndex(stride_[0]);
   }
 };


 /// Adapter exposing TensorOpMultiplicandCongruous through MatrixCoord
 /// coordinates: the row is passed as the contiguous coordinate and the
 /// column as the strided coordinate.
 template <int ElementSize, int Crosswise>
 struct ColumnMajorTensorOpMultiplicandCongruous {

   /// Logical rank of tensor
   static int const kRank = 2;

   /// Rank of stride vector
   static int const kStrideRank = 1;

   /// Index type used for coordinates
   using Index = int32_t;

   /// Long index type used for offsets
   using LongIndex = int64_t;

   /// Logical coordinate
   using TensorCoord = MatrixCoord;

   /// Stride vector
   using Stride = Coord<kStrideRank, Index, LongIndex>;

   //
   // Properties re-exported from the wrapped layout
   //

   /// Pitch-linear layout that performs the actual address computation
   using Base = TensorOpMultiplicandCongruous<ElementSize, Crosswise>;

   static int const kAccessSize = Base::kAccessSize;
   using TileShape = typename Base::TileShape;
   using PartitionShape = typename Base::PartitionShape;

   static int const kElementSize = Base::kElementSize;
   static int const kElementsPerAccess = Base::kElementsPerAccess;
   using PartitionCount = typename Base::PartitionCount;
   using AccessCount = typename Base::AccessCount;

 private:

   /// Wrapped pitch-linear layout object
   Base layout_;

 public:

   /// Ctor. Forwards ldm to the wrapped layout.
   CUTLASS_HOST_DEVICE
   ColumnMajorTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }

   /// Ctor. Forwards the stride vector to the wrapped layout.
   CUTLASS_HOST_DEVICE
   ColumnMajorTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }

   /// Helper returns a layout to a tightly packed tensor, using the number of
   /// rows in extent as the leading dimension.
   CUTLASS_HOST_DEVICE
   static ColumnMajorTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
     Index const leading_dim = extent.row();
     return ColumnMajorTensorOpMultiplicandCongruous(leading_dim);
   }

   /// Returns the linear offset of a (row, column) coordinate.
   CUTLASS_HOST_DEVICE
   LongIndex operator()(TensorCoord const &coord) const {
     PitchLinearCoord const pitch_linear(coord.row(), coord.column());
     return layout_(pitch_linear);
   }

   /// Inverse of layout function, mapping linear offset to logical coordinate.
   /// NOTE(review): of the Base templates visible in this file, the <32, ...>
   /// specialization declares no inverse() and the primary template forwards
   /// to a member TensorOpMultiplicand does not declare - confirm this is
   /// usable before calling it.
   CUTLASS_HOST_DEVICE
   TensorCoord inverse(LongIndex offset) const {
     PitchLinearCoord const pitch_linear = layout_.inverse(offset);
     return TensorCoord(pitch_linear.contiguous(), pitch_linear.strided());
   }

   /// Returns the stride of the layout
   CUTLASS_HOST_DEVICE
   Stride stride() const {
     return layout_.stride();
   }

   /// Returns the stride of the layout (mutable reference)
   CUTLASS_HOST_DEVICE
   Stride & stride() {
     return layout_.stride();
   }

   /// Returns the capacity the wrapped layout reports for a (rows, columns)
   /// extent.
   CUTLASS_HOST_DEVICE
   LongIndex capacity(TensorCoord const &extent) const {
     PitchLinearCoord const pitch_linear_extent(extent.row(), extent.column());
     return layout_.capacity(pitch_linear_extent);
   }
 };


 /// Adapter exposing TensorOpMultiplicandCongruous through MatrixCoord
 /// coordinates: the column is passed as the contiguous coordinate and the
 /// row as the strided coordinate.
 template <int ElementSize, int Crosswise>
 struct RowMajorTensorOpMultiplicandCongruous {

   /// Logical rank of tensor
   static int const kRank = 2;

   /// Rank of stride vector
   static int const kStrideRank = 1;

   /// Index type used for coordinates
   using Index = int32_t;

   /// Long index type used for offsets
   using LongIndex = int64_t;

   /// Logical coordinate
   using TensorCoord = MatrixCoord;

   /// Stride vector
   using Stride = Coord<kStrideRank, Index, LongIndex>;

   //
   // Properties re-exported from the wrapped layout
   //

   /// Pitch-linear layout that performs the actual address computation
   using Base = TensorOpMultiplicandCongruous<ElementSize, Crosswise>;

   static int const kAccessSize = Base::kAccessSize;
   using TileShape = typename Base::TileShape;
   using PartitionShape = typename Base::PartitionShape;

   static int const kElementSize = Base::kElementSize;
   static int const kElementsPerAccess = Base::kElementsPerAccess;
   using PartitionCount = typename Base::PartitionCount;
   using AccessCount = typename Base::AccessCount;

 private:

   /// Wrapped pitch-linear layout object
   Base layout_;

 public:

   /// Ctor. Forwards ldm to the wrapped layout.
   CUTLASS_HOST_DEVICE
   RowMajorTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }

   /// Ctor. Forwards the stride vector to the wrapped layout.
   CUTLASS_HOST_DEVICE
   RowMajorTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }

   /// Helper returns a layout to a tightly packed tensor, using the number of
   /// columns in extent as the leading dimension.
   CUTLASS_HOST_DEVICE
   static RowMajorTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
     Index const leading_dim = extent.column();
     return RowMajorTensorOpMultiplicandCongruous(leading_dim);
   }

   /// Returns the linear offset of a (row, column) coordinate.
   CUTLASS_HOST_DEVICE
   LongIndex operator()(TensorCoord const &coord) const {
     PitchLinearCoord const pitch_linear(coord.column(), coord.row());
     return layout_(pitch_linear);
   }

   /// Inverse of layout function, mapping linear offset to logical coordinate.
   /// NOTE(review): of the Base templates visible in this file, the <32, ...>
   /// specialization declares no inverse() and the primary template forwards
   /// to a member TensorOpMultiplicand does not declare - confirm this is
   /// usable before calling it.
   CUTLASS_HOST_DEVICE
   TensorCoord inverse(LongIndex offset) const {
     PitchLinearCoord const pitch_linear = layout_.inverse(offset);
     return TensorCoord(pitch_linear.strided(), pitch_linear.contiguous());
   }

   /// Returns the stride of the layout
   CUTLASS_HOST_DEVICE
   Stride stride() const {
     return layout_.stride();
   }

   /// Returns the stride of the layout (mutable reference)
   CUTLASS_HOST_DEVICE
   Stride & stride() {
     return layout_.stride();
   }

   /// Returns the capacity the wrapped layout reports for a (rows, columns)
   /// extent.
   CUTLASS_HOST_DEVICE
   LongIndex capacity(TensorCoord const &extent) const {
     PitchLinearCoord const pitch_linear_extent(extent.column(), extent.row());
     return layout_.capacity(pitch_linear_extent);
   }
 };


 /// Pitch-linear layout that delegates every operation to an embedded
 /// TensorOpMultiplicand<ElementSize, Crosswise> instance and re-exports that
 /// layout's shape constants, including kCrosswise and kFactor.
 template <int ElementSize, int Crosswise>
 struct TensorOpMultiplicandCrosswise {
   /// Logical rank of tensor
   static int const kRank = 2;

   /// Rank of stride vector
   static int const kStrideRank = 1;

   /// Index type used for coordinates
   using Index = int32_t;

   /// Long index type used for offsets
   using LongIndex = int64_t;

   /// Logical coordinate
   using TensorCoord = PitchLinearCoord;

   /// Stride vector
   using Stride = Coord<kStrideRank, Index, LongIndex>;

   //
   // Properties re-exported from the wrapped layout
   //

   /// Layout that performs the actual address computation
   using Base = TensorOpMultiplicand<ElementSize, Crosswise>;

   static int const kAccessSize = Base::kAccessSize;
   using TileShape = typename Base::TileShape;
   using PartitionShape = typename Base::PartitionShape;

   static int const kElementSize = Base::kElementSize;
   static int const kElementsPerAccess = Base::kElementsPerAccess;
   static int const kCrosswise = Base::kCrosswise;
   static int const kFactor = Base::kFactor;
   using PartitionCount = typename Base::PartitionCount;
   using AccessCount = typename Base::AccessCount;

  private:
   /// Wrapped layout object
   Base layout_;

  public:
   /// Ctor. Forwards ldm to the wrapped layout.
   CUTLASS_HOST_DEVICE
   TensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}

   /// Ctor. Forwards the stride vector to the wrapped layout.
   CUTLASS_HOST_DEVICE
   TensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}

   /// Helper returns a layout to a tightly packed tensor, using extent[0] as
   /// the leading dimension.
   CUTLASS_HOST_DEVICE
   static TensorOpMultiplicandCrosswise packed(TensorCoord const &extent) {
     Index const leading_dim = extent[0];
     return TensorOpMultiplicandCrosswise(leading_dim);
   }

   /// Returns the linear offset the wrapped layout assigns to coord.
   CUTLASS_HOST_DEVICE
   LongIndex operator()(TensorCoord const &coord) const {
     LongIndex const offset = layout_(coord);
     return offset;
   }

   /// Inverse of layout function, mapping linear offset to logical coordinate.
   /// NOTE(review): the TensorOpMultiplicand template visible in this file
   /// declares no inverse() member, so this function would only compile if
   /// such a member exists elsewhere - confirm before relying on it.
   CUTLASS_HOST_DEVICE
   TensorCoord inverse(LongIndex offset) const {
     return layout_.inverse(offset);
   }

   /// Returns the stride of the layout
   CUTLASS_HOST_DEVICE
   Stride stride() const { return layout_.stride(); }

   /// Returns the stride of the layout (mutable reference)
   CUTLASS_HOST_DEVICE
   Stride &stride() { return layout_.stride(); }

   /// Returns the capacity the wrapped layout reports for extent.
   CUTLASS_HOST_DEVICE
   LongIndex capacity(TensorCoord const &extent) const {
     LongIndex const element_count = layout_.capacity(extent);
     return element_count;
   }
 };


 /// Adapter exposing TensorOpMultiplicandCrosswise through MatrixCoord
 /// coordinates: the row is passed as the contiguous coordinate and the
 /// column as the strided coordinate.
 template <int ElementSize, int Crosswise>
 struct ColumnMajorTensorOpMultiplicandCrosswise {
   /// Logical rank of tensor
   static int const kRank = 2;

   /// Rank of stride vector
   static int const kStrideRank = 1;

   /// Index type used for coordinates
   using Index = int32_t;

   /// Long index type used for offsets
   using LongIndex = int64_t;

   /// Logical coordinate
   using TensorCoord = MatrixCoord;

   /// Stride vector
   using Stride = Coord<kStrideRank, Index, LongIndex>;

   //
   // Properties re-exported from the wrapped layout
   //

   /// Pitch-linear layout that performs the actual address computation
   using Base = TensorOpMultiplicandCrosswise<ElementSize, Crosswise>;

   static int const kAccessSize = Base::kAccessSize;
   using TileShape = typename Base::TileShape;
   using PartitionShape = typename Base::PartitionShape;

   static int const kElementSize = Base::kElementSize;
   static int const kElementsPerAccess = Base::kElementsPerAccess;
   using PartitionCount = typename Base::PartitionCount;
   using AccessCount = typename Base::AccessCount;

  private:
   /// Wrapped pitch-linear layout object
   Base layout_;

  public:
   /// Ctor. Forwards ldm to the wrapped layout.
   CUTLASS_HOST_DEVICE
   ColumnMajorTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}

   /// Ctor. Forwards the stride vector to the wrapped layout.
   CUTLASS_HOST_DEVICE
   ColumnMajorTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}

   /// Helper returns a layout to a tightly packed tensor, using the number of
   /// rows in extent as the leading dimension.
   CUTLASS_HOST_DEVICE
   static ColumnMajorTensorOpMultiplicandCrosswise packed(
       TensorCoord const &extent) {
     Index const leading_dim = extent.row();
     return ColumnMajorTensorOpMultiplicandCrosswise(leading_dim);
   }

   /// Returns the linear offset of a (row, column) coordinate.
   CUTLASS_HOST_DEVICE
   LongIndex operator()(TensorCoord const &coord) const {
     PitchLinearCoord const pitch_linear(coord.row(), coord.column());
     return layout_(pitch_linear);
   }

   /// Inverse of layout function, mapping linear offset to logical coordinate.
   /// NOTE(review): Base::inverse() in this file forwards to a member that
   /// the visible TensorOpMultiplicand template does not declare - confirm
   /// this is usable before calling it.
   CUTLASS_HOST_DEVICE
   TensorCoord inverse(LongIndex offset) const {
     PitchLinearCoord const pitch_linear = layout_.inverse(offset);
     return TensorCoord(pitch_linear.contiguous(), pitch_linear.strided());
   }

   /// Returns the stride of the layout
   CUTLASS_HOST_DEVICE
   Stride stride() const { return layout_.stride(); }

   /// Returns the stride of the layout (mutable reference)
   CUTLASS_HOST_DEVICE
   Stride &stride() { return layout_.stride(); }

   /// Returns the capacity the wrapped layout reports for a (rows, columns)
   /// extent.
   CUTLASS_HOST_DEVICE
   LongIndex capacity(TensorCoord const &extent) const {
     PitchLinearCoord const pitch_linear_extent(extent.row(), extent.column());
     return layout_.capacity(pitch_linear_extent);
   }
 };


 /// Adapter exposing TensorOpMultiplicandCrosswise through MatrixCoord
 /// coordinates: the column is passed as the contiguous coordinate and the
 /// row as the strided coordinate.
 template <int ElementSize, int Crosswise>
 struct RowMajorTensorOpMultiplicandCrosswise {
   /// Logical rank of tensor
   static int const kRank = 2;

   /// Rank of stride vector
   static int const kStrideRank = 1;

   /// Index type used for coordinates
   using Index = int32_t;

   /// Long index type used for offsets
   using LongIndex = int64_t;

   /// Logical coordinate
   using TensorCoord = MatrixCoord;

   /// Stride vector
   using Stride = Coord<kStrideRank, Index, LongIndex>;

   //
   // Properties re-exported from the wrapped layout
   //

   /// Pitch-linear layout that performs the actual address computation
   using Base = TensorOpMultiplicandCrosswise<ElementSize, Crosswise>;

   static int const kAccessSize = Base::kAccessSize;
   using TileShape = typename Base::TileShape;
   using PartitionShape = typename Base::PartitionShape;

   static int const kElementSize = Base::kElementSize;
   static int const kElementsPerAccess = Base::kElementsPerAccess;
   using PartitionCount = typename Base::PartitionCount;
   using AccessCount = typename Base::AccessCount;

  private:
   /// Wrapped pitch-linear layout object
   Base layout_;

  public:
   /// Ctor. Forwards ldm to the wrapped layout.
   CUTLASS_HOST_DEVICE
   RowMajorTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}

   /// Ctor. Forwards the stride vector to the wrapped layout.
   CUTLASS_HOST_DEVICE
   RowMajorTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}

   /// Helper returns a layout to a tightly packed tensor, using the number of
   /// columns in extent as the leading dimension.
   CUTLASS_HOST_DEVICE
   static RowMajorTensorOpMultiplicandCrosswise packed(
       TensorCoord const &extent) {
     Index const leading_dim = extent.column();
     return RowMajorTensorOpMultiplicandCrosswise(leading_dim);
   }

   /// Returns the linear offset of a (row, column) coordinate.
   CUTLASS_HOST_DEVICE
   LongIndex operator()(TensorCoord const &coord) const {
     PitchLinearCoord const pitch_linear(coord.column(), coord.row());
     return layout_(pitch_linear);
   }

   /// Inverse of layout function, mapping linear offset to logical coordinate.
   /// NOTE(review): Base::inverse() in this file forwards to a member that
   /// the visible TensorOpMultiplicand template does not declare - confirm
   /// this is usable before calling it.
   CUTLASS_HOST_DEVICE
   TensorCoord inverse(LongIndex offset) const {
     PitchLinearCoord const pitch_linear = layout_.inverse(offset);
     return TensorCoord(pitch_linear.strided(), pitch_linear.contiguous());
   }

   /// Returns the stride of the layout
   CUTLASS_HOST_DEVICE
   Stride stride() const { return layout_.stride(); }

   /// Returns the stride of the layout (mutable reference)
   CUTLASS_HOST_DEVICE
   Stride &stride() { return layout_.stride(); }

   /// Returns the capacity the wrapped layout reports for a (rows, columns)
   /// extent.
   CUTLASS_HOST_DEVICE
   LongIndex capacity(TensorCoord const &extent) const {
     PitchLinearCoord const pitch_linear_extent(extent.column(), extent.row());
     return layout_.capacity(pitch_linear_extent);
   }
 };


 /// Layout functor for an interleaved arrangement with interleaving factor
 /// InterleavedK. Coordinates are pitch-linear (contiguous, strided); offsets
 /// are produced in rows of 128 elements, with 16-element blocks inside each
 /// row permuted by the parity of the row index.
 template <int ElementSize, int InterleavedK>
 struct TensorOpMultiplicandColumnMajorInterleaved {

   /// Logical rank of tensor
   static int const kRank = 2;

   /// Rank of stride vector
   static int const kStrideRank = 1;

   /// Index type used for coordinates
   using Index = int32_t;

   /// Long index type used for offsets
   using LongIndex = int64_t;

   /// Logical coordinate
   using TensorCoord = PitchLinearCoord;

   /// Stride vector
   using Stride = Coord<kStrideRank, Index, LongIndex>;

   //
   // Invariants
   //

   /// Size of one vector access, in the same unit as ElementSize
   static int const kAccessSize = 128;

   //
   // Static constants
   //

   /// Size of one element (the ElementSize template argument)
   static int const kElementSize = ElementSize;

   /// Number of elements covered by one vector access
   static int const kElementsPerAccess = kAccessSize / kElementSize;

   //static int const kThreadBlockStrided = ThreadBlockStrided;

   /// Interleaving factor (the InterleavedK template argument)
   static int const kInterleavedK = InterleavedK;

 private:

   //
   // Data members
   //

   /// Stride data member; read by capacity() but not by operator()
   Stride stride_;

 public:
   //
   // Methods
   //

   /// Ctor. Stores ldm as the single stride value.
   CUTLASS_HOST_DEVICE
   TensorOpMultiplicandColumnMajorInterleaved(Index ldm = 0): stride_(ldm) { }

   /// Ctor. Stores the given stride vector.
   CUTLASS_HOST_DEVICE
   TensorOpMultiplicandColumnMajorInterleaved(Stride stride): stride_(stride) { }

   /// Helper returns a layout to a tightly packed tensor: the stride is set to
   /// extent[0] * kInterleavedK.
   CUTLASS_HOST_DEVICE
   static TensorOpMultiplicandColumnMajorInterleaved packed(TensorCoord const &extent) {
     return TensorOpMultiplicandColumnMajorInterleaved(extent[0] * kInterleavedK);
   }

   /// Returns the offset of the 16-element block containing coord.
   /// The coordinate is interpreted as (contiguous, strided).
   ///
   /// NOTE(review): the shift pair below discards the low four bits of the
   /// column index, so every coordinate within a 16-element block maps to the
   /// same offset, and stride_ is not consulted - confirm callers expect this.
   CUTLASS_HOST_DEVICE
   LongIndex operator()(TensorCoord const &coord) const {
     // Number of strided coordinates packed into one 128-element row.
     int const rows_per_smem_cache_line = 128 / kInterleavedK;

     int row_id = coord.strided() / rows_per_smem_cache_line;
     int col_id = (coord.strided() % rows_per_smem_cache_line) * kInterleavedK + coord.contiguous();

     // Permute 16-element blocks: odd rows swap each pair of adjacent blocks.
     int access_block_id = col_id >> 4;
     int swizzle_access_block_id = access_block_id ^ (row_id & 1);

     int swizzle_col_id = swizzle_access_block_id << 4;

     // Widen before multiplying so the result is formed in LongIndex rather
     // than in 32-bit int.
     return LongIndex(row_id) * 128 + swizzle_col_id;
   }

   /// Returns the stride of the layout
   CUTLASS_HOST_DEVICE
   Stride stride() const {
     return stride_;
   }

   /// Returns the stride of the layout (mutable reference)
   CUTLASS_HOST_DEVICE
   Stride & stride() {
     return stride_;
   }

   /// Returns (extent[1] / kInterleavedK) * stride, with the product computed
   /// in LongIndex so that large extents do not overflow 32-bit arithmetic.
   CUTLASS_HOST_DEVICE
   LongIndex capacity(TensorCoord const &extent) const {
     return LongIndex(extent[1] / kInterleavedK) * LongIndex(stride_[0]);
   }
 };


 /// Layout functor for an interleaved arrangement with interleaving factor
 /// InterleavedK. Coordinates are pitch-linear (contiguous, strided); offsets
 /// are produced in rows of 128 elements, with 16-element blocks inside each
 /// row permuted by the parity of the row index. packed() and capacity() use
 /// extent[1] and extent[0] respectively.
 template <int ElementSize, int InterleavedK>
 struct TensorOpMultiplicandRowMajorInterleaved {

   /// Logical rank of tensor
   static int const kRank = 2;

   /// Rank of stride vector
   static int const kStrideRank = 1;

   /// Index type used for coordinates
   using Index = int32_t;

   /// Long index type used for offsets
   using LongIndex = int64_t;

   /// Logical coordinate
   using TensorCoord = PitchLinearCoord;

   /// Stride vector
   using Stride = Coord<kStrideRank, Index, LongIndex>;

   //
   // Invariants
   //

   /// Size of one vector access, in the same unit as ElementSize
   static int const kAccessSize = 128;

   //
   // Static constants
   //

   /// Size of one element (the ElementSize template argument)
   static int const kElementSize = ElementSize;

   /// Number of elements covered by one vector access
   static int const kElementsPerAccess = kAccessSize / kElementSize;

   //static int const kThreadBlockStrided = ThreadBlockStrided;

   /// Interleaving factor (the InterleavedK template argument)
   static int const kInterleavedK = InterleavedK;

 private:

   //
   // Data members
   //

   /// Stride data member; read by capacity() but not by operator()
   Stride stride_;

 public:
   //
   // Methods
   //

   /// Ctor. Stores ldm as the single stride value.
   CUTLASS_HOST_DEVICE
   TensorOpMultiplicandRowMajorInterleaved(Index ldm = 0): stride_(ldm) { }

   /// Ctor. Stores the given stride vector.
   CUTLASS_HOST_DEVICE
   TensorOpMultiplicandRowMajorInterleaved(Stride stride): stride_(stride) { }

   /// Helper returns a layout to a tightly packed tensor: the stride is set to
   /// extent[1] * kInterleavedK.
   CUTLASS_HOST_DEVICE
   static TensorOpMultiplicandRowMajorInterleaved packed(TensorCoord const &extent) {
     return TensorOpMultiplicandRowMajorInterleaved(extent[1] * kInterleavedK);
   }

   /// Returns the offset of the 16-element block containing coord.
   /// The coordinate is interpreted as (contiguous, strided).
   ///
   /// NOTE(review): the shift pair below discards the low four bits of the
   /// column index, so every coordinate within a 16-element block maps to the
   /// same offset, and stride_ is not consulted - confirm callers expect this.
   CUTLASS_HOST_DEVICE
   LongIndex operator()(TensorCoord const &coord) const {
     // Number of strided coordinates packed into one 128-element row.
     int const rows_per_smem_cache_line = 128 / kInterleavedK;

     int row_id = coord.strided() / rows_per_smem_cache_line;
     int col_id = (coord.strided() % rows_per_smem_cache_line) * kInterleavedK + coord.contiguous();

     // Permute 16-element blocks: odd rows swap each pair of adjacent blocks.
     int access_block_id = col_id >> 4;
     int swizzle_access_block_id = access_block_id ^ (row_id & 1);

     int swizzle_col_id = swizzle_access_block_id << 4;

     // Widen before multiplying so the result is formed in LongIndex rather
     // than in 32-bit int.
     return LongIndex(row_id) * 128 + swizzle_col_id;
   }

   /// Returns the stride of the layout
   CUTLASS_HOST_DEVICE
   Stride stride() const {
     return stride_;
   }

   /// Returns the stride of the layout (mutable reference)
   CUTLASS_HOST_DEVICE
   Stride & stride() {
     return stride_;
   }

   /// Returns (extent[0] / kInterleavedK) * stride, with the product computed
   /// in LongIndex so that large extents do not overflow 32-bit arithmetic.
   CUTLASS_HOST_DEVICE
   LongIndex capacity(TensorCoord const &extent) const {
     return LongIndex(extent[0] / kInterleavedK) * LongIndex(stride_[0]);
   }
 };


 } // namespace layout
 } // namespace cutlass

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::LongIndex
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:434

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::AccessCount
typename Base::AccessCount AccessCount
Definition: tensor_op_multiplicand_sm75.h:460

cutlass::layout::TensorOpMultiplicandCongruous::TileShape
typename Base::TileShape TileShape
Definition: tensor_op_multiplicand_sm75.h:240

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::ColumnMajorTensorOpMultiplicandCrosswise
CUTLASS_HOST_DEVICE ColumnMajorTensorOpMultiplicandCrosswise(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:791

cutlass::MatrixCoord::column
CUTLASS_HOST_DEVICE Index const & column() const
Returns the column of the coordinate.
Definition: matrix_coord.h:85

cutlass::layout::TensorOpMultiplicandCongruous< 32, Crosswise >::capacity
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Definition: tensor_op_multiplicand_sm75.h:412

cutlass::layout::TensorOpMultiplicandColumnMajorInterleaved::Index
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:944

cutlass::layout::TensorOpMultiplicandCrosswise::PartitionCount
typename Base::PartitionCount PartitionCount
Definition: tensor_op_multiplicand_sm75.h:670

cutlass::layout::TensorOpMultiplicandCrosswise::Index
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:640

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::RowMajorTensorOpMultiplicandCrosswise
CUTLASS_HOST_DEVICE RowMajorTensorOpMultiplicandCrosswise(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:888

cutlass
Definition: aligned_buffer.h:35

cutlass::layout::TensorOpMultiplicand::operator()
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm75.h:137

cutlass::layout::PitchLinearCoord
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::PartitionShape
typename Base::PartitionShape PartitionShape
Definition: tensor_op_multiplicand_sm75.h:762

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::PartitionShape
typename Base::PartitionShape PartitionShape
Definition: tensor_op_multiplicand_sm75.h:863

cutlass::layout::TensorOpMultiplicandCrosswise::stride
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:719

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::Index
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:431

cutlass::layout::TensorOpMultiplicandCongruous< 32, Crosswise >::Index
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:321

cutlass::layout::TensorOpMultiplicandCrosswise::TensorOpMultiplicandCrosswise
CUTLASS_HOST_DEVICE TensorOpMultiplicandCrosswise(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:691

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::Index
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:536

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::inverse
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex offset) const
Inverse of layout function, mapping linear offset to logical coordinate.
Definition: tensor_op_multiplicand_sm75.h:910

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::operator()
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm75.h:803

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm75.h:734

cutlass::layout::TensorOpMultiplicand::kRank
static int const kRank
Logical rank of tensor.
Definition: tensor_op_multiplicand_sm75.h:48

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::LongIndex
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:539

cutlass::layout::TensorOpMultiplicandCongruous::inverse
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex offset) const
Inverse of layout function, mapping linear offset to logical coordinate.
Definition: tensor_op_multiplicand_sm75.h:287

cutlass::layout::TensorOpMultiplicandColumnMajorInterleaved::TensorOpMultiplicandColumnMajorInterleaved
CUTLASS_HOST_DEVICE TensorOpMultiplicandColumnMajorInterleaved(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:992

cutlass::layout::TensorOpMultiplicand::Index
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:54

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::PartitionCount
typename Base::PartitionCount PartitionCount
Definition: tensor_op_multiplicand_sm75.h:459

cutlass::layout::TensorOpMultiplicandCongruous::Index
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:221

coord.h
A Coord is a coordinate of arbitrary rank into a tensor or matrix.

cutlass::layout::TensorOpMultiplicandRowMajorInterleaved::LongIndex
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:1052

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::packed
static CUTLASS_HOST_DEVICE RowMajorTensorOpMultiplicandCongruous packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm75.h:590

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous
Definition: tensor_op_multiplicand_sm75.h:422

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::packed
static CUTLASS_HOST_DEVICE RowMajorTensorOpMultiplicandCrosswise packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm75.h:896

cutlass::layout::TensorOpMultiplicandCrosswise::TileShape
typename Base::TileShape TileShape
Definition: tensor_op_multiplicand_sm75.h:659

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::Index
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:843

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::TileShape
typename Base::TileShape TileShape
Definition: tensor_op_multiplicand_sm75.h:555

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::Index
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:742

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::inverse
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex offset) const
Inverse of layout function, mapping linear offset to logical coordinate.
Definition: tensor_op_multiplicand_sm75.h:603

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::PartitionShape
typename Base::PartitionShape PartitionShape
Definition: tensor_op_multiplicand_sm75.h:451

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm75.h:835

cutlass::layout::TensorOpMultiplicand::LongIndex
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:57

cutlass::layout::TensorOpMultiplicandCrosswise::packed
static CUTLASS_HOST_DEVICE TensorOpMultiplicandCrosswise packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm75.h:695

cutlass::layout::TensorOpMultiplicandCongruous
Definition: tensor_op_multiplicand_sm75.h:213

cutlass::layout::TensorOpMultiplicandCongruous::PartitionCount
typename Base::PartitionCount PartitionCount
Definition: tensor_op_multiplicand_sm75.h:249

cutlass::MatrixCoord::row
CUTLASS_HOST_DEVICE Index const & row() const
Returns the row of the coordinate.
Definition: matrix_coord.h:77

cutlass::layout::TensorOpMultiplicandCongruous::capacity
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Definition: tensor_op_multiplicand_sm75.h:303

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::RowMajorTensorOpMultiplicandCrosswise
CUTLASS_HOST_DEVICE RowMajorTensorOpMultiplicandCrosswise(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:892

cutlass::layout::TensorOpMultiplicand::kTileShapeContiguous
static int const kTileShapeContiguous
Definition: tensor_op_multiplicand_sm75.h:78

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::stride
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:820

cutlass::layout::TensorOpMultiplicandRowMajorInterleaved::packed
static CUTLASS_HOST_DEVICE TensorOpMultiplicandRowMajorInterleaved packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm75.h:1101

cutlass::layout::TensorOpMultiplicandColumnMajorInterleaved::packed
static CUTLASS_HOST_DEVICE TensorOpMultiplicandColumnMajorInterleaved packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm75.h:996

cutlass::layout::TensorOpMultiplicandCongruous::AccessCount
typename Base::AccessCount AccessCount
Definition: tensor_op_multiplicand_sm75.h:250

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:917

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::capacity
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Definition: tensor_op_multiplicand_sm75.h:825

cutlass::layout::TensorOpMultiplicandColumnMajorInterleaved::stride
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:1025

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::ColumnMajorTensorOpMultiplicandCrosswise
CUTLASS_HOST_DEVICE ColumnMajorTensorOpMultiplicandCrosswise(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:787

cutlass::layout::TensorOpMultiplicandColumnMajorInterleaved::LongIndex
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:947

cutlass::layout::TensorOpMultiplicandCongruous::LongIndex
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:224

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::PartitionCount
typename Base::PartitionCount PartitionCount
Definition: tensor_op_multiplicand_sm75.h:564

cutlass::layout::TensorOpMultiplicandCongruous< 32, Crosswise >::stride
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:407

cutlass::layout::TensorOpMultiplicandCongruous::operator()
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm75.h:281

cutlass::layout::PitchLinearShape
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43

cutlass::layout::TensorOpMultiplicandCrosswise::capacity
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Definition: tensor_op_multiplicand_sm75.h:724

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::TileShape
typename Base::TileShape TileShape
Definition: tensor_op_multiplicand_sm75.h:761

cutlass::layout::TensorOpMultiplicandColumnMajorInterleaved::TensorOpMultiplicandColumnMajorInterleaved
CUTLASS_HOST_DEVICE TensorOpMultiplicandColumnMajorInterleaved(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:988

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::TileShape
typename Base::TileShape TileShape
Definition: tensor_op_multiplicand_sm75.h:450

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:816

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::LongIndex
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:846

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::capacity
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size.
Definition: tensor_op_multiplicand_sm75.h:517

cutlass::layout::TensorOpMultiplicandCongruous::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:294

cutlass::layout::TensorOpMultiplicand::kElementsPerAccess
static int const kElementsPerAccess
Definition: tensor_op_multiplicand_sm75.h:73

cutlass::layout::PitchLinearShape::kStrided
static int const kStrided
Definition: pitch_linear.h:45

cutlass::layout::TensorOpMultiplicandRowMajorInterleaved::Index
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm75.h:1049

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::LongIndex
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:745

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::PartitionCount
typename Base::PartitionCount PartitionCount
Definition: tensor_op_multiplicand_sm75.h:871

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::RowMajorTensorOpMultiplicandCongruous
CUTLASS_HOST_DEVICE RowMajorTensorOpMultiplicandCongruous(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:586

cutlass::layout::TensorOpMultiplicand
Definition: tensor_op_multiplicand_sm75.h:46

cutlass::layout::PitchLinearShape::kContiguous
static int const kContiguous
Definition: pitch_linear.h:44

cutlass::layout::TensorOpMultiplicandColumnMajorInterleaved
Template based on element size (in bits) - defined in terms of pitch-linear memory.
Definition: tensor_op_multiplicand_sm75.h:935

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::operator()
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm75.h:597

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::AccessCount
typename Base::AccessCount AccessCount
Definition: tensor_op_multiplicand_sm75.h:771

cutlass::layout::TensorOpMultiplicand::TensorOpMultiplicand
CUTLASS_HOST_DEVICE TensorOpMultiplicand(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:126

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::ColumnMajorTensorOpMultiplicandCongruous
CUTLASS_HOST_DEVICE ColumnMajorTensorOpMultiplicandCongruous(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:477

cutlass::layout::TensorOpMultiplicandColumnMajorInterleaved::capacity
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size.
Definition: tensor_op_multiplicand_sm75.h:1031

cutlass::layout::TensorOpMultiplicandCongruous::TensorOpMultiplicandCongruous
CUTLASS_HOST_DEVICE TensorOpMultiplicandCongruous(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:266

cutlass::layout::TensorOpMultiplicandCongruous< 32, Crosswise >::operator()
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm75.h:388

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::operator()
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm75.h:904

cutlass::layout::TensorOpMultiplicandRowMajorInterleaved::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:1124

cutlass::layout::TensorOpMultiplicandCongruous::packed
static CUTLASS_HOST_DEVICE TensorOpMultiplicandCongruous packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm75.h:274

cutlass::layout::TensorOpMultiplicandCongruous< 32, Crosswise >::LongIndex
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:324

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::ColumnMajorTensorOpMultiplicandCongruous
CUTLASS_HOST_DEVICE ColumnMajorTensorOpMultiplicandCongruous(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:481

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:505

cutlass::layout::TensorOpMultiplicandCrosswise::AccessCount
typename Base::AccessCount AccessCount
Definition: tensor_op_multiplicand_sm75.h:671

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::capacity
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Definition: tensor_op_multiplicand_sm75.h:926

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::stride
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:921

cutlass::layout::PitchLinearCoord::contiguous
CUTLASS_HOST_DEVICE Index const & contiguous() const
Returns the contiguous dimension.
Definition: pitch_linear.h:89

cutlass::layout::TensorOpMultiplicandColumnMajorInterleaved::operator()
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm75.h:1003

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::RowMajorTensorOpMultiplicandCongruous
CUTLASS_HOST_DEVICE RowMajorTensorOpMultiplicandCongruous(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:582

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::stride
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:511

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:610

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::PartitionShape
typename Base::PartitionShape PartitionShape
Definition: tensor_op_multiplicand_sm75.h:556

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::packed
static CUTLASS_HOST_DEVICE ColumnMajorTensorOpMultiplicandCongruous packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm75.h:485

cutlass::layout::TensorOpMultiplicandCrosswise::LongIndex
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm75.h:643

cutlass::layout::TensorOpMultiplicandCrosswise::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:715

cutlass::layout::TensorOpMultiplicand::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:194

cutlass::layout::TensorOpMultiplicandCrosswise::TensorOpMultiplicandCrosswise
CUTLASS_HOST_DEVICE TensorOpMultiplicandCrosswise(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:687

cutlass::layout::TensorOpMultiplicandCrosswise::inverse
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex offset) const
Inverse of layout function, mapping linear offset to logical coordinate.
Definition: tensor_op_multiplicand_sm75.h:708

cutlass::layout::TensorOpMultiplicandCongruous::TensorOpMultiplicandCongruous
CUTLASS_HOST_DEVICE TensorOpMultiplicandCongruous(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:270

cutlass::layout::TensorOpMultiplicandRowMajorInterleaved
Template based on element size (in bits) - defined in terms of pitch-linear memory.
Definition: tensor_op_multiplicand_sm75.h:1040

cutlass::layout::TensorOpMultiplicandRowMajorInterleaved::TensorOpMultiplicandRowMajorInterleaved
CUTLASS_HOST_DEVICE TensorOpMultiplicandRowMajorInterleaved(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:1093

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::packed
static CUTLASS_HOST_DEVICE ColumnMajorTensorOpMultiplicandCrosswise packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm75.h:795

cutlass::Coord< kStrideRank, Index, LongIndex >

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::AccessCount
typename Base::AccessCount AccessCount
Definition: tensor_op_multiplicand_sm75.h:565

cutlass::layout::TensorOpMultiplicand::stride
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:198

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::TileShape
typename Base::TileShape TileShape
Definition: tensor_op_multiplicand_sm75.h:862

cutlass::layout::TensorOpMultiplicandCongruous< 32, Crosswise >::packed
static CUTLASS_HOST_DEVICE TensorOpMultiplicandCongruous packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm75.h:381

cutlass::layout::TensorOpMultiplicandCongruous::stride
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:298

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::stride
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:616

cutlass::layout::TensorOpMultiplicand::kCrosswise
static int const kCrosswise
Definition: tensor_op_multiplicand_sm75.h:74

cutlass::layout::TensorOpMultiplicand::kTileShapeStride
static int const kTileShapeStride
Definition: tensor_op_multiplicand_sm75.h:87

matrix_coord.h
Defines a canonical coordinate for rank=2 matrices offering named indices.

cutlass::layout::TensorOpMultiplicandColumnMajorInterleaved::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:1019

cutlass::layout::TensorOpMultiplicand::TensorOpMultiplicand
CUTLASS_HOST_DEVICE TensorOpMultiplicand(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:122

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::inverse
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex offset) const
Inverse of layout function, mapping linear offset to logical coordinate.
Definition: tensor_op_multiplicand_sm75.h:809

cutlass::layout::TensorOpMultiplicand::kElementSize
static int const kElementSize
Definition: tensor_op_multiplicand_sm75.h:72

cutlass::layout::TensorOpMultiplicand::kFactor
static int const kFactor
Number of kblocks to store PartitionShape::kContiguous elements.
Definition: tensor_op_multiplicand_sm75.h:81

cutlass::layout::TensorOpMultiplicand::capacity
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Definition: tensor_op_multiplicand_sm75.h:203

cutlass::layout::TensorOpMultiplicandCrosswise::PartitionShape
typename Base::PartitionShape PartitionShape
Definition: tensor_op_multiplicand_sm75.h:660

cutlass::layout::TensorOpMultiplicandCongruous::PartitionShape
typename Base::PartitionShape PartitionShape
Definition: tensor_op_multiplicand_sm75.h:241

pitch_linear.h
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.

cutlass::layout::TensorOpMultiplicandCongruous< 32, Crosswise >::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:403

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::inverse
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex offset) const
Inverse of layout function, mapping linear offset to logical coordinate.
Definition: tensor_op_multiplicand_sm75.h:498

cutlass::layout::RowMajorTensorOpMultiplicandCrosswise::AccessCount
typename Base::AccessCount AccessCount
Definition: tensor_op_multiplicand_sm75.h:872

cutlass::layout::TensorOpMultiplicandCrosswise
Definition: tensor_op_multiplicand_sm75.h:632

cutlass::layout::TensorOpMultiplicandRowMajorInterleaved::capacity
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size.
Definition: tensor_op_multiplicand_sm75.h:1136

cutlass::layout::TensorOpMultiplicandRowMajorInterleaved::TensorOpMultiplicandRowMajorInterleaved
CUTLASS_HOST_DEVICE TensorOpMultiplicandRowMajorInterleaved(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:1097

cutlass::layout::TensorOpMultiplicand::kStrideRank
static int const kStrideRank
Rank of stride vector.
Definition: tensor_op_multiplicand_sm75.h:51

cutlass::layout::TensorOpMultiplicandCrosswise::operator()
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm75.h:702

cutlass::layout::TensorOpMultiplicand::packed
static CUTLASS_HOST_DEVICE TensorOpMultiplicand packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm75.h:130

cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous::operator()
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm75.h:492

cutlass.h
Basic include for CUTLASS.

cutlass::MatrixCoord
Definition: matrix_coord.h:39

cutlass::layout::TensorOpMultiplicandCongruous< 32, Crosswise >::TensorOpMultiplicandCongruous
CUTLASS_HOST_DEVICE TensorOpMultiplicandCongruous(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:373

cutlass::layout::PitchLinearCoord::strided
CUTLASS_HOST_DEVICE Index const & strided() const
Returns the strided dimension.
Definition: pitch_linear.h:97

cutlass::layout::RowMajorTensorOpMultiplicandCongruous::capacity
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size.
Definition: tensor_op_multiplicand_sm75.h:622

cutlass::layout::TensorOpMultiplicandRowMajorInterleaved::operator()
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm75.h:1108

cutlass::layout::TensorOpMultiplicandRowMajorInterleaved::stride
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm75.h:1130

cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise::PartitionCount
typename Base::PartitionCount PartitionCount
Definition: tensor_op_multiplicand_sm75.h:770

cutlass::layout::TensorOpMultiplicand::kAccessSize
static int const kAccessSize
This layout is optimized for 128b accesses.
Definition: tensor_op_multiplicand_sm75.h:70

cutlass::layout::TensorOpMultiplicandCongruous< 32, Crosswise >::TensorOpMultiplicandCongruous
CUTLASS_HOST_DEVICE TensorOpMultiplicandCongruous(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm75.h:377

cutlass::layout::RowMajorTensorOpMultiplicandCongruous
Definition: tensor_op_multiplicand_sm75.h:527