59 template <
int ElementSize>
145 int vec_strided_idx = coord.
strided();
156 int permuted_strided_within_tile = (tile_contiguous_residual >> 1);
157 int permuted_contiguous_within_tile = (tile_strided_residual ^ permuted_strided_within_tile) |
158 ((tile_contiguous_residual & 1) << 2);
163 int element_strided = tile_strided_idx *
TileShape::kStrided + permuted_strided_within_tile;
165 return element_contiguous + element_strided * stride_[0];
183 return extent[1] * stride_[0];
190 template <
int ElementSize>
292 template <
int ElementSize>
396 template <
int ElementSize>
481 int vec_strided_idx = coord.
strided();
492 int permuted_strided_within_tile = (tile_contiguous_residual & 0x3);
493 int permuted_contiguous_within_tile = (tile_strided_residual ^ permuted_strided_within_tile) |
494 (tile_contiguous_residual & 0x4);
500 int element_strided = tile_strided_idx *
TileShape::kStrided + permuted_strided_within_tile;
502 return element_contiguous + element_strided * stride_[0];
520 return extent[1] * stride_[0];
527 template <
int ElementSize>
629 template <
int ElementSize>
732 template <
int ElementSize,
int KBlock>
765 static int const kKBlock = KBlock;
803 int vec_strided_idx = coord.
strided();
810 int vec_strided_within_tile = vec_contiguous_idx & 0x7;
811 int permuted_vec_contiguous =
812 (vec_strided_idx & (~0xF)) + (vec_strided_idx & 0x3) * 4 +
813 (((vec_strided_idx >> 2) ^ ((vec_strided_idx & 0x10) >> 3)) & 0x3);
815 permuted_vec_contiguous ^= ((vec_strided_within_tile >> 1) & 0x3);
817 int permuted_vec_strided = vec_contiguous_idx;
823 int element_contiguous = permuted_vec_contiguous * kElementsPerAccess +
841 return extent[0] * stride_[0];
847 template <
int ElementSize,
int KBlock>
942 template <
int ElementSize,
int KBlock>
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size...
Definition: tensor_op_multiplicand_sm70.h:388
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Definition: tensor_op_multiplicand_sm70.h:935
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:1021
Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandBCongruous.
Definition: tensor_op_multiplicand_sm70.h:630
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm70.h:537
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size...
Definition: tensor_op_multiplicand_sm70.h:519
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm70.h:261
CUTLASS_HOST_DEVICE Index const & column() const
Returns the column of the coordinate.
Definition: matrix_coord.h:85
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex offset) const
Inverse of layout function, mapping linear offset to logical coordinate.
Definition: tensor_op_multiplicand_sm70.h:604
CUTLASS_HOST_DEVICE RowMajorVoltaTensorOpMultiplicandCrosswise(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:992
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm70.h:69
Definition: aligned_buffer.h:35
CUTLASS_HOST_DEVICE VoltaTensorOpMultiplicandCongruous(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:130
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:617
typename Base::PartitionCount PartitionCount
Definition: tensor_op_multiplicand_sm70.h:228
typename Base::PartitionShape PartitionShape
Definition: tensor_op_multiplicand_sm70.h:220
static CUTLASS_HOST_DEVICE RowMajorVoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm70.h:356
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:930
A Coord is a coordinate of arbitrary rank into a tensor or matrix.
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm70.h:859
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex offset) const
Inverse of layout function, mapping linear offset to logical coordinate.
Definition: tensor_op_multiplicand_sm70.h:706
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Definition: tensor_op_multiplicand_sm70.h:1030
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:170
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:382
CUTLASS_HOST_DEVICE ColumnMajorVoltaTensorOpMultiplicandBCongruous(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:583
typename Base::AccessCount AccessCount
Definition: tensor_op_multiplicand_sm70.h:331
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:926
CUTLASS_HOST_DEVICE Index const & row() const
Returns the row of the coordinate.
Definition: matrix_coord.h:77
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm70.h:540
Definition: tensor_op_multiplicand_sm70.h:848
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm70.h:477
CUTLASS_HOST_DEVICE RowMajorVoltaTensorOpMultiplicandCrosswise(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:996
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:507
static int const kElementSize
Definition: tensor_op_multiplicand_sm70.h:97
typename Base::TileShape TileShape
Definition: tensor_op_multiplicand_sm70.h:658
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Definition: tensor_op_multiplicand_sm70.h:840
typename Base::TileShape TileShape
Definition: tensor_op_multiplicand_sm70.h:321
static CUTLASS_HOST_DEVICE RowMajorVoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm70.h:693
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex offset) const
Inverse of layout function, mapping linear offset to logical coordinate.
Definition: tensor_op_multiplicand_sm70.h:267
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm70.h:642
static CUTLASS_HOST_DEVICE VoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm70.h:134
typename Base::PartitionShape PartitionShape
Definition: tensor_op_multiplicand_sm70.h:557
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:1025
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex offset) const
Inverse of layout function, mapping linear offset to logical coordinate.
Definition: tensor_op_multiplicand_sm70.h:369
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm70.h:200
CUTLASS_HOST_DEVICE RowMajorVoltaTensorOpMultiplicandCongruous(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:352
typename Base::TileShape TileShape
Definition: tensor_op_multiplicand_sm70.h:219
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:513
Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandBCongruous.
Definition: tensor_op_multiplicand_sm70.h:528
static int const kStrided
Definition: pitch_linear.h:45
CUTLASS_HOST_DEVICE VoltaTensorOpMultiplicandCrosswise(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:785
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size...
Definition: tensor_op_multiplicand_sm70.h:623
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm70.h:1008
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm70.h:405
static int const kContiguous
Definition: pitch_linear.h:44
typename Base::PartitionCount PartitionCount
Definition: tensor_op_multiplicand_sm70.h:667
static int const kElementsPerAccess
Definition: tensor_op_multiplicand_sm70.h:98
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex offset) const
Inverse of layout function, mapping linear offset to logical coordinate.
Definition: tensor_op_multiplicand_sm70.h:919
static CUTLASS_HOST_DEVICE ColumnMajorVoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm70.h:591
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm70.h:203
typename Base::PartitionCount PartitionCount
Definition: tensor_op_multiplicand_sm70.h:330
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size...
Definition: tensor_op_multiplicand_sm70.h:725
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm70.h:598
typename Base::PartitionCount PartitionCount
Definition: tensor_op_multiplicand_sm70.h:565
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm70.h:951
Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous.
Definition: tensor_op_multiplicand_sm70.h:293
static int const kRank
Logical rank of tensor.
Definition: tensor_op_multiplicand_sm70.h:63
static CUTLASS_HOST_DEVICE RowMajorVoltaTensorOpMultiplicandCrosswise packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm70.h:1000
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm70.h:305
typename Base::PartitionShape PartitionShape
Definition: tensor_op_multiplicand_sm70.h:659
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex offset) const
Inverse of layout function, mapping linear offset to logical coordinate.
Definition: tensor_op_multiplicand_sm70.h:1014
CUTLASS_HOST_DEVICE ColumnMajorVoltaTensorOpMultiplicandCrosswise(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:897
Template based on element size (in bits) - defined in terms of pitch-linear memory.
Definition: tensor_op_multiplicand_sm70.h:397
typename Base::AccessCount AccessCount
Definition: tensor_op_multiplicand_sm70.h:668
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm70.h:363
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm70.h:913
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm70.h:302
CUTLASS_HOST_DEVICE VoltaTensorOpMultiplicandBCongruous(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:466
static CUTLASS_HOST_DEVICE VoltaTensorOpMultiplicandCrosswise packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm70.h:789
CUTLASS_HOST_DEVICE Index const & contiguous() const
Returns the contiguous dimension.
Definition: pitch_linear.h:89
CUTLASS_HOST_DEVICE RowMajorVoltaTensorOpMultiplicandBCongruous(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:689
static CUTLASS_HOST_DEVICE VoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm70.h:470
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm70.h:72
static CUTLASS_HOST_DEVICE ColumnMajorVoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm70.h:254
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm70.h:796
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm70.h:741
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:719
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:611
static CUTLASS_HOST_DEVICE ColumnMajorVoltaTensorOpMultiplicandCrosswise packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor_op_multiplicand_sm70.h:905
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:835
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:274
typename Base::AccessCount AccessCount
Definition: tensor_op_multiplicand_sm70.h:566
Definition: tensor_op_multiplicand_sm70.h:733
CUTLASS_HOST_DEVICE RowMajorVoltaTensorOpMultiplicandCongruous(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:348
Definition: tensor_op_multiplicand_sm70.h:943
typename Base::TileShape TileShape
Definition: tensor_op_multiplicand_sm70.h:556
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm70.h:639
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:376
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm70.h:744
static int const kStrideRank
Rank of stride vector.
Definition: tensor_op_multiplicand_sm70.h:66
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm70.h:954
CUTLASS_HOST_DEVICE ColumnMajorVoltaTensorOpMultiplicandBCongruous(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:587
CUTLASS_HOST_DEVICE ColumnMajorVoltaTensorOpMultiplicandCongruous(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:246
CUTLASS_HOST_DEVICE VoltaTensorOpMultiplicandCongruous(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:126
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size...
Definition: tensor_op_multiplicand_sm70.h:182
typename Base::AccessCount AccessCount
Definition: tensor_op_multiplicand_sm70.h:229
CUTLASS_HOST_DEVICE VoltaTensorOpMultiplicandCrosswise(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:781
static int const kAccessSize
This layout is optimized for 128b accesses.
Definition: tensor_op_multiplicand_sm70.h:85
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:176
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:713
CUTLASS_HOST_DEVICE VoltaTensorOpMultiplicandBCongruous(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:462
int32_t Index
Index type used for coordinates.
Definition: tensor_op_multiplicand_sm70.h:856
Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous.
Definition: tensor_op_multiplicand_sm70.h:191
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size...
Definition: tensor_op_multiplicand_sm70.h:286
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm70.h:700
typename Base::PartitionShape PartitionShape
Definition: tensor_op_multiplicand_sm70.h:322
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:280
CUTLASS_HOST_DEVICE ColumnMajorVoltaTensorOpMultiplicandCrosswise(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:901
int64_t LongIndex
Long index type used for offsets.
Definition: tensor_op_multiplicand_sm70.h:408
Template based on element size (in bits) - defined in terms of pitch-linear memory.
Definition: tensor_op_multiplicand_sm70.h:60
Basic include for CUTLASS.
Definition: matrix_coord.h:39
CUTLASS_HOST_DEVICE Index const & strided() const
Returns the strided dimension.
Definition: pitch_linear.h:97
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor_op_multiplicand_sm70.h:831
CUTLASS_HOST_DEVICE ColumnMajorVoltaTensorOpMultiplicandCongruous(Stride stride)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:250
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Definition: tensor_op_multiplicand_sm70.h:141
CUTLASS_HOST_DEVICE RowMajorVoltaTensorOpMultiplicandBCongruous(Index ldm=0)
Ctor.
Definition: tensor_op_multiplicand_sm70.h:685