CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
tensor_op_multiplicand_sm75.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/coord.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/layout/pitch_linear.h"

////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace layout {

////////////////////////////////////////////////////////////////////////////////

/// Template based on element size (in bits) and Crosswise size (in elements) -
/// defined in terms of pitch-linear memory.
template <int ElementSize, int Crosswise>
struct TensorOpMultiplicand {
  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = PitchLinearCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Static constants
  //

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = 128;

  static int const kElementSize = ElementSize;
  static int const kElementsPerAccess = kAccessSize / kElementSize;
  static int const kCrosswise = Crosswise;

  /// Contiguous dimension of the tile shape in units of vector accesses:
  /// one 128B shared memory cache line divided by the access size in bytes
  static int const kTileShapeContiguous = 128 / (kAccessSize / 8);

  /// Number of kblocks to store PartitionShape::kContiguous elements
  static int const kFactor =
      kTileShapeContiguous * kElementsPerAccess / kCrosswise;

  /// The strided dimension of the tile must be large enough for a full warp
  /// (32 / kTileShapeContiguous) and for conflict-free access
  /// (kTileShapeContiguous / kFactor), whichever is larger
  static int const kTileShapeStride =
      ((kTileShapeContiguous / kFactor) > (32 / kTileShapeContiguous))
          ? (kTileShapeContiguous / kFactor)
          : (32 / kTileShapeContiguous);

  /// Fundamental tile shape (in units of vector accesses) guaranteeing
  /// bank-conflict-free shared memory loads and stores
  using TileShape = PitchLinearShape<kTileShapeContiguous, kTileShapeStride>;

  /// Fundamental partition shape (in units of vector accesses)
  using PartitionShape = PitchLinearShape<4, 4>;

  using PartitionCount =
      PitchLinearShape<TileShape::kContiguous / PartitionShape::kContiguous,
                       TileShape::kStrided / PartitionShape::kStrided>;

  using AccessCount =
      PitchLinearShape<PartitionShape::kContiguous, PartitionShape::kStrided>;

 private:
  //
  // Data members
  //

  /// Stride data member
  Stride stride_;

 public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  TensorOpMultiplicand(Index ldm = 0) : stride_(ldm) {}

  /// Ctor
  CUTLASS_HOST_DEVICE
  TensorOpMultiplicand(Stride stride) : stride_(stride) {}

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static TensorOpMultiplicand packed(TensorCoord const &extent) {
    return TensorOpMultiplicand(extent[0]);
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (contiguous, strided)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    //
    // First, compute c and s of vector within source (in units of vector
    // accesses)
    //

    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
    int vec_strided_idx = coord.strided() / kFactor;

    // Compute the fundamental tile being accessed
    int tile_contiguous_idx =
        vec_contiguous_idx / (TileShape::kContiguous / kFactor);

    int tile_contiguous_residual =
        vec_contiguous_idx % (TileShape::kContiguous / kFactor) +
        ((coord.strided() % kFactor) * (TileShape::kContiguous / kFactor));
    int tile_strided_residual = vec_strided_idx % TileShape::kStrided;

    // Compute the 'partition' within the fundamental tile
    int partition_contiguous_idx =
        tile_contiguous_residual / PartitionShape::kContiguous;
    int partition_strided_idx =
        tile_strided_residual / PartitionShape::kStrided;

    int partition_contiguous_residual =
        tile_contiguous_residual % PartitionShape::kContiguous;
    int partition_strided_residual =
        tile_strided_residual % PartitionShape::kStrided;

    //
    // Then swizzle
    //

    int permuted_vec_contiguous_within_partition =
        partition_contiguous_residual ^ (partition_strided_residual % 4);

    int permuted_partition_contiguous_within_tile =
        partition_contiguous_idx ^ (partition_strided_idx % 2);

    //
    // Compute final element location
    //

    int element_contiguous = (tile_contiguous_idx * TileShape::kContiguous +
                              permuted_partition_contiguous_within_tile *
                                  PartitionShape::kContiguous +
                              permuted_vec_contiguous_within_partition) *
                                 kElementsPerAccess +
                             (coord.contiguous() % kElementsPerAccess);

    int element_strided = vec_strided_idx;

    return element_contiguous + element_strided * stride_[0] * kFactor;
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const { return stride_; }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride &stride() { return stride_; }

  /// Compute the number of contiguous elements needed to store a tensor with
  /// the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return extent[1] * stride_[0];
  }
};

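// Illustration: for ElementSize = 16 and Crosswise = 64 (one possible instantiation for a
// half-precision operand), kElementsPerAccess = 8, kTileShapeContiguous = 8, kFactor = 1 and
// kTileShapeStride = 8, so the fundamental tile is 8x8 vector accesses partitioned into 4x4
// blocks. Within a partition, a vector's contiguous index is XORed with its strided index
// (mod 4), and partition indices are XORed with the partition-strided index (mod 2), so
// accesses marching down the strided dimension land in different 128-bit slots of the
// 128-byte cache line rather than repeatedly hitting the same shared memory banks.
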
////////////////////////////////////////////////////////////////////////////////

/// Template mapping a pitch-linear view of congruous memory to the swizzled
/// TensorOpMultiplicand layout
template <int ElementSize, int Crosswise>
struct TensorOpMultiplicandCongruous {
  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = PitchLinearCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  using Base = TensorOpMultiplicand<ElementSize, Crosswise>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

 private:
  //
  // Data members
  //

  Base layout_;

 public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  TensorOpMultiplicandCongruous(Index ldm = 0) : layout_(ldm) {}

  /// Ctor
  CUTLASS_HOST_DEVICE
  TensorOpMultiplicandCongruous(Stride stride) : layout_(stride) {}

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static TensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
    return TensorOpMultiplicandCongruous(extent[0]);
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (contiguous, strided)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(coord);
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return coord;
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const { return layout_.stride(); }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride &stride() { return layout_.stride(); }

  /// Compute the number of contiguous elements needed to store a tensor with
  /// the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(extent);
  }
};

////////////////////////////////////////////////////////////////////////////////

/// Partial specialization for 32-bit elements
template <int Crosswise>
struct TensorOpMultiplicandCongruous<32, Crosswise> {
  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = PitchLinearCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = 128;

  /// Fundamental tile shape in units of vector accesses
  using TileShape = PitchLinearShape<8, 4>;

  /// Fundamental partition shape in units of vector accesses
  using PartitionShape = PitchLinearShape<4, 4>;

  using PartitionCount =
      PitchLinearShape<TileShape::kContiguous / PartitionShape::kContiguous,
                       TileShape::kStrided / PartitionShape::kStrided>;

  using AccessCount =
      PitchLinearShape<PartitionShape::kContiguous, PartitionShape::kStrided>;

  //
  // Static constants
  //
  static int const kElementSize = 32;
  static int const kElementsPerAccess = kAccessSize / kElementSize;

 private:
  //
  // Data members
  //

  /// Stride data member
  Stride stride_;

 public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  TensorOpMultiplicandCongruous(Index ldm = 0) : stride_(ldm) {}

  /// Ctor
  CUTLASS_HOST_DEVICE
  TensorOpMultiplicandCongruous(Stride stride) : stride_(stride) {}

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static TensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
    return TensorOpMultiplicandCongruous(extent[0]);
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (contiguous, strided)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    int tc = coord.contiguous() / 32;
    int ts = coord.strided() / 4;

    int c = (coord.contiguous() % 32) / kElementsPerAccess;
    int s = coord.strided() % 4;

    LongIndex offset = (c ^ (2 * s)) * kElementsPerAccess + s * stride_[0] +
                       tc * 32 + ts * stride_[0] * 4 + coord.contiguous() % 4;

    return offset;
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const { return stride_; }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride &stride() { return stride_; }

  /// Compute the number of contiguous elements needed to store a tensor with
  /// the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return extent[1] * stride_[0];
  }
};

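// Illustration: in the 32-bit specialization above, the access unit is a 4-element (128-bit)
// vector, so each 128-byte cache line holds eight vectors. Within a 32x4-element tile the
// vector index c (0..7) is permuted as c ^ (2 * s), where s is the row within the tile; rows
// 0..3 therefore apply XOR masks 0, 2, 4 and 6, rotating which 128-bit slot of the cache line
// each row's vectors occupy and keeping strided accesses off the same shared memory banks.
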
////////////////////////////////////////////////////////////////////////////////

/// Template mapping a column-major view of pitch-linear memory to
/// TensorOpMultiplicandCongruous
template <int ElementSize, int Crosswise>
struct ColumnMajorTensorOpMultiplicandCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  using Base = TensorOpMultiplicandCongruous<ElementSize, Crosswise>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

private:

  //
  // Data members
  //

  Base layout_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static ColumnMajorTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
    return ColumnMajorTensorOpMultiplicandCongruous(extent.row());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.row(), coord.column()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.contiguous(), coord.strided());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return layout_.stride();
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return layout_.stride();
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
  }
};

////////////////////////////////////////////////////////////////////////////////

/// Template mapping a row-major view of pitch-linear memory to
/// TensorOpMultiplicandCongruous
template <int ElementSize, int Crosswise>
struct RowMajorTensorOpMultiplicandCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  using Base = TensorOpMultiplicandCongruous<ElementSize, Crosswise>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

private:

  //
  // Data members
  //

  Base layout_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static RowMajorTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
    return RowMajorTensorOpMultiplicandCongruous(extent.column());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.column(), coord.row()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.strided(), coord.contiguous());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return layout_.stride();
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return layout_.stride();
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
  }
};

////////////////////////////////////////////////////////////////////////////////

/// Template based on element size (in bits) and Crosswise size (in elements) -
/// defined in terms of pitch-linear memory.
template <int ElementSize, int Crosswise>
struct TensorOpMultiplicandCrosswise {
  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = PitchLinearCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  using Base = TensorOpMultiplicand<ElementSize, Crosswise>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  static int const kCrosswise = Base::kCrosswise;
  static int const kFactor = Base::kFactor;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

 private:
  //
  // Data members
  //

  Base layout_;

 public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  TensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}

  /// Ctor
  CUTLASS_HOST_DEVICE
  TensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static TensorOpMultiplicandCrosswise packed(TensorCoord const &extent) {
    return TensorOpMultiplicandCrosswise(extent[0]);
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (contiguous, strided)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(coord);
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return coord;
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const { return layout_.stride(); }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride &stride() { return layout_.stride(); }

  /// Compute the number of contiguous elements needed to store a tensor with
  /// the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(extent);
  }
};

////////////////////////////////////////////////////////////////////////////////

/// Template mapping a column-major view of pitch-linear memory to
/// TensorOpMultiplicandCrosswise
template <int ElementSize, int Crosswise>
struct ColumnMajorTensorOpMultiplicandCrosswise {
  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  using Base = TensorOpMultiplicandCrosswise<ElementSize, Crosswise>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

 private:
  //
  // Data members
  //

  Base layout_;

 public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static ColumnMajorTensorOpMultiplicandCrosswise packed(
      TensorCoord const &extent) {
    return ColumnMajorTensorOpMultiplicandCrosswise(extent.row());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.row(), coord.column()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.contiguous(), coord.strided());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const { return layout_.stride(); }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride &stride() { return layout_.stride(); }

  /// Compute the number of contiguous elements needed to store a tensor with
  /// the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
  }
};

////////////////////////////////////////////////////////////////////////////////

/// Template mapping a row-major view of pitch-linear memory to
/// TensorOpMultiplicandCrosswise
template <int ElementSize, int Crosswise>
struct RowMajorTensorOpMultiplicandCrosswise {
  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  using Base = TensorOpMultiplicandCrosswise<ElementSize, Crosswise>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

 private:
  //
  // Data members
  //

  Base layout_;

 public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static RowMajorTensorOpMultiplicandCrosswise packed(
      TensorCoord const &extent) {
    return RowMajorTensorOpMultiplicandCrosswise(extent.column());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.column(), coord.row()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.strided(), coord.contiguous());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const { return layout_.stride(); }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride &stride() { return layout_.stride(); }

  /// Compute the number of contiguous elements needed to store a tensor with
  /// the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
  }
};

////////////////////////////////////////////////////////////////////////////////

/// Template based on element size (in bits) and interleaving quantity
/// InterleavedK (in elements) - defined in terms of pitch-linear memory.
template <int ElementSize, int InterleavedK>
struct TensorOpMultiplicandColumnMajorInterleaved {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = PitchLinearCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = 128;

  //
  // Static constants
  //

  static int const kElementSize = ElementSize;
  static int const kElementsPerAccess = kAccessSize / kElementSize;

  //static int const kThreadBlockStrided = ThreadBlockStrided;
  static int const kInterleavedK = InterleavedK;

private:

  //
  // Data members
  //

  /// Stride data member
  Stride stride_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  TensorOpMultiplicandColumnMajorInterleaved(Index ldm = 0): stride_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  TensorOpMultiplicandColumnMajorInterleaved(Stride stride): stride_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static TensorOpMultiplicandColumnMajorInterleaved packed(TensorCoord const &extent) {
    return TensorOpMultiplicandColumnMajorInterleaved(extent[0] * kInterleavedK);
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (contiguous, strided)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    int const rows_per_smem_cache_line = 128 / kInterleavedK;

    int row_id = coord.strided() / rows_per_smem_cache_line;
    int col_id = (coord.strided() % rows_per_smem_cache_line) * kInterleavedK + coord.contiguous();

    int access_block_id = col_id >> 4;
    int swizzle_access_block_id = access_block_id ^ (row_id & 1);

    int swizzle_col_id = swizzle_access_block_id << 4;

    return row_id * 128 + swizzle_col_id;
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return stride_;
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return stride_;
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return (extent[1] / kInterleavedK) * stride_[0];
  }
};

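// Illustration: the interleaved layout above packs 128 elements per row of shared memory
// storage, i.e. (128 / kInterleavedK) logical strided rows per storage row. Columns are
// grouped into 16-element access blocks (col_id >> 4), and each block index is XORed with
// the low bit of the row index, so consecutive rows swap neighboring blocks and column-wise
// traffic is spread across shared memory banks instead of landing in the same block every row.
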
////////////////////////////////////////////////////////////////////////////////

/// Template based on element size (in bits) and interleaving quantity
/// InterleavedK (in elements) - defined in terms of pitch-linear memory.
template <int ElementSize, int InterleavedK>
struct TensorOpMultiplicandRowMajorInterleaved {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = PitchLinearCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = 128;

  //
  // Static constants
  //

  static int const kElementSize = ElementSize;
  static int const kElementsPerAccess = kAccessSize / kElementSize;

  //static int const kThreadBlockStrided = ThreadBlockStrided;
  static int const kInterleavedK = InterleavedK;

private:

  //
  // Data members
  //

  /// Stride data member
  Stride stride_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  TensorOpMultiplicandRowMajorInterleaved(Index ldm = 0): stride_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  TensorOpMultiplicandRowMajorInterleaved(Stride stride): stride_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static TensorOpMultiplicandRowMajorInterleaved packed(TensorCoord const &extent) {
    return TensorOpMultiplicandRowMajorInterleaved(extent[1] * kInterleavedK);
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (contiguous, strided)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    int const rows_per_smem_cache_line = 128 / kInterleavedK;

    int row_id = coord.strided() / rows_per_smem_cache_line;
    int col_id = (coord.strided() % rows_per_smem_cache_line) * kInterleavedK + coord.contiguous();

    int access_block_id = col_id >> 4;
    int swizzle_access_block_id = access_block_id ^ (row_id & 1);

    int swizzle_col_id = swizzle_access_block_id << 4;

    return row_id * 128 + swizzle_col_id;
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return stride_;
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return stride_;
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return (extent[0] / kInterleavedK) * stride_[0];
  }
};

////////////////////////////////////////////////////////////////////////////////

} // namespace layout
} // namespace cutlass
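
The layouts above are ordinarily instantiated inside CUTLASS tile iterators, but they are plain functors and can be exercised directly. The following is a minimal host-side sketch, assuming the header is available as cutlass/layout/tensor_op_multiplicand_sm75.h and using an arbitrary example instantiation (16-bit elements, Crosswise = 64, a 64x8 pitch-linear extent) chosen purely for illustration.

#include <cstdio>

#include "cutlass/layout/tensor_op_multiplicand_sm75.h"

int main() {
  // Example instantiation: half-precision operand, Crosswise = 64.
  using Layout = cutlass::layout::TensorOpMultiplicandCongruous<16, 64>;

  // Extent in (contiguous, strided) order; packed() builds a layout whose
  // stride equals the contiguous extent.
  cutlass::layout::PitchLinearCoord extent(64, 8);
  Layout layout = Layout::packed(extent);

  // Print the swizzled linear offset of a few logical coordinates.
  for (int s = 0; s < 4; ++s) {
    cutlass::layout::PitchLinearCoord coord(8, s);
    std::printf("(%d, %d) -> %lld\n", coord.contiguous(), coord.strided(),
                static_cast<long long>(layout(coord)));
  }

  // capacity() reports the number of elements needed to store the extent.
  std::printf("capacity = %lld\n",
              static_cast<long long>(layout.capacity(extent)));

  return 0;
}

Exercising TensorOpMultiplicandCrosswise follows the same pattern; the row-major and column-major adaptors take MatrixCoord (row, column) coordinates instead of PitchLinearCoord.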