CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
tensor_op_multiplicand_sm70.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/coord.h"
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/matrix_coord.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace layout {

// template <
//   int ElementSize,
//   gemm::Operand Operand
// >
// struct VoltaTensorOpMultiplicandCongruous;

// template <
//   int ElementSize,
//   gemm::Operand Operand
// >
// struct ColumnMajorVoltaTensorOpMultiplicandCongruous;

// template <
//   int ElementSize,
//   gemm::Operand Operand
// >
// struct RowMajorVoltaTensorOpMultiplicandCongruous;

/////////////////////////////////////////////////////////////////////////////////////////////////
/// Template based on element size (in bits) - defined in terms of pitch-linear memory.
template <int ElementSize>
struct VoltaTensorOpMultiplicandCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = PitchLinearCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = 128;

  /// Fundamental tile shape in units of vector accesses
  using TileShape = PitchLinearShape<8, 4>;

  /// Fundamental partition shape in units of vector accesses
  using PartitionShape = PitchLinearShape<4, 4>;

  //
  // Static constants
  //

  static int const kElementSize = ElementSize;
  static int const kElementsPerAccess = kAccessSize / kElementSize;

  using PartitionCount = PitchLinearShape<
    TileShape::kContiguous / PartitionShape::kContiguous,
    TileShape::kStrided / PartitionShape::kStrided
  >;

  using AccessCount = PitchLinearShape<
    PartitionShape::kContiguous,
    PartitionShape::kStrided
  >;

private:

  //
  // Data members
  //

  /// Stride data member
  Stride stride_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  VoltaTensorOpMultiplicandCongruous(Index ldm = 0): stride_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  VoltaTensorOpMultiplicandCongruous(Stride stride): stride_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static VoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
    return VoltaTensorOpMultiplicandCongruous(extent[0]);
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (contiguous, strided)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {

    // First, compute c and s of vector within source (in units of vector accesses)
    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
    int vec_strided_idx = coord.strided();

    // Compute the fundamental tile being accessed
    int tile_contiguous_idx = vec_contiguous_idx / TileShape::kContiguous;
    int tile_strided_idx = vec_strided_idx / TileShape::kStrided;

    int tile_contiguous_residual = vec_contiguous_idx % TileShape::kContiguous;
    int tile_strided_residual = vec_strided_idx % TileShape::kStrided;

    // Then swizzle within the tile.
    // Swizzle pattern is (tid[2:0] << 2)|(tid[4:3] ^ tid[2:1])
    int permuted_strided_within_tile = (tile_contiguous_residual >> 1);
    int permuted_contiguous_within_tile =
      (tile_strided_residual ^ permuted_strided_within_tile) |
      ((tile_contiguous_residual & 1) << 2);

    // Compute final element location
    int element_contiguous =
      (tile_contiguous_idx * TileShape::kContiguous + permuted_contiguous_within_tile) *
        kElementsPerAccess +
      (coord.contiguous() % kElementsPerAccess);

    int element_strided = tile_strided_idx * TileShape::kStrided + permuted_strided_within_tile;

    return element_contiguous + element_strided * stride_[0];
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return stride_;
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return stride_;
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return extent[1] * stride_[0];
  }
};
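
// Illustrative sketch (not part of the original header): tracing operator() above for 16-bit
// elements, where kElementsPerAccess = 128 / 16 = 8 and TileShape = <8, 4>. The numbers below
// follow directly from the arithmetic in operator().
//
//   using Layout = cutlass::layout::VoltaTensorOpMultiplicandCongruous<16>;
//
//   // Packed layout for a (contiguous, strided) extent of (64, 8): stride_[0] == 64
//   Layout layout = Layout::packed(cutlass::layout::PitchLinearCoord(64, 8));
//
//   // Coordinate (17, 3): vector index (17 / 8, 3) = (2, 3), inside tile (0, 0);
//   //   permuted_strided_within_tile    = 2 >> 1 = 1
//   //   permuted_contiguous_within_tile = (3 ^ 1) | ((2 & 1) << 2) = 2
//   //   element_contiguous = 2 * 8 + (17 % 8) = 17;  element_strided = 1
//   Layout::LongIndex offset = layout(cutlass::layout::PitchLinearCoord(17, 3));  // 17 + 1 * 64 = 81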

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous
template <int ElementSize>
struct ColumnMajorVoltaTensorOpMultiplicandCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// Underlying pitch-linear layout
  using Base = VoltaTensorOpMultiplicandCongruous<ElementSize>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

private:

  //
  // Data members
  //

  Base layout_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorVoltaTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorVoltaTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static ColumnMajorVoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
    return ColumnMajorVoltaTensorOpMultiplicandCongruous(extent.row());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.row(), coord.column()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.contiguous(), coord.strided());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return layout_.stride();
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return layout_.stride();
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandCongruous
template <int ElementSize>
struct RowMajorVoltaTensorOpMultiplicandCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// Underlying pitch-linear layout
  using Base = VoltaTensorOpMultiplicandCongruous<ElementSize>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

private:

  //
  // Data members
  //

  Base layout_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorVoltaTensorOpMultiplicandCongruous(Index ldm = 0): layout_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorVoltaTensorOpMultiplicandCongruous(Stride stride): layout_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static RowMajorVoltaTensorOpMultiplicandCongruous packed(TensorCoord const &extent) {
    return RowMajorVoltaTensorOpMultiplicandCongruous(extent.column());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.column(), coord.row()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.strided(), coord.contiguous());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return layout_.stride();
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return layout_.stride();
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
  }
};
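
// Illustrative sketch (not part of the original header): the matrix adapters above only reorder
// coordinates before delegating to the pitch-linear layout, so a column-major view treats rows
// as the contiguous dimension and a row-major view treats columns as the contiguous dimension.
//
//   using ColumnMajor = cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCongruous<16>;
//   using RowMajor    = cutlass::layout::RowMajorVoltaTensorOpMultiplicandCongruous<16>;
//
//   // 64 rows x 8 columns, column-major: packed() picks up the row extent as the stride.
//   ColumnMajor cm = ColumnMajor::packed(cutlass::MatrixCoord(64, 8));
//   // cm(coord) forwards to the pitch-linear layout as (contiguous = row, strided = column).
//
//   // 8 rows x 64 columns, row-major: packed() picks up the column extent as the stride.
//   RowMajor rm = RowMajor::packed(cutlass::MatrixCoord(8, 64));
//   // rm(coord) forwards to the pitch-linear layout as (contiguous = column, strided = row).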

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template based on element size (in bits) - defined in terms of pitch-linear memory.
// template <int ElementSize, Operand Operand>
template <int ElementSize>
struct VoltaTensorOpMultiplicandBCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = PitchLinearCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = 128;

  /// Fundamental tile shape in units of vector accesses
  using TileShape = PitchLinearShape<8, 4>;

  /// Fundamental partition shape in units of vector accesses
  using PartitionShape = PitchLinearShape<4, 4>;

  //
  // Static constants
  //

  static int const kElementSize = ElementSize;
  static int const kElementsPerAccess = kAccessSize / kElementSize;

  using PartitionCount = PitchLinearShape<
    TileShape::kContiguous / PartitionShape::kContiguous,
    TileShape::kStrided / PartitionShape::kStrided
  >;

  using AccessCount = PitchLinearShape<
    PartitionShape::kContiguous,
    PartitionShape::kStrided
  >;

private:

  //
  // Data members
  //

  /// Stride data member
  Stride stride_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  VoltaTensorOpMultiplicandBCongruous(Index ldm = 0): stride_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  VoltaTensorOpMultiplicandBCongruous(Stride stride): stride_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static VoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent) {
    return VoltaTensorOpMultiplicandBCongruous(extent[0]);
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (contiguous, strided)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {

    // First, compute c and s of vector within source (in units of vector accesses)
    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
    int vec_strided_idx = coord.strided();

    // Compute the fundamental tile being accessed
    int tile_contiguous_idx = vec_contiguous_idx / TileShape::kContiguous;
    int tile_strided_idx = vec_strided_idx / TileShape::kStrided;

    int tile_contiguous_residual = vec_contiguous_idx % TileShape::kContiguous;
    int tile_strided_residual = vec_strided_idx % TileShape::kStrided;

    // Then swizzle within the tile.
    // Swizzle pattern is (tid[1:0] << 3)|(tid & 0x4)|(tid[1:0])
    int permuted_strided_within_tile = (tile_contiguous_residual & 0x3);
    int permuted_contiguous_within_tile =
      (tile_strided_residual ^ permuted_strided_within_tile) |
      (tile_contiguous_residual & 0x4);

    // Compute final element location
    int element_contiguous =
      (tile_contiguous_idx * TileShape::kContiguous + permuted_contiguous_within_tile) *
        kElementsPerAccess +
      (coord.contiguous() % kElementsPerAccess);

    int element_strided = tile_strided_idx * TileShape::kStrided + permuted_strided_within_tile;

    return element_contiguous + element_strided * stride_[0];
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return stride_;
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return stride_;
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return extent[1] * stride_[0];
  }
};
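
// Illustrative note (not part of the original header): the A-operand and B-operand congruous
// layouts differ only in the intra-tile permutation. Written as pure functions of the tile
// residuals c in [0, 8) and s in [0, 4), in units of vector accesses:
//
//   VoltaTensorOpMultiplicandCongruous:   strided'    = c >> 1
//                                         contiguous' = (s ^ (c >> 1)) | ((c & 1) << 2)
//
//   VoltaTensorOpMultiplicandBCongruous:  strided'    = c & 0x3
//                                         contiguous' = (s ^ (c & 0x3)) | (c & 0x4)
//
// Both permutations are bijections on the 8 x 4 tile, so each swizzled tile stores exactly the
// same set of vectors, just in a different order.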

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandBCongruous
template <int ElementSize>
struct ColumnMajorVoltaTensorOpMultiplicandBCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// Underlying pitch-linear layout
  using Base = VoltaTensorOpMultiplicandBCongruous<ElementSize>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

private:

  //
  // Data members
  //

  Base layout_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorVoltaTensorOpMultiplicandBCongruous(Index ldm = 0): layout_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorVoltaTensorOpMultiplicandBCongruous(Stride stride): layout_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static ColumnMajorVoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent) {
    return ColumnMajorVoltaTensorOpMultiplicandBCongruous(extent.row());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.row(), coord.column()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.contiguous(), coord.strided());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return layout_.stride();
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return layout_.stride();
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandBCongruous
template <int ElementSize>
struct RowMajorVoltaTensorOpMultiplicandBCongruous {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// Underlying pitch-linear layout
  using Base = VoltaTensorOpMultiplicandBCongruous<ElementSize>;

  /// This layout is optimized for 128b accesses
  static int const kAccessSize = Base::kAccessSize;
  using TileShape = typename Base::TileShape;
  using PartitionShape = typename Base::PartitionShape;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;
  using PartitionCount = typename Base::PartitionCount;
  using AccessCount = typename Base::AccessCount;

private:

  //
  // Data members
  //

  Base layout_;

public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorVoltaTensorOpMultiplicandBCongruous(Index ldm = 0): layout_(ldm) { }

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorVoltaTensorOpMultiplicandBCongruous(Stride stride): layout_(stride) { }

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static RowMajorVoltaTensorOpMultiplicandBCongruous packed(TensorCoord const &extent) {
    return RowMajorVoltaTensorOpMultiplicandBCongruous(extent.column());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.column(), coord.row()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.strided(), coord.contiguous());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const {
    return layout_.stride();
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride & stride() {
    return layout_.stride();
  }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template based on element size (in bits) and KBlock size - defined in terms of pitch-linear memory.
template <int ElementSize, int KBlock>
struct VoltaTensorOpMultiplicandCrosswise {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = PitchLinearCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// This layout is optimized for 64b accesses
  static int const kAccessSize = 64;

  //
  // Static constants
  //

  static int const kElementSize = ElementSize;
  static int const kElementsPerAccess = kAccessSize / kElementSize;
  static int const kKBlock = KBlock;

 private:
  //
  // Data members
  //

  /// Stride data member
  Stride stride_;

 public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  VoltaTensorOpMultiplicandCrosswise(Index ldm = 0) : stride_(ldm) {}

  /// Ctor
  CUTLASS_HOST_DEVICE
  VoltaTensorOpMultiplicandCrosswise(Stride stride) : stride_(stride) {}

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static VoltaTensorOpMultiplicandCrosswise packed(TensorCoord const &extent) {
    return VoltaTensorOpMultiplicandCrosswise(extent[1]);
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (contiguous, strided)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {

    //
    // First, compute c and s of vector within source (in units of vector accesses)
    //

    int vec_contiguous_idx = coord.contiguous() / kElementsPerAccess;
    int vec_strided_idx = coord.strided();

    //
    // Then swizzle. The mapping is like this:
    // id[1:0]|(id[3]^id[4])|id[2]
    //

    int vec_strided_within_tile = vec_contiguous_idx & 0x7;

    int permuted_vec_contiguous =
      (vec_strided_idx & (~0xF)) + (vec_strided_idx & 0x3) * 4 +
      (((vec_strided_idx >> 2) ^ ((vec_strided_idx & 0x10) >> 3)) & 0x3);

    permuted_vec_contiguous ^= ((vec_strided_within_tile >> 1) & 0x3);

    int permuted_vec_strided = vec_contiguous_idx;

    //
    // Compute final element location
    //

    int element_contiguous = permuted_vec_contiguous * kElementsPerAccess +
                             (coord.contiguous() % kElementsPerAccess);

    return element_contiguous + permuted_vec_strided * (stride_[0] * kElementsPerAccess);
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const { return stride_; }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride &stride() { return stride_; }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return extent[0] * stride_[0];
  }
};
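
// Illustrative sketch (not part of the original header): unlike the congruous layouts, the
// crosswise layout advances by stride_[0] * kElementsPerAccess elements per vector step in the
// logical contiguous dimension (see the final line of operator() above), and packed() takes the
// strided extent. Values below assume 16-bit elements, so kElementsPerAccess = 64 / 16 = 4.
//
//   using Layout = cutlass::layout::VoltaTensorOpMultiplicandCrosswise<16, 32>;
//
//   // Packed layout for a (contiguous, strided) extent of (32, 32): stride_[0] == extent[1] == 32
//   Layout layout = Layout::packed(cutlass::layout::PitchLinearCoord(32, 32));
//
//   // capacity() is extent[0] * stride_[0], i.e. the contiguous extent times the stride.
//   Layout::LongIndex size = layout.capacity(cutlass::layout::PitchLinearCoord(32, 32));  // 1024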

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template mapping a column-major view of pitch-linear memory to VoltaTensorOpMultiplicandCrosswise
template <int ElementSize, int KBlock>
struct ColumnMajorVoltaTensorOpMultiplicandCrosswise {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// Underlying pitch-linear layout
  using Base = VoltaTensorOpMultiplicandCrosswise<ElementSize, KBlock>;

  /// This layout is optimized for 64b accesses
  static int const kAccessSize = Base::kAccessSize;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;

 private:
  //
  // Data members
  //

  Base layout_;

 public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorVoltaTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}

  /// Ctor
  CUTLASS_HOST_DEVICE
  ColumnMajorVoltaTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static ColumnMajorVoltaTensorOpMultiplicandCrosswise packed(
      TensorCoord const &extent) {
    return ColumnMajorVoltaTensorOpMultiplicandCrosswise(extent.column());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.row(), coord.column()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.contiguous(), coord.strided());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const { return layout_.stride(); }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride &stride() { return layout_.stride(); }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.row(), extent.column()));
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template mapping a row-major view of pitch-linear memory to VoltaTensorOpMultiplicandCrosswise
template <int ElementSize, int KBlock>
struct RowMajorVoltaTensorOpMultiplicandCrosswise {

  /// Logical rank of tensor
  static int const kRank = 2;

  /// Rank of stride vector
  static int const kStrideRank = 1;

  /// Index type used for coordinates
  using Index = int32_t;

  /// Long index type used for offsets
  using LongIndex = int64_t;

  /// Logical coordinate
  using TensorCoord = MatrixCoord;

  /// Stride vector
  using Stride = Coord<kStrideRank, Index, LongIndex>;

  //
  // Invariants
  //

  /// Underlying pitch-linear layout
  using Base = VoltaTensorOpMultiplicandCrosswise<ElementSize, KBlock>;

  /// This layout is optimized for 64b accesses
  static int const kAccessSize = Base::kAccessSize;

  //
  // Static constants
  //

  static int const kElementSize = Base::kElementSize;
  static int const kElementsPerAccess = Base::kElementsPerAccess;

 private:
  //
  // Data members
  //

  Base layout_;

 public:
  //
  // Methods
  //

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorVoltaTensorOpMultiplicandCrosswise(Index ldm = 0) : layout_(ldm) {}

  /// Ctor
  CUTLASS_HOST_DEVICE
  RowMajorVoltaTensorOpMultiplicandCrosswise(Stride stride) : layout_(stride) {}

  /// Helper returns a layout to a tightly packed tensor
  CUTLASS_HOST_DEVICE
  static RowMajorVoltaTensorOpMultiplicandCrosswise packed(
      TensorCoord const &extent) {
    return RowMajorVoltaTensorOpMultiplicandCrosswise(extent.row());
  }

  /// Returns the offset of a coordinate in linear memory.
  /// Assumes coordinate has convention (row, column)
  CUTLASS_HOST_DEVICE
  LongIndex operator()(TensorCoord const &coord) const {
    return layout_(PitchLinearCoord(coord.column(), coord.row()));
  }

  /// Inverse of layout function, mapping linear offset to logical coordinate
  CUTLASS_HOST_DEVICE
  TensorCoord inverse(LongIndex offset) const {
    PitchLinearCoord coord = layout_.inverse(offset);
    return MatrixCoord(coord.strided(), coord.contiguous());
  }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride stride() const { return layout_.stride(); }

  /// Returns the stride of the layout
  CUTLASS_HOST_DEVICE
  Stride &stride() { return layout_.stride(); }

  /// Compute the number of contiguous elements needed to store a tensor with the given size
  CUTLASS_HOST_DEVICE
  LongIndex capacity(TensorCoord const &extent) const {
    return layout_.capacity(PitchLinearCoord(extent.column(), extent.row()));
  }
};

} // namespace layout
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
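
// Minimal host-side sanity check (illustrative, not part of the original header): a swizzled
// layout must be a bijection over its packed extent, i.e. every logical coordinate maps to a
// distinct offset inside [0, capacity). Assumes this header and its dependencies are on the
// include path.
//
//   #include <cassert>
//   #include <set>
//   #include "cutlass/layout/tensor_op_multiplicand_sm70.h"
//
//   int main() {
//     using Layout = cutlass::layout::VoltaTensorOpMultiplicandCongruous<16>;
//     cutlass::layout::PitchLinearCoord extent(64, 8);
//     Layout layout = Layout::packed(extent);
//
//     std::set<Layout::LongIndex> offsets;
//     for (int s = 0; s < extent.strided(); ++s) {
//       for (int c = 0; c < extent.contiguous(); ++c) {
//         Layout::LongIndex offset = layout(cutlass::layout::PitchLinearCoord(c, s));
//         assert(offset >= 0 && offset < layout.capacity(extent));
//         assert(offsets.insert(offset).second);  // no two coordinates may collide
//       }
//     }
//     return 0;
//   }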