CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
pitch_linear_thread_map.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Templates implementing how threads are mapped to a given tile.
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/coord.h"
#include "cutlass/predicate_vector.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/tensor_view.h"
#include "cutlass/layout/pitch_linear.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace transform {

/////////////////////////////////////////////////////////////////////////////////////////////////
/// Strip-mines a pitch-linear tile among a given number of threads, first along the contiguous
/// dimension then along the strided dimension.
///
/// The tile must be divisible by the thread count such that all threads may execute the same
/// number of iterations with the same delta to exhaustively cover the tile.
///
/// This class satisfies the "RegularThreadMapping" concept.
template <
  typename Shape_,
  int Threads,
  int ElementsPerAccess = 1
>
struct PitchLinearStripminedThreadMap {

  /// Tensor coordinate
  using TensorCoord = layout::PitchLinearCoord;

  /// Tile shape
  using Shape = Shape_;

  /// Number of threads total
  static int const kThreads = Threads;

  /// Extract vector length from Layout
  static int const kElementsPerAccess = ElementsPerAccess;

  /// Shape of access by each thread
  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;

  /// Internal implementation details
  struct Detail {

    static_assert(!(Shape::kContiguous % kElementsPerAccess), "");

    static_assert(!((Shape::kContiguous * Shape::kStrided) % (kThreads * kElementsPerAccess)),
      "Shape must be divisible by thread count.");

    /// Shape of the tile in units of vectors
    using ShapeVec = layout::PitchLinearShape<
      Shape::kContiguous / kElementsPerAccess,
      Shape::kStrided
    >;

    static_assert(
      (Threads < ShapeVec::kContiguous && !(ShapeVec::kContiguous % kThreads)) ||
      (!(kThreads % ShapeVec::kContiguous) && !(ShapeVec::kStrided % (kThreads / ShapeVec::kContiguous))),
      "Shape must be divisible by number of iterations of each thread."
    );
  };

  /// Number of iterations by each thread
  using Iterations = typename platform::conditional<
    Threads >= Detail::ShapeVec::kContiguous,
    layout::PitchLinearShape<
      1,
      (Threads >= Detail::ShapeVec::kContiguous ?
        Detail::ShapeVec::kStrided / (kThreads / Detail::ShapeVec::kContiguous) : 0)
    >,
    layout::PitchLinearShape<
      Detail::ShapeVec::kContiguous / kThreads,
      Detail::ShapeVec::kStrided
    >
  >::type;

  /// Interval between accesses along each dimension of the tensor's logical coordinate space
  /// (in units of Elements)
  using Delta = typename platform::conditional<
    Threads >= Detail::ShapeVec::kContiguous,
    layout::PitchLinearShape<
      1,
      kThreads / Detail::ShapeVec::kContiguous
    >,
    layout::PitchLinearShape<
      kThreads * kElementsPerAccess,
      1
    >
  >::type;

  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
  /// (in units of Elements)
  CUTLASS_HOST_DEVICE
  static TensorCoord initial_offset(int thread_id) {

    return TensorCoord(
      (thread_id % Detail::ShapeVec::kContiguous) * kElementsPerAccess,
      thread_id / Detail::ShapeVec::kContiguous);
  }
};
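
/////////////////////////////////////////////////////////////////////////////////////////////////

// Illustrative sketch (not part of the original header): compile-time spot checks of
// PitchLinearStripminedThreadMap for an assumed 64 x 4 pitch-linear tile covered by 32 threads
// with one element per access. The names below are hypothetical; the expected values follow
// directly from the Iterations and Delta definitions above: 32 threads strip-mine the 64-element
// contiguous dimension, so each thread performs 2 x 4 iterations at a contiguous delta of 32.
namespace example_stripmined_sketch {

using ExampleMap = PitchLinearStripminedThreadMap<layout::PitchLinearShape<64, 4>, 32>;

static_assert(ExampleMap::Iterations::kContiguous == 2, "2 contiguous iterations expected");
static_assert(ExampleMap::Iterations::kStrided == 4, "4 strided iterations expected");
static_assert(ExampleMap::Delta::kContiguous == 32, "contiguous delta equals the thread count");
static_assert(ExampleMap::Delta::kStrided == 1, "strided delta of one row");

} // namespace example_stripmined_sketch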

/// Policy strip-mining a pitch-linear tile along the contiguous dimension: each thread owns a
/// contiguous run of accesses and iterates over the full strided extent.
template <
  typename Shape,
  int Threads,
  int ElementsPerAccess = 1
>
struct PitchLinearTilePolicyStripminedThreadContiguous
{
  static_assert((Shape::kContiguous % (Threads * ElementsPerAccess)) == 0,
    "Contiguous shape must divide number of threads");

  using TensorCoord = layout::PitchLinearCoord;

  static int const kThreads = Threads;
  static int const kElementsPerAccess = ElementsPerAccess;

  using Iterations = layout::PitchLinearShape<
    Shape::kContiguous / (kThreads * kElementsPerAccess),
    Shape::kStrided>;

  using Delta = layout::PitchLinearShape<1, 1>;

  CUTLASS_HOST_DEVICE
  static TensorCoord initial_offset(int thread_id)
  {
    return TensorCoord(thread_id * Iterations::kContiguous * kElementsPerAccess, 0);
  }
};

/// Policy strip-mining a pitch-linear tile along the strided dimension: each thread owns a band
/// of strided rows and iterates over the full contiguous extent.
template <
  typename Shape,
  int Threads,
  int ElementsPerAccess = 1
>
struct PitchLinearTilePolicyStripminedThreadStrided
{
  static_assert((Shape::kStrided % Threads == 0),
    "Strided shape must divide number of threads");

  using TensorCoord = layout::PitchLinearCoord;

  static int const kThreads = Threads;
  static int const kElementsPerAccess = ElementsPerAccess;

  using Iterations = layout::PitchLinearShape<
    Shape::kContiguous / kElementsPerAccess,
    Shape::kStrided / kThreads>;

  using Delta = layout::PitchLinearShape<1, 1>;

  using ShapeVec = Shape;

  CUTLASS_HOST_DEVICE
  static TensorCoord initial_offset(int thread_id)
  {
    return TensorCoord(0, thread_id * Iterations::kStrided);
  }
};
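
// Illustrative sketch (not part of the original header): the two tile policies above differ only
// in which dimension is strip-mined. The parameters below are hypothetical examples chosen to
// satisfy the divisibility static_asserts; the expected iteration shapes follow from the
// definitions above.
namespace example_tile_policy_sketch {

// 32 threads split a 128 x 4 tile along the contiguous dimension: 4 x 4 iterations per thread.
using ContiguousPolicy = PitchLinearTilePolicyStripminedThreadContiguous<
    layout::PitchLinearShape<128, 4>, 32>;
static_assert(ContiguousPolicy::Iterations::kContiguous == 4, "");
static_assert(ContiguousPolicy::Iterations::kStrided == 4, "");

// 32 threads split a 128 x 64 tile along the strided dimension: 128 x 2 iterations per thread.
using StridedPolicy = PitchLinearTilePolicyStripminedThreadStrided<
    layout::PitchLinearShape<128, 64>, 32>;
static_assert(StridedPolicy::Iterations::kContiguous == 128, "");
static_assert(StridedPolicy::Iterations::kStrided == 2, "");

} // namespace example_tile_policy_sketch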

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Policy defining a warp-raked arrangement in which a shape is partitioned into contiguous
/// vector accesses. Warps are assigned first along the strided dimension and then along the
/// contiguous dimension; each warp then rakes across its portion of the tile.
template <
  typename Shape_,
  int Threads,
  typename WarpThreadArrangement_,
  int ElementsPerAccess = 1
>
struct PitchLinearWarpRakedThreadMap {

  /// Tensor coordinate
  using TensorCoord = layout::PitchLinearCoord;

  /// Tile shape
  using Shape = Shape_;

  /// Number of threads total
  static int const kThreads = Threads;

  /// Extract vector length from Layout
  static int const kElementsPerAccess = ElementsPerAccess;

  /// Shape of access by each thread
  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;

  /// Internal details made public to facilitate introspection
  struct Detail {

    /// Fixed arrangement of threads within a warp (units of threads).
    using WarpThreadArrangement = WarpThreadArrangement_;

    /// Number of threads per warp
    static int const kWarpSize = WarpThreadArrangement::kCount;

    /// Number of participating warps
    static int const kWarpCount = kThreads / kWarpSize;

    static_assert(
      !(Shape::kContiguous % kElementsPerAccess),
      "Shape must be divisible by vector length.");

    /// Shape of the tile in units of vectors
    using ShapeInAccesses = layout::PitchLinearShape<
      Shape::kContiguous / kElementsPerAccess,
      Shape::kStrided
    >;

    // compute number of warp-level accesses total
    using WarpAccessIterations = layout::PitchLinearShape<
      ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,
      ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided
    >;

    // Divide it into the number of warps, first partitioning the strided dimension then the
    // contiguous.
    static int const kWarpsStrided =
      (WarpAccessIterations::kStrided >= kWarpCount
        ? kWarpCount
        : WarpAccessIterations::kStrided);

    static int const kWarpsContiguous =
      (kWarpCount > WarpAccessIterations::kStrided
        ? kWarpCount / kWarpsStrided
        : 1);

    /// Arrangement of warps within a threadblock-scoped tile
    using WarpArrangement = layout::PitchLinearShape<
      kWarpsContiguous, kWarpsStrided
    >;
  };

  ///< Iterations along each dimension (concept: PitchLinearShape)
  using Iterations = layout::PitchLinearShape<
    Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
    Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
  >;

  static_assert(Iterations::kCount,
    "Number of iterations must be non-zero");

  ///< Delta between accesses (units of elements, concept: PitchLinearShape)
  using Delta = layout::PitchLinearShape<
    Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
    Detail::WarpThreadArrangement::kStrided
  >;

  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
  CUTLASS_HOST_DEVICE
  static TensorCoord initial_offset(int thread_id) {

    int warp_id = (thread_id / Detail::kWarpSize);
    int lane_id = (thread_id % Detail::kWarpSize);

    //
    // compute warp-level offset
    //

    // This is the shape of the entire area covered by a warp's memory access (in units of vectors)
    layout::PitchLinearCoord warp_footprint{
      Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
      Detail::WarpThreadArrangement::kStrided * Iterations::kStrided
    };

    // This is the offset of a specific warp (in units of vectors)
    layout::PitchLinearCoord warp_offset{
      (warp_id % Detail::kWarpsContiguous),
      (warp_id / Detail::kWarpsContiguous)
    };

    // This is the offset of a specific thread within a warp (units of vectors)
    layout::PitchLinearCoord thread_offset_in_warp{
      lane_id % Detail::WarpThreadArrangement::kContiguous,
      lane_id / Detail::WarpThreadArrangement::kContiguous
    };

    // This is the offset of a thread within a threadblock tile (units of vectors)
    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
      warp_footprint * warp_offset + thread_offset_in_warp;

    // This is the offset of a thread within a threadblock tile (units of elements)
    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
      thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
      thread_offset_in_threadblock_tile_vec.strided()
    };

    return thread_offset_in_threadblock_tile_base;
  }
};
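
// Illustrative sketch (not part of the original header): a warp-raked map for an assumed
// 64 x 128 tile, 128 threads (4 warps), a 4 x 8 warp thread arrangement, and 8 elements per
// access. All four warps land in the strided dimension, so each thread performs 2 x 4
// iterations with a delta of 32 elements (contiguous) by 8 rows (strided). For thread 37
// (warp 1, lane 5), initial_offset() evaluates to (8, 33) in element coordinates.
namespace example_warp_raked_sketch {

using ExampleMap = PitchLinearWarpRakedThreadMap<
    layout::PitchLinearShape<64, 128>,       // tile shape in elements
    128,                                     // threads per threadblock
    layout::PitchLinearShape<4, 8>,          // threads of a warp: 4 contiguous x 8 strided
    8>;                                      // elements per access

static_assert(ExampleMap::Detail::kWarpsContiguous == 1, "");
static_assert(ExampleMap::Detail::kWarpsStrided == 4, "");
static_assert(ExampleMap::Iterations::kContiguous == 2, "");
static_assert(ExampleMap::Iterations::kStrided == 4, "");
static_assert(ExampleMap::Delta::kContiguous == 32, "");
static_assert(ExampleMap::Delta::kStrided == 8, "");

} // namespace example_warp_raked_sketch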

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Transposes an existing warp-raked ThreadMap by exchanging the contiguous and strided
/// dimensions. Used, for example, with interleaved layouts that are congruous in global memory
/// but crosswise in shared memory, where the coordinates of the two views must be transposed.
template <typename ThreadMap_, typename WarpThreadArrangement_>
struct TransposePitchLinearThreadMap {

  /// Underlying ThreadMap
  using ThreadMap = ThreadMap_;

  /// Tensor coordinate
  using TensorCoord = typename ThreadMap::TensorCoord;

  /// Tile shape
  using Shape = typename ThreadMap::Shape;

  /// Number of threads total
  static int const kThreads = ThreadMap::kThreads;

  /// Extract vector length from Layout
  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;

  /// Shape of access by each thread
  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;

  /// Internal details made public to facilitate introspection
  struct Detail {

    /// Fixed arrangement of threads within a warp (units of threads).
    using WarpThreadArrangement = WarpThreadArrangement_;

    /// Number of threads per warp
    static int const kWarpSize = WarpThreadArrangement::kCount;

    /// Number of participating warps
    static int const kWarpCount = kThreads / kWarpSize;

    static_assert(!(Shape::kContiguous % kElementsPerAccess),
                  "Shape must be divisible by vector length.");

    /// Arrangement of warps within a threadblock-scoped tile (transposed from the underlying map)
    using WarpArrangement =
        layout::PitchLinearShape<ThreadMap::Detail::kWarpsStrided,
                                 ThreadMap::Detail::kWarpsContiguous>;
  };

  ///< Iterations along each dimension (concept: PitchLinearShape)
  using Iterations =
      layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
                               ThreadMap::Iterations::kContiguous>;

  static_assert(Iterations::kCount, "Number of iterations must be non-zero");

  ///< Delta between accesses (units of elements, concept: PitchLinearShape)
  using Delta =
      layout::PitchLinearShape<Detail::WarpThreadArrangement::kContiguous *
                                   kElementsPerAccess,
                               Detail::WarpThreadArrangement::kStrided>;

  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space.
  /// Note this is slightly different from PitchLinearWarpRakedThreadMap::initial_offset.
  CUTLASS_HOST_DEVICE
  static TensorCoord initial_offset(int thread_id) {

    int warp_id = (thread_id / Detail::kWarpSize);
    int lane_id = (thread_id % Detail::kWarpSize);

    //
    // compute warp-level offset
    //

    // This is the shape of the entire area covered by a warp's memory access
    // (in units of vectors)
    layout::PitchLinearCoord warp_footprint{
        Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
        Detail::WarpThreadArrangement::kStrided * Iterations::kStrided};

    // This is the offset of a specific warp (in units of vectors)
    // Note the order of / and %. Also the 2nd operand is kStrided.
    layout::PitchLinearCoord warp_offset{
        (warp_id / Detail::WarpArrangement::kStrided),
        (warp_id % Detail::WarpArrangement::kStrided)};

    // This is the offset of a specific thread within a warp (units of vectors)
    layout::PitchLinearCoord thread_offset_in_warp{
        lane_id % Detail::WarpThreadArrangement::kContiguous,
        lane_id / Detail::WarpThreadArrangement::kContiguous};

    // This is the offset of a thread within a threadblock tile (units of vectors)
    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
        warp_footprint * warp_offset + thread_offset_in_warp;

    // This is the offset of a thread within a threadblock tile (units of elements)
    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
        thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
        thread_offset_in_threadblock_tile_vec.strided()};

    return thread_offset_in_threadblock_tile_base;
  }
};
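
// Illustrative sketch (not part of the original header): transposing the warp-raked map from the
// earlier sketch with a hypothetical 8 x 4 in-warp arrangement. Iterations are exchanged relative
// to the underlying map, while Delta is rebuilt from the new warp thread arrangement.
namespace example_transpose_warp_raked_sketch {

using BaseMap = PitchLinearWarpRakedThreadMap<
    layout::PitchLinearShape<64, 128>, 128, layout::PitchLinearShape<4, 8>, 8>;

using TransposedMap = TransposePitchLinearThreadMap<BaseMap, layout::PitchLinearShape<8, 4>>;

static_assert(TransposedMap::Iterations::kContiguous == BaseMap::Iterations::kStrided, "");
static_assert(TransposedMap::Iterations::kStrided == BaseMap::Iterations::kContiguous, "");
static_assert(TransposedMap::Delta::kContiguous == 64, "");  // 8 threads x 8 elements per access
static_assert(TransposedMap::Delta::kStrided == 4, "");

} // namespace example_transpose_warp_raked_sketch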

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Transposes a SIMT thread map: iterations, deltas, and the initial offset are exchanged
/// between the contiguous and strided dimensions.
template <typename ThreadMap_>
struct TransposePitchLinearThreadMapSimt {

  /// Underlying ThreadMap
  using ThreadMap = ThreadMap_;

  /// Tensor coordinate
  using TensorCoord = typename ThreadMap::TensorCoord;

  /// Tile shape
  using Shape = typename ThreadMap::Shape;

  /// Number of threads total
  static int const kThreads = ThreadMap::kThreads;

  /// Extract vector length from Layout
  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;

  static_assert(kElementsPerAccess == 1, "Simt transpose requires elements per access to be 1");

  ///< Iterations along each dimension (concept: PitchLinearShape)
  using Iterations =
      layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
                               ThreadMap::Iterations::kContiguous>;

  static_assert(Iterations::kCount, "Number of iterations must be non-zero");

  /// Shape of access by each thread
  using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;

  ///< Delta between accesses (units of elements, concept: PitchLinearShape)
  using Delta =
      layout::PitchLinearShape<ThreadMap::Delta::kStrided,
                               ThreadMap::Delta::kContiguous>;

  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
  /// (in units of Elements)
  CUTLASS_HOST_DEVICE
  static TensorCoord initial_offset(int thread_id) {

    TensorCoord coord = ThreadMap::initial_offset(thread_id);

    return TensorCoord(
      coord.strided(),
      coord.contiguous()
    );
  }
};
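
// Illustrative sketch (not part of the original header): transposing the strip-mined SIMT map
// from the first sketch simply exchanges the two dimensions of Iterations and Delta, and
// initial_offset() returns the swapped coordinate.
namespace example_transpose_simt_sketch {

using BaseMap = PitchLinearStripminedThreadMap<layout::PitchLinearShape<64, 4>, 32>;
using TransposedMap = TransposePitchLinearThreadMapSimt<BaseMap>;

static_assert(TransposedMap::Iterations::kContiguous == BaseMap::Iterations::kStrided, "");
static_assert(TransposedMap::Iterations::kStrided == BaseMap::Iterations::kContiguous, "");
static_assert(TransposedMap::Delta::kContiguous == BaseMap::Delta::kStrided, "");
static_assert(TransposedMap::Delta::kStrided == BaseMap::Delta::kContiguous, "");

} // namespace example_transpose_simt_sketch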

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Policy defining a warp-striped arrangement. Warps are interleaved (striped) along the strided
/// dimension rather than each raking a contiguous band of the tile.
template <
  typename Shape_,
  int Threads,
  typename WarpThreadArrangement_,
  int ElementsPerAccess = 1
>
struct PitchLinearWarpStripedThreadMap {

  /// Tensor coordinate
  using TensorCoord = layout::PitchLinearCoord;

  /// Tile shape
  using Shape = Shape_;

  /// Number of threads total
  static int const kThreads = Threads;

  /// Extract vector length from Layout
  static int const kElementsPerAccess = ElementsPerAccess;

  /// Shape of access by each thread
  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;

  /// Internal details made public to facilitate introspection
  struct Detail {

    /// Fixed arrangement of threads within a warp (units of threads).
    using WarpThreadArrangement = WarpThreadArrangement_;

    /// Number of threads per warp
    static int const kWarpSize = WarpThreadArrangement::kCount;

    /// Number of participating warps
    static int const kWarpCount = kThreads / kWarpSize;

    static_assert(
      !(Shape::kContiguous % kElementsPerAccess),
      "Shape must be divisible by vector length.");

    /// Shape of the tile in units of vectors
    using ShapeInAccesses = layout::PitchLinearShape<
      Shape::kContiguous / kElementsPerAccess,
      Shape::kStrided
    >;

    // compute number of warp-level accesses total
    using WarpAccessIterations = layout::PitchLinearShape<
      ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,
      ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided
    >;

    // Divide it into the number of warps, first partitioning the strided dimension then the
    // contiguous.
    static int const kWarpsStrided =
      (WarpAccessIterations::kStrided >= kWarpCount
        ? kWarpCount : (kWarpCount / WarpAccessIterations::kStrided));

    static int const kWarpsContiguous =
      (kWarpCount > WarpAccessIterations::kStrided ?
        WarpAccessIterations::kContiguous / kWarpsStrided : 1);

    /// Arrangement of warps within a threadblock-scoped tile
    using WarpArrangement = layout::PitchLinearShape<
      kWarpsContiguous, kWarpsStrided
    >;
  };

  ///< Iterations along each dimension (concept: PitchLinearShape)
  using Iterations = layout::PitchLinearShape<
    Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
    Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
  >;

  static_assert(Iterations::kCount,
    "Number of iterations must be non-zero");

  ///< Delta between accesses (units of elements, concept: PitchLinearShape)
  using Delta = layout::PitchLinearShape<
    Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
    Detail::WarpThreadArrangement::kStrided * Detail::WarpArrangement::kStrided
  >;

  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
  CUTLASS_HOST_DEVICE
  static TensorCoord initial_offset(int thread_id) {

    int warp_id = (thread_id / Detail::kWarpSize);
    int lane_id = (thread_id % Detail::kWarpSize);

    //
    // compute warp-level offset
    //

    // This is the shape of the entire area covered by a warp's memory access (in units of vectors)
    layout::PitchLinearCoord warp_footprint{
      Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
      Detail::WarpThreadArrangement::kStrided
    };

    // This is the offset of a specific warp (in units of vectors)
    layout::PitchLinearCoord warp_offset{
      (warp_id % Detail::kWarpsContiguous),
      (warp_id / Detail::kWarpsContiguous)
    };

    // This is the offset of a specific thread within a warp (units of vectors)
    layout::PitchLinearCoord thread_offset_in_warp{
      lane_id % Detail::WarpThreadArrangement::kContiguous,
      lane_id / Detail::WarpThreadArrangement::kContiguous
    };

    // This is the offset of a thread within a threadblock tile (units of vectors)
    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
      warp_footprint * warp_offset + thread_offset_in_warp;

    // This is the offset of a thread within a threadblock tile (units of elements)
    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
      thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
      thread_offset_in_threadblock_tile_vec.strided()
    };

    return thread_offset_in_threadblock_tile_base;
  }
};
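
// Illustrative sketch (not part of the original header): a warp-striped map for an assumed
// 64 x 64 tile, 128 threads (4 warps), an 8 x 4 warp thread arrangement, and 8 elements per
// access. The four warps are striped across the strided dimension, so consecutive strided
// accesses of a thread are separated by 16 rows rather than the warp's own 4-row footprint.
namespace example_warp_striped_sketch {

using ExampleMap = PitchLinearWarpStripedThreadMap<
    layout::PitchLinearShape<64, 64>,        // tile shape in elements
    128,                                     // threads per threadblock
    layout::PitchLinearShape<8, 4>,          // threads of a warp: 8 contiguous x 4 strided
    8>;                                      // elements per access

static_assert(ExampleMap::Detail::kWarpsStrided == 4, "");
static_assert(ExampleMap::Detail::kWarpsContiguous == 1, "");
static_assert(ExampleMap::Iterations::kContiguous == 1, "");
static_assert(ExampleMap::Iterations::kStrided == 4, "");
static_assert(ExampleMap::Delta::kContiguous == 64, "");
static_assert(ExampleMap::Delta::kStrided == 16, "");

} // namespace example_warp_striped_sketch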

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Strip-mines a pitch-linear tile among a given number of threads, first along the contiguous
/// dimension then along the strided dimension, while each thread accesses a 2D thread-tile.
///
/// The tile must be divisible by the thread count such that all threads may execute the same
/// number of iterations with the same delta to exhaustively cover the tile.
template <
  typename Shape_,
  int Threads,
  typename ThreadTileShape
>
struct PitchLinear2DThreadTileStripminedThreadMap;


template <
  typename Shape_,
  int Threads
>
struct PitchLinear2DThreadTileStripminedThreadMap <Shape_, Threads, cutlass::layout::PitchLinearShape<4, 4>> {

  /// Tensor coordinate
  using TensorCoord = layout::PitchLinearCoord;

  /// Tile shape
  using Shape = Shape_;

  /// Access Shape of each thread
  using ThreadAccessShape = cutlass::layout::PitchLinearShape<4, 4>;
  //using ThreadAccessShape = ThreadTileShape;

  /// Number of threads total
  static int const kThreads = Threads;

  /// Extract length of each access from Layout
  static int const kElementsPerAccess = ThreadAccessShape::kContiguous;

  static_assert(!(kElementsPerAccess % 4), "kElementsPerAccess, needs to be multiple of 4 (32bits)");

  /// Internal implementation details
  struct Detail {

    static_assert(!(ThreadAccessShape::kContiguous % 4), "ThreadAccessShape, needs to be multiple of 4");

    static_assert(!(Shape::kContiguous % ThreadAccessShape::kContiguous), "");

    static_assert(!((Shape::kContiguous * Shape::kStrided) % (kThreads * ThreadAccessShape::kCount)),
      "Shape must be divisible by thread count * accesses per thread.");

    /// Shape of the tile in units of thread-tiles
    using ShapeVec = layout::PitchLinearShape<
      Shape::kContiguous / ThreadAccessShape::kContiguous,
      Shape::kStrided / ThreadAccessShape::kStrided
    >;

    static_assert(
      (Threads < ShapeVec::kContiguous && !(ShapeVec::kContiguous % kThreads)) ||
      (!(kThreads % ShapeVec::kContiguous) && !(ShapeVec::kStrided % (kThreads / ShapeVec::kContiguous))),
      "Shape must be divisible by number of iterations of each thread."
    );
  };

  /// Number of iterations by each thread
  using Iterations = typename platform::conditional<
    Threads >= Detail::ShapeVec::kContiguous,
    layout::PitchLinearShape<
      1,
      (Threads >= Detail::ShapeVec::kContiguous ?
        Detail::ShapeVec::kStrided / (kThreads / Detail::ShapeVec::kContiguous) : 0)
    >,
    layout::PitchLinearShape<
      Detail::ShapeVec::kContiguous / kThreads,
      Detail::ShapeVec::kStrided
    >
  >::type;

  /// Interval between accesses along each dimension of the tensor's logical coordinate space
  /// (in units of Elements)
  using Delta = typename platform::conditional<
    Threads >= Detail::ShapeVec::kContiguous,
    layout::PitchLinearShape<
      Shape::kContiguous,
      kThreads * ThreadAccessShape::kStrided / Detail::ShapeVec::kContiguous
    >,
    layout::PitchLinearShape<
      kThreads * ThreadAccessShape::kContiguous,
      1
    >
  >::type;

  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
  /// (in units of Elements)
  CUTLASS_HOST_DEVICE
  static TensorCoord initial_offset(int thread_id) {

    return TensorCoord(
      (thread_id % Detail::ShapeVec::kContiguous) * ThreadAccessShape::kContiguous,
      (thread_id / Detail::ShapeVec::kContiguous) * ThreadAccessShape::kStrided);
  }
};
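
// Illustrative sketch (not part of the original header): the 4x4 thread-tile specialization for
// an assumed 64 x 32 tile and 32 threads. Each thread owns a 4 x 4 block of elements; the 32
// threads cover two rows of 16 thread-tiles per pass, so each thread performs 1 x 4 iterations
// with a delta of 64 elements (contiguous) by 8 rows (strided), and thread 17 starts at (4, 4).
namespace example_thread_tile_sketch {

using ExampleMap = PitchLinear2DThreadTileStripminedThreadMap<
    layout::PitchLinearShape<64, 32>,        // tile shape in elements
    32,                                      // threads
    layout::PitchLinearShape<4, 4>>;         // 4x4 thread tile (the only shape supported here)

static_assert(ExampleMap::Iterations::kContiguous == 1, "");
static_assert(ExampleMap::Iterations::kStrided == 4, "");
static_assert(ExampleMap::Delta::kContiguous == 64, "");
static_assert(ExampleMap::Delta::kStrided == 8, "");

} // namespace example_thread_tile_sketch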

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping
template <typename ThreadMap_>
struct TransposePitchLinearThreadMap2DThreadTile {

  /// Underlying ThreadMap
  using ThreadMap = ThreadMap_;

  /// Tensor coordinate
  using TensorCoord = typename ThreadMap::TensorCoord;

  /// Tile shape
  using Shape = typename ThreadMap::Shape;

  /// Number of threads total
  static int const kThreads = ThreadMap::kThreads;

  /// Extract vector length from Layout
  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;

  static_assert(kElementsPerAccess > 1,
    "2D thread-tile transpose requires elements per access to be greater than 1");

  ///< Iterations along each dimension (concept: PitchLinearShape)
  using Iterations =
      layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
                               ThreadMap::Iterations::kContiguous>;

  static_assert(Iterations::kCount, "Number of iterations must be non-zero");

  /// Shape of access by each thread
  using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;

  ///< Delta between accesses (units of elements, concept: PitchLinearShape)
  using Delta =
      layout::PitchLinearShape<ThreadMap::Delta::kStrided,
                               ThreadMap::Delta::kContiguous>;

  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
  /// (in units of Elements)
  CUTLASS_HOST_DEVICE
  static TensorCoord initial_offset(int thread_id) {

    TensorCoord coord = ThreadMap::initial_offset(thread_id);

    return TensorCoord(
      coord.strided(),
      coord.contiguous()
    );
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace transform
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
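
// Illustrative usage sketch (not part of the original header), assuming the CUTLASS include
// directory is on the include path: a small host program that prints the starting element
// coordinate assigned to each thread by the same warp-raked configuration used in the earlier
// sketch. initial_offset() is CUTLASS_HOST_DEVICE, so it can be evaluated on the host.

#include <cstdio>

#include "cutlass/layout/pitch_linear.h"
#include "cutlass/transform/pitch_linear_thread_map.h"

int main() {
  using Shape = cutlass::layout::PitchLinearShape<64, 128>;
  using WarpArrangement = cutlass::layout::PitchLinearShape<4, 8>;
  using ThreadMap =
      cutlass::transform::PitchLinearWarpRakedThreadMap<Shape, 128, WarpArrangement, 8>;

  // Print the initial (contiguous, strided) offset of each thread in the threadblock tile.
  for (int tid = 0; tid < ThreadMap::kThreads; ++tid) {
    cutlass::layout::PitchLinearCoord coord = ThreadMap::initial_offset(tid);
    std::printf("thread %3d -> (contiguous=%3d, strided=%3d)\n",
                tid, int(coord.contiguous()), int(coord.strided()));
  }
  return 0;
}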