cutlass/pitch__linear__thread__map_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/array.h"
 #include "cutlass/coord.h"
 #include "cutlass/predicate_vector.h"
 #include "cutlass/tensor_ref.h"
 #include "cutlass/tensor_view.h"
 #include "cutlass/layout/pitch_linear.h"


 namespace cutlass {
 namespace transform {


 /// Strip-mined thread map over a pitch-linear tile.
 ///
 /// The tile is viewed in units of vectors (`ElementsPerAccess` elements along the contiguous
 /// dimension). Thread IDs advance along the contiguous dimension first and then along the
 /// strided dimension, as computed by initial_offset().
 ///
 /// \tparam Shape_             Tile shape (exposes kContiguous and kStrided)
 /// \tparam Threads            Number of participating threads
 /// \tparam ElementsPerAccess  Number of elements in each access
 template <
   typename Shape_,
   int Threads,
   int ElementsPerAccess = 1
 >
 struct PitchLinearStripminedThreadMap {

   /// Tensor coordinate
   using TensorCoord = layout::PitchLinearCoord;

   /// Tile shape
   using Shape = Shape_;

   /// Number of threads total
   static int const kThreads = Threads;

   /// Number of elements per access
   static int const kElementsPerAccess = ElementsPerAccess;

   /// Shape of a single access made by one thread
   using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;

   /// Internal implementation details
   struct Detail {

     // The contiguous extent must hold a whole number of vectors.
     static_assert((Shape::kContiguous % kElementsPerAccess) == 0, "");

     // The tile must split evenly into one vector per thread per iteration.
     static_assert(
       ((Shape::kContiguous * Shape::kStrided) % (kThreads * kElementsPerAccess)) == 0,
       "Shape must be divisible thread count.");

     /// Tile shape with the contiguous extent expressed in vectors
     using ShapeVec = layout::PitchLinearShape<
       Shape::kContiguous / kElementsPerAccess,
       Shape::kStrided
     >;

     // Either the threads evenly tile one contiguous row of vectors, or a whole number of rows
     // is covered by the threads and that row count evenly divides the strided extent.
     static_assert(
       (kThreads < ShapeVec::kContiguous && (ShapeVec::kContiguous % kThreads) == 0) ||
       ((kThreads % ShapeVec::kContiguous) == 0 &&
        (ShapeVec::kStrided % (kThreads / ShapeVec::kContiguous)) == 0),
       "Shape must be divisible by number of iterations of each thread."
     );
   };

   /// Number of iterations by each thread.
   ///
   /// With at least as many threads as vectors in a row, each thread performs one contiguous
   /// iteration and walks the strided dimension. Otherwise each thread performs several
   /// contiguous iterations for every row. The inner conditional keeps the unselected
   /// alternative from dividing by zero when kThreads < ShapeVec::kContiguous.
   using Iterations = typename platform::conditional<
     (kThreads >= Detail::ShapeVec::kContiguous),
     layout::PitchLinearShape<
       1,
       ((kThreads >= Detail::ShapeVec::kContiguous)
          ? Detail::ShapeVec::kStrided / (kThreads / Detail::ShapeVec::kContiguous)
          : 0)
     >,
     layout::PitchLinearShape<
       Detail::ShapeVec::kContiguous / kThreads,
       Detail::ShapeVec::kStrided
     >
   >::type;

   /// Delta between successive accesses of one thread along each dimension
   using Delta = typename platform::conditional<
     (kThreads >= Detail::ShapeVec::kContiguous),
     layout::PitchLinearShape<
       1,
       kThreads / Detail::ShapeVec::kContiguous
     >,
     layout::PitchLinearShape<
       kThreads * kElementsPerAccess,
       1
     >
   >::type;

   /// Maps a thread ID to its first coordinate within the tile.
   ///
   /// The contiguous component is scaled from vectors to elements.
   CUTLASS_HOST_DEVICE
   static TensorCoord initial_offset(int thread_id) {

     int const vector_column = thread_id % Detail::ShapeVec::kContiguous;
     int const row = thread_id / Detail::ShapeVec::kContiguous;

     return TensorCoord(vector_column * kElementsPerAccess, row);
   }
 };

 /// Strip-mined tile policy in which threads are distributed along the contiguous dimension.
 ///
 /// initial_offset() places thread `t` at contiguous offset
 /// `t * Iterations::kContiguous * kElementsPerAccess` and strided offset 0.
 ///
 /// \tparam Shape              Tile shape (exposes kContiguous and kStrided)
 /// \tparam Threads            Number of participating threads
 /// \tparam ElementsPerAccess  Number of elements in each access
 template <
   typename Shape,
   int Threads,
   int ElementsPerAccess = 1
 >
 struct PitchLinearTilePolicyStripminedThreadContiguous
 {
   // The contiguous extent must split evenly into per-thread vector accesses; otherwise
   // Iterations::kContiguous below would silently truncate.
   static_assert((Shape::kContiguous % (Threads * ElementsPerAccess)) == 0,
                 "Contiguous shape must be divisible by (number of threads * elements per access)");

   /// Tensor coordinate
   using TensorCoord = layout::PitchLinearCoord;

   /// Number of threads total
   static int const kThreads = Threads;

   /// Number of elements per access
   static int const kElementsPerAccess = ElementsPerAccess;

   /// Number of iterations by each thread along each dimension
   using Iterations = layout::PitchLinearShape<
                       Shape::kContiguous / (kThreads * kElementsPerAccess),
                       Shape::kStrided>;

   /// Delta between successive accesses of one thread along each dimension
   using Delta = layout::PitchLinearShape<1, 1>;

   /// Maps a thread ID to its first coordinate within the tile.
   CUTLASS_HOST_DEVICE
   static TensorCoord initial_offset(int thread_id)
   {
     return TensorCoord(thread_id * Iterations::kContiguous * kElementsPerAccess, 0);
   }
 };

 /// Strip-mined tile policy in which threads are distributed along the strided dimension.
 ///
 /// initial_offset() places thread `t` at contiguous offset 0 and strided offset
 /// `t * Iterations::kStrided`.
 ///
 /// \tparam Shape              Tile shape (exposes kContiguous and kStrided)
 /// \tparam Threads            Number of participating threads
 /// \tparam ElementsPerAccess  Number of elements in each access
 template <
   typename Shape,
   int Threads,
   int ElementsPerAccess = 1
 >
 struct PitchLinearTilePolicyStripminedThreadStrided
 {
   // The strided extent must split evenly across threads; otherwise Iterations::kStrided
   // below would silently truncate.
   static_assert((Shape::kStrided % Threads == 0),
                 "Strided shape must be divisible by number of threads");

   /// Tensor coordinate
   using TensorCoord = layout::PitchLinearCoord;

   /// Number of threads total
   static int const kThreads = Threads;

   /// Number of elements per access
   static int const kElementsPerAccess = ElementsPerAccess;

   /// Number of iterations by each thread along each dimension
   using Iterations = layout::PitchLinearShape<
                       Shape::kContiguous / kElementsPerAccess,
                       Shape::kStrided / kThreads>;

   /// Delta between successive accesses of one thread along each dimension
   using Delta = layout::PitchLinearShape<1, 1>;

   /// Alias of the tile shape (not rescaled by kElementsPerAccess)
   using ShapeVec = Shape;

   /// Maps a thread ID to its first coordinate within the tile.
   CUTLASS_HOST_DEVICE
   static TensorCoord initial_offset(int thread_id)
   {

     return TensorCoord(0, thread_id * Iterations::kStrided);
   }
 };


 /// Warp-raked thread map over a pitch-linear tile.
 ///
 /// Threads within a warp follow the fixed `WarpThreadArrangement_`. Each warp covers one
 /// rectangular footprint of the tile and performs all of its iterations inside that footprint
 /// (see initial_offset()).
 ///
 /// \tparam Shape_                  Tile shape (exposes kContiguous and kStrided)
 /// \tparam Threads                 Number of participating threads
 /// \tparam WarpThreadArrangement_  Arrangement of threads within a warp (units of threads)
 /// \tparam ElementsPerAccess       Number of elements in each access
 template <
   typename Shape_,
   int Threads,
   typename WarpThreadArrangement_,
   int ElementsPerAccess = 1
 >
 struct PitchLinearWarpRakedThreadMap {

   /// Tensor coordinate
   using TensorCoord = layout::PitchLinearCoord;

   /// Tile shape
   using Shape = Shape_;

   /// Number of threads total
   static int const kThreads = Threads;

   /// Number of elements per access
   static int const kElementsPerAccess = ElementsPerAccess;

   /// Shape of a single access made by one thread
   using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;

   /// Internal details made public to facilitate introspection
   struct Detail {

     /// Fixed arrangement of threads within a warp (units of threads)
     using WarpThreadArrangement = WarpThreadArrangement_;

     /// Number of threads per warp
     static int const kWarpSize = WarpThreadArrangement::kCount;

     /// Number of participating warps
     static int const kWarpCount = kThreads / kWarpSize;

     static_assert(
       (Shape::kContiguous % kElementsPerAccess) == 0,
       "Shape must be divisible by vector length.");

     /// Tile shape with the contiguous extent expressed in accesses (vectors)
     using ShapeInAccesses = layout::PitchLinearShape<
       Shape::kContiguous / kElementsPerAccess,
       Shape::kStrided
     >;

     /// Total number of warp-level accesses needed along each dimension
     using WarpAccessIterations = layout::PitchLinearShape<
       ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,
       ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided
     >;

     // Warps are assigned to the strided dimension first; whatever is left over is spread
     // along the contiguous dimension.

     /// Number of warps along the strided dimension: the smaller of kWarpCount and the
     /// number of strided warp-level accesses
     static int const kWarpsStrided =
         (kWarpCount <= WarpAccessIterations::kStrided
              ? kWarpCount
              : WarpAccessIterations::kStrided);

     /// Number of warps along the contiguous dimension
     static int const kWarpsContiguous =
         (WarpAccessIterations::kStrided < kWarpCount
              ? kWarpCount / kWarpsStrided
              : 1);

     /// Arrangement of warps within the tile
     using WarpArrangement = layout::PitchLinearShape<
       kWarpsContiguous, kWarpsStrided
     >;
   };

   /// Number of iterations by each thread along each dimension
   using Iterations = layout::PitchLinearShape<
     Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
     Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
   >;

   static_assert(Iterations::kCount != 0,
     "Number of iterations must be non-zero");

   /// Delta between successive accesses of one thread (contiguous component in elements)
   using Delta = layout::PitchLinearShape<
     Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
     Detail::WarpThreadArrangement::kStrided
   >;

   /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
   CUTLASS_HOST_DEVICE
   static TensorCoord initial_offset(int thread_id) {

     int const warp_index = thread_id / Detail::kWarpSize;
     int const lane_index = thread_id % Detail::kWarpSize;

     // Extent of the region one warp covers across all of its iterations (units of vectors)
     TensorCoord footprint{
       Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
       Detail::WarpThreadArrangement::kStrided * Iterations::kStrided
     };

     // Position of this warp within the arrangement of warps
     TensorCoord warp_position{
       warp_index % Detail::kWarpsContiguous,
       warp_index / Detail::kWarpsContiguous
     };

     // Position of this lane within its warp (units of vectors)
     TensorCoord lane_position{
       lane_index % Detail::WarpThreadArrangement::kContiguous,
       lane_index / Detail::WarpThreadArrangement::kContiguous
     };

     // Offset of the thread within the threadblock tile (units of vectors)
     TensorCoord offset_in_vectors = footprint * warp_position + lane_position;

     // Same offset with the contiguous component converted to elements
     TensorCoord offset_in_elements{
       offset_in_vectors.contiguous() * kElementsPerAccess,
       offset_in_vectors.strided()
     };

     return offset_in_elements;
   }
 };


 /// Transposed view of an underlying thread map.
 ///
 /// The iteration counts and warp arrangement of `ThreadMap_` are taken with their contiguous
 /// and strided components exchanged, while threads inside a warp follow
 /// `WarpThreadArrangement_`.
 ///
 /// \tparam ThreadMap_              Underlying thread map (must expose Detail::kWarpsStrided and
 ///                                 Detail::kWarpsContiguous)
 /// \tparam WarpThreadArrangement_  Arrangement of threads within a warp (units of threads)
 template <typename ThreadMap_, typename WarpThreadArrangement_>
 struct TransposePitchLinearThreadMap {

   /// Underlying ThreadMap
   using ThreadMap = ThreadMap_;

   /// Tensor coordinate
   using TensorCoord = typename ThreadMap::TensorCoord;

   /// Tile shape
   using Shape = typename ThreadMap::Shape;

   /// Number of threads total
   static int const kThreads = ThreadMap::kThreads;

   /// Number of elements per access
   static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;

   /// Shape of a single access made by one thread
   using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;

   /// Internal details made public to facilitate introspection
   struct Detail {

     /// Fixed arrangement of threads within a warp (units of threads)
     using WarpThreadArrangement = WarpThreadArrangement_;

     /// Number of threads per warp
     static int const kWarpSize = WarpThreadArrangement::kCount;

     /// Number of participating warps
     static int const kWarpCount = kThreads / kWarpSize;

     static_assert((Shape::kContiguous % kElementsPerAccess) == 0,
                   "Shape must be divisible by vector length.");

     /// Arrangement of warps: the underlying map's arrangement with dimensions exchanged
     using WarpArrangement = layout::PitchLinearShape<
       ThreadMap::Detail::kWarpsStrided,
       ThreadMap::Detail::kWarpsContiguous
     >;
   };

   /// Number of iterations by each thread: the underlying map's iterations with dimensions
   /// exchanged
   using Iterations = layout::PitchLinearShape<
     ThreadMap::Iterations::kStrided,
     ThreadMap::Iterations::kContiguous
   >;

   static_assert(Iterations::kCount != 0, "Number of iterations must be non-zero");

   /// Delta between successive accesses of one thread (contiguous component in elements)
   using Delta = layout::PitchLinearShape<
     Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
     Detail::WarpThreadArrangement::kStrided
   >;

   /// Maps a thread ID to its first coordinate within the tile.
   CUTLASS_HOST_DEVICE
   static TensorCoord initial_offset(int thread_id) {

     int const warp_index = thread_id / Detail::kWarpSize;
     int const lane_index = thread_id % Detail::kWarpSize;

     // Extent of the region one warp covers across all of its iterations (units of vectors)
     layout::PitchLinearCoord footprint{
       Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
       Detail::WarpThreadArrangement::kStrided * Iterations::kStrided
     };

     // Position of this warp within the arrangement of warps. Unlike the non-transposed maps,
     // the quotient feeds the contiguous component and the remainder the strided component,
     // and both are taken with respect to WarpArrangement::kStrided.
     layout::PitchLinearCoord warp_position{
       warp_index / Detail::WarpArrangement::kStrided,
       warp_index % Detail::WarpArrangement::kStrided
     };

     // Position of this lane within its warp (units of vectors)
     layout::PitchLinearCoord lane_position{
       lane_index % Detail::WarpThreadArrangement::kContiguous,
       lane_index / Detail::WarpThreadArrangement::kContiguous
     };

     // Offset of the thread within the threadblock tile (units of vectors)
     layout::PitchLinearCoord offset_in_vectors = footprint * warp_position + lane_position;

     // Same offset with the contiguous component converted to elements
     layout::PitchLinearCoord offset_in_elements{
       offset_in_vectors.contiguous() * kElementsPerAccess,
       offset_in_vectors.strided()
     };

     return offset_in_elements;
   }
 };

 /// Transposed view of an underlying thread map for accesses of exactly one element.
 ///
 /// Iterations, Delta and the initial offset are those of `ThreadMap_` with the contiguous and
 /// strided components exchanged.
 ///
 /// \tparam ThreadMap_  Underlying thread map
 template <typename ThreadMap_>
 struct TransposePitchLinearThreadMapSimt {

     /// Underlying ThreadMap
     using ThreadMap = ThreadMap_;

     /// Tensor coordinate
     using TensorCoord = typename ThreadMap::TensorCoord;

     /// Tile shape
     using Shape = typename ThreadMap::Shape;

     /// Number of threads total
     static int const kThreads = ThreadMap::kThreads;

     /// Number of elements per access
     static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;

     static_assert(kElementsPerAccess == 1 , "Simt transpose requires elements per access to be 1");

     /// Shape of a single access, taken unchanged from the underlying map
     using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;

     /// Number of iterations by each thread (underlying iterations, dimensions exchanged)
     using Iterations = layout::PitchLinearShape<
         ThreadMap::Iterations::kStrided,
         ThreadMap::Iterations::kContiguous
     >;

     static_assert(Iterations::kCount != 0, "Number of iterations must be non-zero");

     /// Delta between accesses (underlying delta, dimensions exchanged)
     using Delta = layout::PitchLinearShape<
         ThreadMap::Delta::kStrided,
         ThreadMap::Delta::kContiguous
     >;

     /// Maps a thread ID to the underlying map's initial offset with its components exchanged.
     CUTLASS_HOST_DEVICE
     static TensorCoord initial_offset(int thread_id) {

         TensorCoord base_offset = ThreadMap::initial_offset(thread_id);

         return TensorCoord(base_offset.strided(), base_offset.contiguous());
     }
 };


 /// Warp-striped thread map over a pitch-linear tile.
 ///
 /// Threads within a warp follow the fixed `WarpThreadArrangement_`. In contrast to a footprint
 /// that spans all strided iterations, a warp's footprint here is a single thread-arrangement
 /// row group, and successive strided accesses of a thread are separated by
 /// `WarpThreadArrangement::kStrided * WarpArrangement::kStrided` (see Delta).
 ///
 /// \tparam Shape_                  Tile shape (exposes kContiguous and kStrided)
 /// \tparam Threads                 Number of participating threads
 /// \tparam WarpThreadArrangement_  Arrangement of threads within a warp (units of threads)
 /// \tparam ElementsPerAccess       Number of elements in each access
 template <
   typename Shape_,
   int Threads,
   typename WarpThreadArrangement_,
   int ElementsPerAccess = 1
 >
 struct PitchLinearWarpStripedThreadMap {

   /// Tensor coordinate
   using TensorCoord = layout::PitchLinearCoord;

   /// Tile shape
   using Shape = Shape_;

   /// Number of threads total
   static int const kThreads = Threads;

   /// Number of elements per access
   static int const kElementsPerAccess = ElementsPerAccess;

   /// Shape of a single access made by one thread
   using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;

   /// Internal details made public to facilitate introspection
   struct Detail {

     /// Fixed arrangement of threads within a warp (units of threads)
     using WarpThreadArrangement = WarpThreadArrangement_;

     /// Number of threads per warp
     static int const kWarpSize = WarpThreadArrangement::kCount;

     /// Number of participating warps
     static int const kWarpCount = kThreads / kWarpSize;

     static_assert(
       (Shape::kContiguous % kElementsPerAccess) == 0,
       "Shape must be divisible by vector length.");

     /// Tile shape with the contiguous extent expressed in accesses (vectors)
     using ShapeInAccesses = layout::PitchLinearShape<
       Shape::kContiguous / kElementsPerAccess,
       Shape::kStrided
     >;

     /// Total number of warp-level accesses needed along each dimension
     using WarpAccessIterations = layout::PitchLinearShape<
       ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,
       ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided
     >;

     // Warps are assigned to the strided dimension first, then to the contiguous dimension.

     /// Number of warps along the strided dimension
     static int const kWarpsStrided =
       (kWarpCount <= WarpAccessIterations::kStrided
         ? kWarpCount
         : (kWarpCount / WarpAccessIterations::kStrided));

     /// Number of warps along the contiguous dimension
     static int const kWarpsContiguous =
       (WarpAccessIterations::kStrided < kWarpCount
         ? WarpAccessIterations::kContiguous / kWarpsStrided
         : 1);

     /// Arrangement of warps within the tile
     using WarpArrangement = layout::PitchLinearShape<
       kWarpsContiguous, kWarpsStrided
     >;
   };

   /// Number of iterations by each thread along each dimension
   using Iterations = layout::PitchLinearShape<
     Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
     Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
   >;

   static_assert(Iterations::kCount != 0,
     "Number of iterations must be non-zero");

   /// Delta between successive accesses of one thread (contiguous component in elements).
   /// The strided delta steps over the rows handled by every warp along the strided dimension.
   using Delta = layout::PitchLinearShape<
     Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
     Detail::WarpThreadArrangement::kStrided * Detail::WarpArrangement::kStrided
   >;

   /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
   CUTLASS_HOST_DEVICE
   static TensorCoord initial_offset(int thread_id) {

     int const warp_index = thread_id / Detail::kWarpSize;
     int const lane_index = thread_id % Detail::kWarpSize;

     // Region used to place each warp (units of vectors). The strided extent is one thread
     // arrangement only; it is not multiplied by the strided iteration count.
     TensorCoord footprint{
       Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
       Detail::WarpThreadArrangement::kStrided
     };

     // Position of this warp within the arrangement of warps
     TensorCoord warp_position{
       warp_index % Detail::kWarpsContiguous,
       warp_index / Detail::kWarpsContiguous
     };

     // Position of this lane within its warp (units of vectors)
     TensorCoord lane_position{
       lane_index % Detail::WarpThreadArrangement::kContiguous,
       lane_index / Detail::WarpThreadArrangement::kContiguous
     };

     // Offset of the thread within the threadblock tile (units of vectors)
     TensorCoord offset_in_vectors = footprint * warp_position + lane_position;

     // Same offset with the contiguous component converted to elements
     TensorCoord offset_in_elements{
       offset_in_vectors.contiguous() * kElementsPerAccess,
       offset_in_vectors.strided()
     };

     return offset_in_elements;
   }
 };

 /// Primary template of the strip-mined thread map in which each thread accesses a 2D tile.
 /// Declared only; a definition is supplied by partial specialization on ThreadTileShape.
 template <typename Shape_, int Threads, typename ThreadTileShape>
 struct PitchLinear2DThreadTileStripminedThreadMap;


 /// Strip-mined thread map in which each thread accesses a 4x4 tile of elements.
 ///
 /// The threadblock tile is viewed in units of thread tiles; thread IDs advance along the
 /// contiguous dimension first and then along the strided dimension (see initial_offset()).
 ///
 /// \tparam Shape_   Tile shape (exposes kContiguous and kStrided)
 /// \tparam Threads  Number of participating threads
 template <
   typename Shape_,
   int Threads
 >
 struct PitchLinear2DThreadTileStripminedThreadMap <Shape_, Threads, cutlass::layout::PitchLinearShape<4, 4>>{

   /// Tensor coordinate
   using TensorCoord = layout::PitchLinearCoord;

   /// Tile shape
   using Shape = Shape_;

   /// Shape of the tile accessed by one thread; fixed to 4x4 by this specialization
   using ThreadAccessShape = cutlass::layout::PitchLinearShape<4, 4>;

   /// Number of threads total
   static int const kThreads = Threads;

   /// Number of elements per access: the contiguous extent of the thread tile
   static int const kElementsPerAccess = ThreadAccessShape::kContiguous;

   static_assert((kElementsPerAccess % 4) == 0 , "kElementsPerAccess, needs to be multiple of 4 (32bits)");

   /// Internal implementation details
   struct Detail {

     static_assert((ThreadAccessShape::kContiguous % 4) == 0, "ThreadAccessShape, needs to be multiple of 4");

     // The contiguous extent must hold a whole number of thread tiles.
     static_assert((Shape::kContiguous % ThreadAccessShape::kContiguous) == 0, "");

     static_assert(
       ((Shape::kContiguous * Shape::kStrided) % (kThreads * ThreadAccessShape::kCount)) == 0,
       "Shape must be divisible thread count * accesses per thread.");

     /// Tile shape expressed in units of thread tiles along both dimensions
     using ShapeVec = layout::PitchLinearShape<
       Shape::kContiguous / ThreadAccessShape::kContiguous,
       Shape::kStrided / ThreadAccessShape::kStrided
     >;

     // Either the threads evenly tile one contiguous row of thread tiles, or a whole number of
     // rows is covered by the threads and that row count evenly divides the strided extent.
     static_assert(
       (kThreads < ShapeVec::kContiguous && (ShapeVec::kContiguous % kThreads) == 0) ||
       ((kThreads % ShapeVec::kContiguous) == 0 &&
        (ShapeVec::kStrided % (kThreads / ShapeVec::kContiguous)) == 0),
       "Shape must be divisible by number of iterations of each thread."
     );
   };

   /// Number of iterations by each thread.
   ///
   /// The inner conditional keeps the unselected alternative from dividing by zero when
   /// kThreads < ShapeVec::kContiguous.
   using Iterations = typename platform::conditional<
     (kThreads >= Detail::ShapeVec::kContiguous),
     layout::PitchLinearShape<
       1,
       ((kThreads >= Detail::ShapeVec::kContiguous)
          ? Detail::ShapeVec::kStrided / (kThreads / Detail::ShapeVec::kContiguous)
          : 0)
     >,
     layout::PitchLinearShape<
       Detail::ShapeVec::kContiguous / kThreads,
       Detail::ShapeVec::kStrided
     >
   >::type;

   /// Delta between successive accesses of one thread along each dimension
   using Delta = typename platform::conditional<
     (kThreads >= Detail::ShapeVec::kContiguous),
     layout::PitchLinearShape<
       Shape::kContiguous,
       kThreads * ThreadAccessShape::kStrided / Detail::ShapeVec::kContiguous
     >,
     layout::PitchLinearShape<
       kThreads * ThreadAccessShape::kContiguous,
       1
     >
   >::type;

   /// Maps a thread ID to the first coordinate of its thread tile (units of elements).
   CUTLASS_HOST_DEVICE
   static TensorCoord initial_offset(int thread_id) {

     int const tile_column = thread_id % Detail::ShapeVec::kContiguous;
     int const tile_row = thread_id / Detail::ShapeVec::kContiguous;

     return TensorCoord(
       tile_column * ThreadAccessShape::kContiguous,
       tile_row * ThreadAccessShape::kStrided);
   }
 };

 /// Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping.
 ///
 /// Iterations, Delta and the initial offset are those of `ThreadMap_` with the contiguous and
 /// strided components exchanged.
 ///
 /// \tparam ThreadMap_  Underlying thread map; must access more than one element per access
 template <typename ThreadMap_>
 struct TransposePitchLinearThreadMap2DThreadTile {

     /// Underlying ThreadMap
     using ThreadMap = ThreadMap_;

     /// Tensor coordinate
     using TensorCoord = typename ThreadMap::TensorCoord;

     /// Tile shape
     using Shape = typename ThreadMap::Shape;

     /// Number of threads total
     static int const kThreads = ThreadMap::kThreads;

     /// Number of elements per access
     static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;


     // The previous diagnostic was copied from the SIMT variant and contradicted this condition.
     static_assert(kElementsPerAccess > 1 ,
         "2D thread-tile transpose requires elements per access to be greater than 1");

     /// Number of iterations by each thread (underlying iterations, dimensions exchanged)
     using Iterations =
         layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
         ThreadMap::Iterations::kContiguous>;

     static_assert(Iterations::kCount, "Number of iterations must be non-zero");

     /// Shape of a single access, taken unchanged from the underlying map
     using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;

     /// Delta between accesses (underlying delta, dimensions exchanged)
     using Delta =
         layout::PitchLinearShape<ThreadMap::Delta::kStrided,
         ThreadMap::Delta::kContiguous>;


     /// Maps a thread ID to the underlying map's initial offset with its components exchanged.
     CUTLASS_HOST_DEVICE
         static TensorCoord initial_offset(int thread_id) {

         TensorCoord coord = ThreadMap::initial_offset(thread_id);
         return TensorCoord(
             coord.strided(),
             coord.contiguous()
         );
     }
 };


 } // namespace transform
 } // namespace cutlass

cutlass::layout::PitchLinearShape::kCount
static int const kCount
Definition: pitch_linear.h:46

cutlass
Definition: aligned_buffer.h:35

cutlass::transform::PitchLinear2DThreadTileStripminedThreadMap
Definition: pitch_linear_thread_map.h:623

cutlass::layout::PitchLinearCoord
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52

tensor_ref.h
Defines a structure containing strides, bounds, and a pointer to tensor data.

cutlass::transform::PitchLinearStripminedThreadMap::Shape
Shape_ Shape
Tile shape.
Definition: pitch_linear_thread_map.h:65

coord.h
A Coord is a coordinate of arbitrary rank into a tensor or matrix.

cutlass::transform::PitchLinearStripminedThreadMap::TensorCoord
layout::PitchLinearCoord TensorCoord
Tensor coordinate.
Definition: pitch_linear_thread_map.h:62

cutlass::transform::TransposePitchLinearThreadMapSimt::ThreadMap
ThreadMap_ ThreadMap
Underlying ThreadMap.
Definition: pitch_linear_thread_map.h:433

cutlass::transform::TransposePitchLinearThreadMapSimt
Definition: pitch_linear_thread_map.h:431

cutlass::transform::PitchLinearWarpStripedThreadMap::Detail::WarpThreadArrangement
WarpThreadArrangement_ WarpThreadArrangement
Fixed arrangement of threads within a warp (units of threads).
Definition: pitch_linear_thread_map.h:512

cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::ThreadAccessShape
typename ThreadMap::ThreadAccessShape ThreadAccessShape
Delta between accesses (units of elements, concept: PitchLinearShape)
Definition: pitch_linear_thread_map.h:741

cutlass::transform::PitchLinearStripminedThreadMap::kElementsPerAccess
static int const kElementsPerAccess
Extract vector length from Layout.
Definition: pitch_linear_thread_map.h:71

cutlass::transform::PitchLinearWarpStripedThreadMap::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Maps thread ID to a coordinate offset within the tensor's logical coordinate space.
Definition: pitch_linear_thread_map.h:569

cutlass::transform::PitchLinearWarpStripedThreadMap::Shape
Shape_ Shape
Tile shape.
Definition: pitch_linear_thread_map.h:497

cutlass::transform::TransposePitchLinearThreadMapSimt::Shape
typename ThreadMap::Shape Shape
Tile shape.
Definition: pitch_linear_thread_map.h:439

tensor_view.h
Defines a structure containing strides and a pointer to tensor data.

cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided::ShapeVec
Shape ShapeVec
Definition: pitch_linear_thread_map.h:184

cutlass::transform::PitchLinear2DThreadTileStripminedThreadMap< Shape_, Threads, cutlass::layout::PitchLinearShape< 4, 4 > >::Shape
Shape_ Shape
Tile shape.
Definition: pitch_linear_thread_map.h:636

cutlass::transform::TransposePitchLinearThreadMapSimt::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:468

cutlass::layout::PitchLinearShape
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

cutlass::transform::TransposePitchLinearThreadMap
Definition: pitch_linear_thread_map.h:333

predicate_vector.h
Defines container classes and iterators for managing a statically sized vector of boolean predicates...

cutlass::transform::PitchLinearWarpRakedThreadMap::Shape
Shape_ Shape
Tile shape.
Definition: pitch_linear_thread_map.h:211

cutlass::layout::PitchLinearShape::kStrided
static int const kStrided
Definition: pitch_linear.h:45

cutlass::transform::PitchLinear2DThreadTileStripminedThreadMap< Shape_, Threads, cutlass::layout::PitchLinearShape< 4, 4 > >::Iterations
typename platform::conditional< Threads >=Detail::ShapeVec::kContiguous, layout::PitchLinearShape< 1,(Threads >=Detail::ShapeVec::kContiguous?Detail::ShapeVec::kStrided/(kThreads/Detail::ShapeVec::kContiguous):0) >, layout::PitchLinearShape< Detail::ShapeVec::kContiguous/kThreads, Detail::ShapeVec::kStrided > >::type Iterations
Number of iterations by each thread.
Definition: pitch_linear_thread_map.h:684

cutlass::layout::PitchLinearShape::kContiguous
static int const kContiguous
Definition: pitch_linear.h:44

cutlass::transform::PitchLinearWarpStripedThreadMap
Definition: pitch_linear_thread_map.h:491

cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:187

cutlass::transform::PitchLinearWarpRakedThreadMap::Detail::WarpThreadArrangement
WarpThreadArrangement_ WarpThreadArrangement
Fixed arrangement of threads within a warp (units of threads).
Definition: pitch_linear_thread_map.h:226

cutlass::transform::TransposePitchLinearThreadMap2DThreadTile
Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping.
Definition: pitch_linear_thread_map.h:713

cutlass::transform::TransposePitchLinearThreadMap::Detail::WarpThreadArrangement
WarpThreadArrangement_ WarpThreadArrangement
Fixed arrangement of threads within a warp (units of threads).
Definition: pitch_linear_thread_map.h:355

cutlass::transform::TransposePitchLinearThreadMap::Detail
Internal details made public to facilitate introspection Iterations along each dimension (concept: Pi...
Definition: pitch_linear_thread_map.h:353

cutlass::transform::PitchLinearWarpRakedThreadMap
Definition: pitch_linear_thread_map.h:205

cutlass::transform::TransposePitchLinearThreadMap::ThreadMap
ThreadMap_ ThreadMap
Underlying ThreadMap.
Definition: pitch_linear_thread_map.h:335

cutlass::transform::PitchLinearStripminedThreadMap::Detail
Internal implementation details.
Definition: pitch_linear_thread_map.h:77

cutlass::transform::TransposePitchLinearThreadMap::Shape
typename ThreadMap::Shape Shape
Tile shape.
Definition: pitch_linear_thread_map.h:341

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

cutlass::layout::PitchLinearCoord::contiguous
CUTLASS_HOST_DEVICE Index const & contiguous() const
Returns the contiguous dimension.
Definition: pitch_linear.h:89

cutlass::platform::conditional
std::conditional (true specialization)
Definition: platform.h:325

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:157

cutlass::transform::TransposePitchLinearThreadMap::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:389

cutlass::transform::PitchLinearStripminedThreadMap::kThreads
static int const kThreads
Number of threads total.
Definition: pitch_linear_thread_map.h:68

cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:751

cutlass::transform::PitchLinearWarpRakedThreadMap::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Maps thread ID to a coordinate offset within the tensor's logical coordinate space.
Definition: pitch_linear_thread_map.h:285

cutlass::transform::PitchLinearStripminedThreadMap::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:127

cutlass::transform::PitchLinearWarpRakedThreadMap::Detail
Internal details made public to facilitate introspection Iterations along each dimension (concept: Pi...
Definition: pitch_linear_thread_map.h:223

cutlass::transform::TransposePitchLinearThreadMapSimt::TensorCoord
typename ThreadMap::TensorCoord TensorCoord
Tensor coordinate.
Definition: pitch_linear_thread_map.h:436

pitch_linear.h
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.

cutlass::transform::PitchLinearStripminedThreadMap::Iterations
typename platform::conditional< Threads >=Detail::ShapeVec::kContiguous, layout::PitchLinearShape< 1,(Threads >=Detail::ShapeVec::kContiguous?Detail::ShapeVec::kStrided/(kThreads/Detail::ShapeVec::kContiguous):0) >, layout::PitchLinearShape< Detail::ShapeVec::kContiguous/kThreads, Detail::ShapeVec::kStrided > >::type Iterations
Number of iterations by each thread.
Definition: pitch_linear_thread_map.h:108

cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::Shape
typename ThreadMap::Shape Shape
Tile shape.
Definition: pitch_linear_thread_map.h:721

cutlass::transform::PitchLinearStripminedThreadMap::Delta
typename platform::conditional< Threads >=Detail::ShapeVec::kContiguous, layout::PitchLinearShape< 1, kThreads/Detail::ShapeVec::kContiguous >, layout::PitchLinearShape< kThreads *kElementsPerAccess, 1 > >::type Delta
Definition: pitch_linear_thread_map.h:122

cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided
Definition: pitch_linear_thread_map.h:168

cutlass::transform::TransposePitchLinearThreadMap::TensorCoord
typename ThreadMap::TensorCoord TensorCoord
Tensor coordinate.
Definition: pitch_linear_thread_map.h:338

cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous
Definition: pitch_linear_thread_map.h:140

cutlass.h
Basic include for CUTLASS.

cutlass::transform::PitchLinearStripminedThreadMap
Definition: pitch_linear_thread_map.h:59

cutlass::layout::PitchLinearCoord::strided
CUTLASS_HOST_DEVICE Index const & strided() const
Returns the column of the coordinate.
Definition: pitch_linear.h:97

cutlass::transform::PitchLinear2DThreadTileStripminedThreadMap< Shape_, Threads, cutlass::layout::PitchLinearShape< 4, 4 > >::Delta
typename platform::conditional< Threads >=Detail::ShapeVec::kContiguous, layout::PitchLinearShape< Shape::kContiguous, kThreads *ThreadAccessShape::kStrided/Detail::ShapeVec::kContiguous >, layout::PitchLinearShape< kThreads *ThreadAccessShape::kContiguous, 1 > >::type Delta
Definition: pitch_linear_thread_map.h:698

cutlass::transform::PitchLinearWarpStripedThreadMap::Detail
Internal details made public to facilitate introspection Iterations along each dimension (concept: Pi...
Definition: pitch_linear_thread_map.h:509

cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::TensorCoord
typename ThreadMap::TensorCoord TensorCoord
Tensor coordinate.
Definition: pitch_linear_thread_map.h:718

cutlass::transform::PitchLinear2DThreadTileStripminedThreadMap< Shape_, Threads, cutlass::layout::PitchLinearShape< 4, 4 > >::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:703

cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::ThreadMap
ThreadMap_ ThreadMap
Underlying ThreadMap.
Definition: pitch_linear_thread_map.h:715

cutlass::transform::TransposePitchLinearThreadMapSimt::ThreadAccessShape
typename ThreadMap::ThreadAccessShape ThreadAccessShape
Delta between accesses (units of elements, concept: PitchLinearShape)
Definition: pitch_linear_thread_map.h:458