CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
regular_tile_iterator_pitch_linear_2dthreadtile.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/layout/pitch_linear.h"

#include "regular_tile_iterator.h"
45 
46 namespace cutlass {
47 namespace transform {
48 namespace threadblock {
49 
/// Regular tile iterator dedicated to 2D thread-tiled thread maps.
///
/// Primary template; the partial specializations below implement the iterator
/// for pitch-linear and 4-element-interleaved row/column-major layouts.
///
/// NOTE(review): the forward-declaration line was dropped by the extraction
/// and is restored here -- the specializations below require it.
template <
  typename Shape,
  typename Element,
  typename Layout,
  int AdvanceRank,
  typename ThreadMap,
  int Alignment = sizeof_bits<Element>::value * ThreadMap::kElementsPerAccess / 8
>
class RegularTileIterator2dThreadTile;
61 
/// Regular tile iterator specialized for pitch-linear memory with a 2D
/// thread-tiled thread map.
///
/// The tensor is viewed as interleaved: the constructor's `interleave`
/// argument divides the strided extent, and all strided increments are
/// computed in units of interleaved rows.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator2dThreadTile<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, Alignment> {
public:

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::PitchLinear;
  static int const kAdvanceRank = AdvanceRank;
  using ThreadMap = ThreadMap_;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  /// Fragment holds one access per (contiguous x strided) iteration.
  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;

  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1,
    "Advance rank may only be along the contiguous or strided dimensions.");

private:

  //
  // Types
  //

  /// One thread access worth of elements, aligned for vectorized transfers.
  /// NOTE(review): reconstructed -- the typedef line was lost in extraction;
  /// confirm element count against the upstream header.
  using AccessType = AlignedArray<Element, ThreadMap::ThreadAccessShape::kCount, kAlignment>;

  //
  // Data members
  //

  /// Pointer to memory (byte-addressed)
  uint8_t *pointer_;

  /// Stride quantity, in elements, of one interleaved row
  Index stride_;

  /// Byte increment between successive strided iterations
  Index increment_strided_;

  /// Byte increment applied by operator++ / operator-- (along kAdvanceRank)
  Index increment_advance_;

public:

  /// Default constructor produces a null iterator.
  CUTLASS_DEVICE
  RegularTileIterator2dThreadTile(): pointer_(nullptr), increment_strided_(0), increment_advance_(0) { }

  /// Constructs an iterator from a TensorRef, thread index, and interleave factor.
  CUTLASS_DEVICE
  RegularTileIterator2dThreadTile(
    TensorRef const &ref,
    int thread_idx,
    int interleave
  ){

    // Per-thread starting coordinate supplied by the thread map.
    TensorCoord t = ThreadMap::initial_offset(thread_idx);

    // Element offset: contiguous component scaled by interleave, strided
    // component scaled by the interleaved stride.
    long int offset = t[0] * interleave + t[1] * ref.stride()[0] / interleave;
    pointer_ = reinterpret_cast<uint8_t *>(ref.data() + offset);

    stride_ = ref.stride()[0] / interleave;
    increment_strided_ = (ref.stride()[0] * sizeof_bits<Element>::value / 8) * ThreadMap::Delta::kStrided / interleave;

    increment_advance_ =
      (kAdvanceRank == 0 ?
        Shape::kContiguous * sizeof_bits<Element>::value / 8 :
        Shape::kStrided * (ref.stride()[0] * sizeof_bits<Element>::value / 8) / interleave);
  }

  /// Loads a fragment from memory at the pointer plus an element offset.
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
    uint8_t const *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_pointer);

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int idx = c + s * ThreadMap::Iterations::kContiguous;
        frag_ptr[idx] = access_ptr[c * ThreadMap::Delta::kContiguous / ThreadMap::ThreadAccessShape::kStrided];
      }

      // Avoid advancing past the final strided iteration.
      if (s + 1 < ThreadMap::Iterations::kStrided) {
        byte_pointer += increment_strided_;
      }
    }
  }

  /// Loads a fragment at a whole-tile offset.
  /// NOTE(review): the contiguous term here is divided by kElementsPerAccess
  /// while the analogous store() below is not -- asymmetry preserved from the
  /// original; confirm intended semantics upstream.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, TensorCoord const & tile_offset) {
    load_with_pointer_offset(
      frag,
      tile_offset.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
      tile_offset.strided() * Shape::kStrided * stride_
    );
  }

  /// Loads a fragment at the iterator's current position.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment to memory at the pointer plus an element offset.
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {

    AccessType const *frag_ptr = reinterpret_cast<AccessType const*>(&frag);
    uint8_t *byte_pointer = pointer_ + pointer_offset * sizeof_bits<Element>::value / 8;

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {

      AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_pointer);

      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {

        int idx = c + s * ThreadMap::Iterations::kContiguous;
        access_ptr[c * ThreadMap::Delta::kContiguous / ThreadMap::ThreadAccessShape::kStrided] = frag_ptr[idx];
      }

      // Avoid advancing past the final strided iteration.
      if (s + 1 < ThreadMap::Iterations::kStrided) {
        byte_pointer += increment_strided_;
      }
    }
  }

  /// Stores a fragment at a whole-tile offset.
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag, TensorCoord const & tile_offset) {
    store_with_pointer_offset(
      frag,
      tile_offset.contiguous() * Shape::kContiguous + tile_offset.strided() * Shape::kStrided * stride_
    );
  }

  /// Stores a fragment at the iterator's current position.
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }

  /// Advances the pointer one tile along the advance dimension.
  CUTLASS_HOST_DEVICE
  RegularTileIterator2dThreadTile &operator++() {
    pointer_ += increment_advance_;
    return *this;
  }

  /// Retreats the pointer one tile along the advance dimension.
  CUTLASS_HOST_DEVICE
  RegularTileIterator2dThreadTile &operator--() {
    pointer_ -= increment_advance_;
    return *this;
  }

  /// Adds a pointer offset.
  /// NOTE(review): offset is added to a byte pointer without scaling by
  /// sizeof(Element) -- preserved as-is from the original; callers appear to
  /// pass pre-scaled byte offsets (see add_tile_offset).
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    pointer_ += pointer_offset;
  }

  /// Adds a whole-tile offset, converting coordinates to a byte offset.
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    int offset = sizeof_bits<Element>::value *
      (coord.contiguous() * Shape::kContiguous + coord.strided() * Shape::kStrided * stride_) / 8;
    add_pointer_offset(offset);
  }

};
250 
252 
/// Regular tile iterator specialized for interleaved-32 row-major layout,
/// implemented by adapting coordinates onto the pitch-linear specialization.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator2dThreadTile<Shape_, Element_, layout::RowMajorInterleaved<4>, AdvanceRank, ThreadMap_, Alignment> {
public:

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorInterleaved<4>;
  static int const kAdvanceRank = AdvanceRank;
  using ThreadMap = ThreadMap_;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;

  /// Underlying pitch-linear iterator: row-major maps (column, row) onto
  /// (contiguous, strided), so the advance rank is flipped.
  using Underlying = RegularTileIterator2dThreadTile<
    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
    Element,
    layout::PitchLinear,
    (kAdvanceRank == 0 ? 1 : 0),
    ThreadMap,
    kAlignment
  >;

  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1,
    "Advance rank may only be along the row or column dimensions.");

private:

  /// Underlying pitch-linear tile iterator
  Underlying iterator_;

public:

  /// Default constructor
  CUTLASS_DEVICE
  RegularTileIterator2dThreadTile() { }

  /// Constructs an iterator from a TensorRef; interleave factor fixed at 4.
  CUTLASS_DEVICE
  RegularTileIterator2dThreadTile(
    TensorRef const &ref,
    int thread_idx
  ):
    iterator_({ref.data(), ref.stride()}, thread_idx, 4) {

  }

  /// Loads a fragment at an element offset.
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment at a whole-tile offset.
  /// Fix: the original passed a braced (column, row) pair to
  /// load_with_pointer_offset, whose parameter is a scalar Index -- ill-formed
  /// on instantiation. Forward to the coordinate overload instead.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, TensorCoord const & tile_offset) {
    iterator_.load(frag, {tile_offset.column(), tile_offset.row()});
  }

  /// Loads a fragment at the iterator's current position.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    iterator_.load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment at an element offset.
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment at a whole-tile offset.
  /// Fix: forward to the coordinate overload (see load above).
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag, TensorCoord const & tile_offset) {
    iterator_.store(frag, {tile_offset.column(), tile_offset.row()});
  }

  /// Stores a fragment at the iterator's current position.
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    iterator_.store_with_pointer_offset(frag, 0);
  }

  /// Advances the pointer one tile.
  CUTLASS_HOST_DEVICE
  RegularTileIterator2dThreadTile &operator++() {
    ++iterator_;
    return *this;
  }

  /// Retreats the pointer one tile.
  CUTLASS_HOST_DEVICE
  RegularTileIterator2dThreadTile &operator--() {
    --iterator_;
    return *this;
  }

  /// Adds a pointer offset in units of Element.
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a whole-tile offset, swapping to pitch-linear coordinate order.
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }

};
372 
374 
/// Regular tile iterator specialized for interleaved-32 column-major layout,
/// implemented by adapting coordinates onto the pitch-linear specialization.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator2dThreadTile<Shape_, Element_, layout::ColumnMajorInterleaved<4>, AdvanceRank, ThreadMap_, Alignment> {
public:

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorInterleaved<4>;
  static int const kAdvanceRank = AdvanceRank;
  using ThreadMap = ThreadMap_;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using Fragment = Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;

  /// NOTE(review): reconstructed from a partially extracted typedef spanning
  /// two lines in the original -- confirm the thread-map alias against the
  /// upstream header before relying on it.
  using PitchLinearThreadMap = PitchLinearStripminedThreadMap<
    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
    ThreadMap::kThreads, ThreadMap::ThreadAccessShape::kCount >;

  /// Underlying pitch-linear iterator: column-major maps (row, column) onto
  /// (contiguous, strided), so the advance rank is preserved.
  /// Fix: pass kAlignment explicitly for consistency with the row-major
  /// specialization; the original fell back to the primary template's
  /// default alignment.
  using Underlying = RegularTileIterator2dThreadTile<
    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
    Element,
    layout::PitchLinear,
    (kAdvanceRank == 0 ? 0 : 1),
    ThreadMap,
    kAlignment
  >;

  static_assert(kAdvanceRank == 0 || kAdvanceRank == 1,
    "Advance rank may only be along the row or column dimensions.");

private:

  /// Underlying pitch-linear tile iterator
  Underlying iterator_;

public:

  /// Default constructor
  CUTLASS_DEVICE
  RegularTileIterator2dThreadTile() { }

  /// Constructs an iterator from a TensorRef; interleave factor fixed at 4.
  CUTLASS_DEVICE
  RegularTileIterator2dThreadTile(
    TensorRef const &ref,
    int thread_idx
  ):
    iterator_({ref.data(), ref.stride()}, thread_idx, 4) {

  }

  /// Loads a fragment at an element offset.
  CUTLASS_HOST_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment at a whole-tile offset.
  /// Fix: the original passed a braced (row, column) pair to
  /// load_with_pointer_offset, whose parameter is a scalar Index -- ill-formed
  /// on instantiation. Forward to the coordinate overload instead.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag, TensorCoord const & tile_offset) {
    iterator_.load(frag, {tile_offset.row(), tile_offset.column()});
  }

  /// Loads a fragment at the iterator's current position.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) {
    iterator_.load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment at an element offset.
  CUTLASS_HOST_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment at a whole-tile offset.
  /// Fix: forward to the coordinate overload (see load above).
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag, TensorCoord const & tile_offset) {
    iterator_.store(frag, {tile_offset.row(), tile_offset.column()});
  }

  /// Stores a fragment at the iterator's current position.
  CUTLASS_HOST_DEVICE
  void store(Fragment const &frag) {
    iterator_.store_with_pointer_offset(frag, 0);
  }

  /// Advances the pointer one tile.
  CUTLASS_HOST_DEVICE
  RegularTileIterator2dThreadTile &operator++() {
    ++iterator_;
    return *this;
  }

  /// Retreats the pointer one tile.
  CUTLASS_HOST_DEVICE
  RegularTileIterator2dThreadTile &operator--() {
    --iterator_;
    return *this;
  }

  /// Adds a pointer offset in units of Element.
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a whole-tile offset, swapping to pitch-linear coordinate order.
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }

};
496 
498 
499 } // namespace threadblock
500 } // namespace transform
501 } // namespace cutlass
502 
CUTLASS_HOST_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:441
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:355
CUTLASS_HOST_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:459
Definition: aligned_buffer.h:35
CUTLASS_DEVICE RegularTileIterator2dThreadTile(TensorRef const &ref, int thread_idx, int interleave)
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:121
CUTLASS_HOST_DEVICE RegularTileIterator2dThreadTile & operator--()
Advances the pointer.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:230
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52
static int const value
Definition: numeric_types.h:43
Defines a structure containing strides, bounds, and a pointer to tensor data.
CUTLASS_HOST_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:183
Array< Element, ThreadMap::Iterations::kCount *ThreadMap::ThreadAccessShape::kCount > Fragment
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:277
CUTLASS_HOST_DEVICE Element * data() const
Returns the pointer to referenced data.
Definition: tensor_ref.h:254
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:249
Mapping function for pitch-linear memory.
Definition: pitch_linear.h:163
CUTLASS_DEVICE void add_tile_offset(TensorCoord const &coord)
Adds a tile offset.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:367
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:352
CUTLASS_HOST_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:329
CUTLASS_HOST_DEVICE void load(Fragment &frag)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:177
Aligned array type.
Definition: array.h:511
CUTLASS_HOST_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:435
typename Layout::TensorCoord TensorCoord
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:84
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:246
CUTLASS_HOST_DEVICE void store(Fragment const &frag)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:341
CUTLASS_HOST_DEVICE RegularTileIterator2dThreadTile & operator--()
Advances the pointer.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:354
CUTLASS_HOST_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:311
CUTLASS_HOST_DEVICE void load(Fragment &frag)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:447
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
int64_t LongIndex
Long index type used for offsets.
Definition: pitch_linear.h:175
CUTLASS_HOST_DEVICE Stride stride() const
Returns the layout object&#39;s stride vector.
Definition: tensor_ref.h:277
Defines the size of an element in bits.
Definition: numeric_types.h:42
CUTLASS_DEVICE void add_tile_offset(TensorCoord const &coord)
Adds a tile offset.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:243
CUTLASS_HOST_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:453
#define nullptr
nullptr
Definition: platform.h:144
CUTLASS_HOST_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:317
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:237
CUTLASS_HOST_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:335
CUTLASS_DEVICE RegularTileIterator2dThreadTile(TensorRef const &ref, int thread_idx)
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:301
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:485
CUTLASS_HOST_DEVICE RegularTileIterator2dThreadTile & operator++()
Advances the pointer.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:347
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
#define static_assert(__e, __m)
Definition: platform.h:153
int32_t Index
Index type used for coordinates.
Definition: pitch_linear.h:172
CUTLASS_DEVICE RegularTileIterator2dThreadTile(TensorRef const &ref, int thread_idx)
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:425
CUTLASS_HOST_DEVICE void store(Fragment const &frag)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:465
CUTLASS_HOST_DEVICE RegularTileIterator2dThreadTile & operator++()
Advances the pointer.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:223
Array< Element, ThreadMap::Iterations::kCount *ThreadMap::ThreadAccessShape::kCount > Fragment
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:86
CUTLASS_HOST_DEVICE void store(Fragment const &frag)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:217
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:142
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:361
Templates implementing storing of tiles from pitch-linear rank=2 tensors.
CUTLASS_HOST_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset)
Stores a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:208
Defines layout functions used by TensorRef and derived classes.
CUTLASS_HOST_DEVICE RegularTileIterator2dThreadTile & operator++()
Advances the pointer.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:471
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
CUTLASS_HOST_DEVICE void load(Fragment &frag)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:323
CUTLASS_HOST_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset)
Loads a fragment.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:167
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:59
Basic include for CUTLASS.
Definition: matrix_coord.h:39
Definition: pitch_linear_thread_map.h:59
Array< Element, ThreadMap::Iterations::kCount *ThreadMap::ThreadAccessShape::kCount > Fragment
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:399
CUTLASS_DEVICE void add_tile_offset(TensorCoord const &coord)
Adds a tile offset.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:491
CUTLASS_HOST_DEVICE RegularTileIterator2dThreadTile & operator--()
Advances the pointer.
Definition: regular_tile_iterator_pitch_linear_2dthreadtile.h:478