cutlass/epilogue_2threadblock_2predicated__tile__iterator_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/array.h"
 #include "cutlass/layout/matrix.h"
 #include "cutlass/matrix_shape.h"
 #include "cutlass/tensor_ref.h"

 #include "cutlass/transform/pitch_linear_thread_map.h"
 #include "cutlass/epilogue/threadblock/output_tile_thread_map.h"


 namespace cutlass {


 namespace epilogue {
 namespace threadblock {


 /// Tile iterator used to load and store epilogue output tiles in row-major memory. Every access
 /// is predicated: columns are checked once at construction (stored in a Mask), rows are checked
 /// against the extent on each load()/store().
 ///
 /// Template parameters:
 ///   ThreadMap_ - thread-to-access mapping. As used below it must provide the nested types
 ///                Shape, Iterations, Delta and Count, the constants kElementsPerAccess and
 ///                kThreads, and a static initial_offset(thread_idx) returning a MatrixCoord.
 ///   Element_   - data type of one tensor element.
 template <
   typename ThreadMap_,
   typename Element_
 >
 class PredicatedTileIterator {
 public:
   using ThreadMap = ThreadMap_;
   using Shape = typename ThreadMap::Shape;

   using Element = Element_;

   using Layout = layout::RowMajor;
   using TensorRef = TensorRef<Element, Layout>;
   using ConstTensorRef = typename TensorRef::ConstTensorRef;

   using Index = typename Layout::Index;
   using LongIndex = typename Layout::LongIndex;
   using TensorCoord = MatrixCoord;

   static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
   static int const kThreads = ThreadMap::kThreads;
   static int const kIterations = ThreadMap::Count::kTile;

   static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
   static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
   static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
   static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");

   /// Fragment object: one AccessType per (cluster, group, row, column) iteration of a thread
   using Fragment = Array<
     Element,
     ThreadMap::Iterations::kColumn *
     ThreadMap::Iterations::kRow *
     ThreadMap::Iterations::kGroup *
     ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;

   /// Memory access size
   using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;

   //
   // Parameters struct
   //

   /// Precomputed byte strides used by load(), store() and operator++()
   struct Params {

     //
     // Data members
     //

     Index stride;             ///< stride in bytes between rows

     Index increment_row;      ///< increment quantity (in bytes) to advance when moving between rows
     Index increment_group;    ///< increment quantity (in bytes) to advance when moving to the next group
     Index increment_cluster;  ///< increment quantity (in bytes) to advance when moving to the next cluster

     Index advance_row;        ///< amount to add to move to the next 'row' position
     Index advance_group;      ///< amount to add to move to the next 'group' position
     Index advance_cluster;    ///< amount to add to move to the next 'cluster' position
     Index advance_tile;       ///< amount to add to move to the next 'tile'

     //
     // Methods
     //

     /// Derives every increment/advance quantity from the row stride (in bytes).
     /// Always returns Status::kSuccess.
     CUTLASS_HOST_DEVICE
     Status initialize(Index stride_) {

       stride = stride_;

       increment_row = stride * ThreadMap::Delta::kRow;

       // The pointer has already moved (Iterations::kRow - 1) row steps when a group ends, so
       // subtract them to obtain a net displacement of Delta::kGroup rows.
       increment_group = stride * ThreadMap::Delta::kGroup
         - stride * ThreadMap::Delta::kRow * (ThreadMap::Iterations::kRow - 1);

       // Likewise undo the group and row steps accumulated inside one cluster.
       increment_cluster = stride * ThreadMap::Delta::kCluster
         - stride * ThreadMap::Delta::kGroup * (ThreadMap::Iterations::kGroup - 1)
         - stride * ThreadMap::Delta::kRow * (ThreadMap::Iterations::kRow - 1);

       advance_row = stride * ThreadMap::Shape::kRow;

       advance_group = stride * (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * ThreadMap::Count::kRow;

       advance_cluster =
         stride *
         ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;

       advance_tile =
         stride *
         ThreadMap::Shape::kGroup *
         ThreadMap::Shape::kRow *
         ThreadMap::Shape::kCluster *
         ThreadMap::Shape::kTile;

       return Status::kSuccess;
     }

     /// Default constructor: all quantities are computed from a zero stride
     CUTLASS_HOST_DEVICE
     Params() {
       initialize(0);
     }

     /// Constructs from a layout, converting its stride (in elements) to bytes
     CUTLASS_HOST_DEVICE
     Params(Layout const &layout) {

       initialize(layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess);
     }
   };

   /// Mask object: one predicate per column iteration of a thread
   struct Mask {

     static int const kCount = ThreadMap::Iterations::kColumn;

     /// Predicate state
     bool predicates[kCount];

     //
     // Mask
     //

     /// Constructs a mask with every access enabled
     CUTLASS_HOST_DEVICE
     Mask() {
       enable();
     }

     /// Efficiently disables all accesses guarded by mask
     CUTLASS_HOST_DEVICE void clear() {
       CUTLASS_PRAGMA_UNROLL
       for (int i = 0; i < kCount; ++i) {
         predicates[i] = false;
       }
     }

     /// Efficiently enables all accesses guarded by mask.
     /// Host-device (not device-only) because the host-device constructor calls it.
     CUTLASS_HOST_DEVICE void enable() {
       CUTLASS_PRAGMA_UNROLL
       for (int i = 0; i < kCount; ++i) {
         predicates[i] = true;
       }
     }
   };

 private:

   //
   // Data members
   //

   /// Parameters structure containing the precomputed byte strides
   Params params_;

   /// Byte-level pointer to this thread's first access of the current tile position
   uint8_t *byte_pointer_;

   /// Column predicates
   Mask mask_;

   /// Number of rows in the extent passed to the constructor
   Index extent_row_;

   /// Row coordinate of this thread's first access at the current tile position
   Index thread_start_row_;

   /// Position counters advanced by operator++: [0] row, [1] group, [2] cluster
   int state_[3];

 public:

   //
   // Methods
   //

   /// Constructor.
   ///
   /// params             - precomputed strides
   /// pointer            - base pointer of the tensor
   /// extent             - bounds (rows, columns) against which accesses are predicated
   /// thread_idx         - thread index forwarded to ThreadMap::initial_offset()
   /// threadblock_offset - coordinate added to the thread's initial offset
   CUTLASS_DEVICE
   PredicatedTileIterator(
     Params const & params,
     Element *pointer,
     TensorCoord extent,
     int thread_idx,
     TensorCoord threadblock_offset = TensorCoord()
   ):
     params_(params) {

     TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;

     extent_row_ = extent.row();
     thread_start_row_ = thread_offset.row();

     // Initialize predicates
     CUTLASS_PRAGMA_UNROLL
     for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {

       mask_.predicates[c] = ((thread_offset.column()
         + ThreadMap::Delta::kColumn * c) < extent.column());
     }

     // Initialize pointer. The byte offset is computed in LongIndex so that the product
     // row * stride cannot overflow the 32-bit Index type for large tensors.
     byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) +
       LongIndex(thread_offset.row()) * LongIndex(params_.stride) +
       LongIndex(thread_offset.column()) * LongIndex(sizeof(AccessType)) / kElementsPerAccess;

     // Initialize internal state counter
     state_[0] = state_[1] = state_[2] = 0;
   }

   /// Adds a pointer offset in units of Element
   CUTLASS_HOST_DEVICE
   void add_pointer_offset(LongIndex pointer_offset) {
     byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
   }

   /// Loads a fragment from memory. Accesses whose row or column predicate is false are
   /// skipped, leaving the corresponding part of frag untouched.
   CUTLASS_DEVICE
   void load(Fragment &frag) {

     uint8_t *byte_pointer = byte_pointer_;
     AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

     CUTLASS_PRAGMA_UNROLL
     for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {

       CUTLASS_PRAGMA_UNROLL
       for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {

         CUTLASS_PRAGMA_UNROLL
         for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {

           // Linearized (cluster, group, row) index into the fragment
           int frag_row_idx =
             (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));

           // Row displacement of this access relative to thread_start_row_
           int row_offset = row * ThreadMap::Delta::kRow
             + group * ThreadMap::Delta::kGroup
             + cluster * ThreadMap::Delta::kCluster;

           bool row_guard = ((row_offset + thread_start_row_) < extent_row_);

           AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer);

           CUTLASS_PRAGMA_UNROLL
           for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {

             bool guard = row_guard && mask_.predicates[column];

             if (guard) {
               frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column] =
                 memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess];
             }
           }

           // The increments are skipped after the last iteration of each loop level
           if (row + 1 < ThreadMap::Iterations::kRow) {
             byte_pointer += params_.increment_row;
           }
         }

         if (group + 1 < ThreadMap::Iterations::kGroup) {
           byte_pointer += params_.increment_group;
         }
       }

       if (cluster + 1 < ThreadMap::Iterations::kCluster) {
         byte_pointer += params_.increment_cluster;
       }
     }
   }

   /// Stores a fragment to memory. Accesses whose row or column predicate is false are skipped.
   CUTLASS_DEVICE
   void store(Fragment const &frag) {
     uint8_t *byte_pointer = byte_pointer_;
     AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

     CUTLASS_PRAGMA_UNROLL
     for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {

       CUTLASS_PRAGMA_UNROLL
       for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {

         CUTLASS_PRAGMA_UNROLL
         for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {

           // Linearized (cluster, group, row) index into the fragment
           int frag_row_idx =
             (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));

           // Row displacement of this access relative to thread_start_row_
           int row_offset = row * ThreadMap::Delta::kRow
             + group * ThreadMap::Delta::kGroup
             + cluster * ThreadMap::Delta::kCluster;

           bool row_guard = ((row_offset + thread_start_row_) < extent_row_);

           AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer);

           CUTLASS_PRAGMA_UNROLL
           for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {

             bool guard = row_guard && mask_.predicates[column];

             if (guard) {

               memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess] =
                 frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column];
             }
           }

           // The increments are skipped after the last iteration of each loop level
           if (row + 1 < ThreadMap::Iterations::kRow) {
             byte_pointer += params_.increment_row;
           }
         }

         if (group + 1 < ThreadMap::Iterations::kGroup) {
           byte_pointer += params_.increment_group;
         }
       }

       if (cluster + 1 < ThreadMap::Iterations::kCluster) {
         byte_pointer += params_.increment_cluster;
       }
     }
   }

   /// Advances to the next position to load or store. The row counter always advances; the
   /// group, cluster and tile adjustments are applied as each lower-level counter wraps.
   CUTLASS_HOST_DEVICE
   PredicatedTileIterator &operator++() {

     ++state_[0];
     byte_pointer_ += params_.advance_row;
     thread_start_row_ += ThreadMap::Shape::kRow;

     if (state_[0] == ThreadMap::Count::kRow) {

       state_[0] = 0;
       ++state_[1];
       byte_pointer_ += params_.advance_group;

       // Mirrors Params::advance_group, expressed in rows instead of bytes
       thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
         ThreadMap::Shape::kRow * ThreadMap::Count::kRow;

       if (state_[1] == ThreadMap::Count::kGroup) {

         state_[1] = 0;
         ++state_[2];
         byte_pointer_ += params_.advance_cluster;

         // Mirrors Params::advance_cluster, expressed in rows instead of bytes
         thread_start_row_ += ThreadMap::Count::kGroup *
           ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;

         if (state_[2] == ThreadMap::Count::kCluster) {
           state_[2] = 0;
           byte_pointer_ += params_.advance_tile;
         }
       }
     }

     return *this;
   }

   /// Efficiently disables all accesses guarded by mask
   CUTLASS_DEVICE void clear_mask() {
     mask_.clear();
   }

   /// Efficiently enables all accesses guarded by mask
   CUTLASS_DEVICE void enable_mask() {
     mask_.enable();
   }

   /// Gets the mask: copies the iterator's current predicates into the out-parameter
   CUTLASS_DEVICE void get_mask(Mask &mask) {
     mask = mask_;
   }

   /// Sets the mask
   CUTLASS_DEVICE void set_mask(Mask const &mask) {
     mask_ = mask;
   }
 };

 /// Tile iterator used to load and store epilogue output tiles in column-major interleaved
 /// memory. Each load()/store() performs a single predicated access; operator++ or
 /// set_iteration_index() selects which access that is.
 ///
 /// Template parameters:
 ///   ThreadMap_   - thread-to-access mapping. As used below it must provide the nested types
 ///                  Iterations and Delta (with kContiguous / kStrided members), the constants
 ///                  kElementsPerAccess, kThreads and kWarpSize, and a static
 ///                  initial_offset(thread_idx) returning a PitchLinearCoord.
 ///   Element_     - data type of one tensor element.
 ///   InterleavedK - interleaving factor of layout::ColumnMajorInterleaved.
 template <
   typename ThreadMap_,
   typename Element_,
   int InterleavedK
 >
 class InterleavedPredicatedTileIterator {
 public:
   using ThreadMap = ThreadMap_;

   using Element = Element_;

   using Layout = layout::ColumnMajorInterleaved<InterleavedK>;
   using TensorRef = TensorRef<Element, Layout>;
   using ConstTensorRef = typename TensorRef::ConstTensorRef;

   using Index = typename Layout::Index;
   using LongIndex = typename Layout::LongIndex;
   using TensorCoord = layout::PitchLinearCoord;

   static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
   static int const kThreads = ThreadMap::kThreads;
   static int const kIterations = ThreadMap::Iterations::kCount;

   /// Fragment object: exactly one access worth of elements
   using Fragment = Array<Element, ThreadMap::kElementsPerAccess>;

   /// Memory access size
   using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;

   //
   // Parameters struct
   //

   /// Precomputed byte strides used by operator++()
   struct Params {

     //
     // Data members
     //

     Index stride;          ///< stride in bytes between columns

     Index advance_row;     ///< amount to add to move to the next 'row' position
     Index advance_column;  ///< amount to add to move to the next 'column' position

     //
     // Methods
     //

     /// Derives the advance quantities from the stride (in bytes).
     /// Always returns Status::kSuccess.
     CUTLASS_HOST_DEVICE
     Status initialize(Index stride_) {
       stride = stride_;

       advance_row =
           ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value / 8;

       advance_column =
           stride_ - ThreadMap::Iterations::kContiguous * kElementsPerAccess *
                         sizeof_bits<Element>::value * ThreadMap::kWarpSize / 8;

       return Status::kSuccess;
     }

     /// Default constructor: all quantities are computed from a zero stride
     CUTLASS_HOST_DEVICE
     Params() {
       initialize(0);
     }

     /// Constructs from a layout, converting its stride (in elements) to bytes
     CUTLASS_HOST_DEVICE
     Params(Layout const &layout) {

       initialize(layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess);
     }
   };

   /// Mask object: predicates over the contiguous iterations (at least 8 entries are stored)
   struct Mask {
     static int const kCount = (ThreadMap::Iterations::kContiguous < 8)
                                   ? 8
                                   : ThreadMap::Iterations::kContiguous;

     /// Predicate state
     bool predicates[kCount];

     //
     // Mask
     //

     /// Constructs a mask with every access enabled
     CUTLASS_HOST_DEVICE
     Mask() {
       enable();
     }

     /// Efficiently disables all accesses guarded by mask
     CUTLASS_HOST_DEVICE void clear() {
       CUTLASS_PRAGMA_UNROLL
       for (int i = 0; i < kCount; ++i) {
         predicates[i] = false;
       }
     }

     /// Efficiently enables all accesses guarded by mask.
     /// Host-device (not device-only) because the host-device constructor calls it.
     CUTLASS_HOST_DEVICE void enable() {
       CUTLASS_PRAGMA_UNROLL
       for (int i = 0; i < kCount; ++i) {
         predicates[i] = true;
       }
     }
   };

 private:

   //
   // Data members
   //

   /// Parameters structure containing the precomputed byte strides
   Params params_;

   /// Byte-level pointer to the access performed by the next load()/store()
   uint8_t *byte_pointer_;

   /// Predicates over the contiguous iterations
   Mask mask_;

   /// Strided extent passed to the constructor, divided by InterleavedK
   Index extent_col_;

   /// Strided coordinate of this thread's first access
   Index thread_start_col_;

   /// Current position along the contiguous iterations
   int iteration_contiguous_;

   /// Current position along the strided iterations
   int iteration_strided_;

 public:

   //
   // Methods
   //

   /// Constructor.
   ///
   /// params             - precomputed strides
   /// pointer            - base pointer of the tensor
   /// extent             - bounds against which accesses are predicated
   /// thread_idx         - thread index forwarded to ThreadMap::initial_offset()
   /// threadblock_offset - coordinate of the threadblock; its contiguous component is
   ///                      multiplied and its strided component divided by InterleavedK
   CUTLASS_DEVICE
   InterleavedPredicatedTileIterator(
     Params const & params,
     Element *pointer,
     TensorCoord extent,
     int thread_idx,
     TensorCoord threadblock_offset
   ):
     params_(params) {
     TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) +
                                 TensorCoord(threadblock_offset.contiguous() * InterleavedK,
                                  threadblock_offset.strided() / InterleavedK);

     extent_col_ = extent.strided() / InterleavedK;
     thread_start_col_ = thread_offset.strided();

     // Initialize predicates
     CUTLASS_PRAGMA_UNROLL
     for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
       mask_.predicates[c] =
           ((thread_offset.contiguous() + ThreadMap::Delta::kContiguous * c) <
            (extent.contiguous() * InterleavedK));
     }

     // Initialize pointer. The byte offset is computed in LongIndex so that the product
     // strided * stride cannot overflow the 32-bit Index type for large tensors.
     byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) +
       LongIndex(thread_offset.strided()) * LongIndex(params_.stride) +
       LongIndex(thread_offset.contiguous()) * LongIndex(sizeof(AccessType)) / kElementsPerAccess;

     // Initialize internal state counter
     iteration_contiguous_ = iteration_strided_ = 0;
   }

   /// Adds a pointer offset in units of Element
   CUTLASS_HOST_DEVICE
   void add_pointer_offset(LongIndex pointer_offset) {
     byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
   }

   /// Loads a fragment from memory. If the current access is predicated off, frag is left
   /// untouched.
   CUTLASS_DEVICE
   void load(Fragment &frag) {
     uint8_t *byte_pointer = byte_pointer_;
     AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
     AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer);

     // Strided displacement of the current access relative to thread_start_col_
     int col_offset = iteration_strided_ * ThreadMap::Delta::kStrided;

     bool col_guard = ((thread_start_col_ + col_offset) < extent_col_);

     bool guard = col_guard && mask_.predicates[iteration_contiguous_];

     if (guard) {
       *frag_ptr = *memory_pointer;
     }
   }

   /// Stores a fragment to memory. Nothing is written if the current access is predicated off.
   CUTLASS_DEVICE
   void store(Fragment const &frag) {
     uint8_t *byte_pointer = byte_pointer_;
     AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
     AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer);

     // Strided displacement of the current access relative to thread_start_col_
     int col_offset = iteration_strided_ * ThreadMap::Delta::kStrided;

     bool col_guard = ((thread_start_col_ + col_offset) < extent_col_);

     bool guard = col_guard && mask_.predicates[iteration_contiguous_];

     if (guard) {
       *memory_pointer = *frag_ptr;
     }
   }

   /// Overrides the internal iteration index. Only the counters used for predication change;
   /// the pointer is not moved.
   CUTLASS_HOST_DEVICE
   void set_iteration_index(int iteration) {
     iteration_contiguous_ = iteration % ThreadMap::Iterations::kContiguous;
     iteration_strided_ = iteration / ThreadMap::Iterations::kContiguous;
   }

   /// Advances to the next position to load or store
   CUTLASS_HOST_DEVICE
   InterleavedPredicatedTileIterator &operator++() {

     ++iteration_contiguous_;
     byte_pointer_ += params_.advance_row;

     if (iteration_contiguous_ == ThreadMap::Iterations::kContiguous) {

       iteration_contiguous_ = 0;
       ++iteration_strided_;
       byte_pointer_ += params_.advance_column;

       if (iteration_strided_ == ThreadMap::Iterations::kStrided) {
         iteration_strided_ = 0;
       }
     }

     return *this;
   }

   /// Efficiently disables all accesses guarded by mask
   CUTLASS_DEVICE void clear_mask() {
     mask_.clear();
   }

   /// Efficiently enables all accesses guarded by mask
   CUTLASS_DEVICE void enable_mask() {
     mask_.enable();
   }

   /// Gets the mask: copies the iterator's current predicates into the out-parameter
   CUTLASS_DEVICE void get_mask(Mask &mask) {
     mask = mask_;
   }

   /// Sets the mask
   CUTLASS_DEVICE void set_mask(Mask const &mask) {
     mask_ = mask;
   }
 };


 } // namespace threadblock
 } // namespace epilogue
 } // namespace cutlass

cutlass::epilogue::threadblock::PredicatedTileIterator::Mask::predicates
bool predicates[kCount]
Predicate state.
Definition: epilogue/threadblock/predicated_tile_iterator.h:175

cutlass::layout::RowMajor::LongIndex
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:62

cutlass::epilogue::threadblock::PredicatedTileIterator::kElementsPerAccess
static int const kElementsPerAccess
Definition: epilogue/threadblock/predicated_tile_iterator.h:80

cutlass::epilogue::threadblock::PredicatedTileIterator::Mask::enable
CUTLASS_DEVICE void enable()
Definition: epilogue/threadblock/predicated_tile_iterator.h:194

cutlass::MatrixCoord::column
CUTLASS_HOST_DEVICE Index const & column() const
Returns the column of the coordinate.
Definition: matrix_coord.h:85

cutlass::epilogue::threadblock::PredicatedTileIterator::Params::advance_row
Index advance_row
amount to add to move to the next 'row' position
Definition: epilogue/threadblock/predicated_tile_iterator.h:116

cutlass::epilogue::threadblock::PredicatedTileIterator::load
CUTLASS_DEVICE void load(Fragment &frag)
Loads a fragment from memory.
Definition: epilogue/threadblock/predicated_tile_iterator.h:279

cutlass::epilogue::threadblock::PredicatedTileIterator::Element
Element_ Element
Definition: epilogue/threadblock/predicated_tile_iterator.h:70

cutlass
Definition: aligned_buffer.h:35

cutlass::layout::PitchLinearCoord
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52

tensor_ref.h
Defines a structure containing strides, bounds, and a pointer to tensor data.

cutlass::epilogue::threadblock::PredicatedTileIterator::AccessType
AlignedArray< Element, ThreadMap::kElementsPerAccess > AccessType
Memory access size.
Definition: epilogue/threadblock/predicated_tile_iterator.h:98

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Params::initialize
CUTLASS_HOST_DEVICE Status initialize(Index stride_)
Definition: epilogue/threadblock/predicated_tile_iterator.h:496

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Mask::clear
CUTLASS_HOST_DEVICE void clear()
Efficiently disables all accesses guarded by mask.
Definition: epilogue/threadblock/predicated_tile_iterator.h:539

pitch_linear_thread_map.h
Templates implementing how threads are mapped to a given tile.

cutlass::epilogue::threadblock::PredicatedTileIterator::get_mask
CUTLASS_DEVICE void get_mask(Mask &mask)
Gets the mask.
Definition: epilogue/threadblock/predicated_tile_iterator.h:432

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Fragment
Array< Element, ThreadMap::kElementsPerAccess > Fragment
Fragment object.
Definition: epilogue/threadblock/predicated_tile_iterator.h:471

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::ThreadMap
ThreadMap_ ThreadMap
Definition: epilogue/threadblock/predicated_tile_iterator.h:454

cutlass::AlignedArray
Aligned array type.
Definition: array.h:511

cutlass::epilogue::threadblock::PredicatedTileIterator::Mask
Mask object.
Definition: epilogue/threadblock/predicated_tile_iterator.h:170

cutlass::MatrixCoord::row
CUTLASS_HOST_DEVICE Index const & row() const
Returns the row of the coordinate.
Definition: matrix_coord.h:77

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Mask::predicates
bool predicates[kCount]
Predicate state.
Definition: epilogue/threadblock/predicated_tile_iterator.h:528

cutlass::epilogue::threadblock::PredicatedTileIterator::ConstTensorRef
typename TensorRef::ConstTensorRef ConstTensorRef
Definition: epilogue/threadblock/predicated_tile_iterator.h:74

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Mask::Mask
CUTLASS_HOST_DEVICE Mask()
Constructs a mask with all accesses enabled.
Definition: epilogue/threadblock/predicated_tile_iterator.h:534

cutlass::layout::ColumnMajorInterleaved::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: layout/matrix.h:418

cutlass::layout::RowMajor::stride
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: layout/matrix.h:112

cutlass::epilogue::threadblock::PredicatedTileIterator::Fragment
Array< Element, ThreadMap::Iterations::kColumn *ThreadMap::Iterations::kRow *ThreadMap::Iterations::kGroup *ThreadMap::Iterations::kCluster *ThreadMap::kElementsPerAccess > Fragment
Fragment object.
Definition: epilogue/threadblock/predicated_tile_iterator.h:95

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Params
Definition: epilogue/threadblock/predicated_tile_iterator.h:480

cutlass::epilogue::threadblock::PredicatedTileIterator::store
CUTLASS_DEVICE void store(Fragment const &frag)
Stores a fragment to memory.
Definition: epilogue/threadblock/predicated_tile_iterator.h:333

cutlass::epilogue::threadblock::PredicatedTileIterator::LongIndex
typename Layout::LongIndex LongIndex
Definition: epilogue/threadblock/predicated_tile_iterator.h:77

cutlass::TensorRef< Element, Layout >::ConstTensorRef
TensorRef< typename platform::remove_const< Element >::type const, Layout > ConstTensorRef
TensorRef to constant data.
Definition: tensor_ref.h:179

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::load
CUTLASS_DEVICE void load(Fragment &frag)
Loads a fragment from memory.
Definition: epilogue/threadblock/predicated_tile_iterator.h:636

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::InterleavedPredicatedTileIterator
CUTLASS_DEVICE InterleavedPredicatedTileIterator(Params const &params, Element *pointer, TensorCoord extent, int thread_idx, TensorCoord threadblock_offset)
Constructor.
Definition: epilogue/threadblock/predicated_tile_iterator.h:596

array.h
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

CUTLASS_PRAGMA_UNROLL
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Index
typename Layout::Index Index
Definition: epilogue/threadblock/predicated_tile_iterator.h:462

cutlass::layout::RowMajor::Index
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:59

cutlass::epilogue::threadblock::PredicatedTileIterator::Params::advance_cluster
Index advance_cluster
amount to add to move to the next 'cluster' position
Definition: epilogue/threadblock/predicated_tile_iterator.h:118

matrix_shape.h
Defines a Shape template for matrix tiles.

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::store
CUTLASS_DEVICE void store(Fragment const &frag)
Stores a fragment to memory.
Definition: epilogue/threadblock/predicated_tile_iterator.h:654

cutlass::sizeof_bits
Defines the size of an element in bits.
Definition: numeric_types.h:42

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Params::advance_row
Index advance_row
amount to add to move to the next 'row' position
Definition: epilogue/threadblock/predicated_tile_iterator.h:488

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::clear_mask
CUTLASS_DEVICE void clear_mask()
Efficiently disables all accesses guarded by mask.
Definition: epilogue/threadblock/predicated_tile_iterator.h:699

cutlass::TensorRef< Element, Layout >

cutlass::epilogue::threadblock::PredicatedTileIterator::ThreadMap
ThreadMap_ ThreadMap
Definition: epilogue/threadblock/predicated_tile_iterator.h:67

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Mask
Mask object.
Definition: epilogue/threadblock/predicated_tile_iterator.h:522

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Params::Params
CUTLASS_HOST_DEVICE Params()
Definition: epilogue/threadblock/predicated_tile_iterator.h:510

cutlass::epilogue::threadblock::PredicatedTileIterator::PredicatedTileIterator
CUTLASS_DEVICE PredicatedTileIterator(Params const &params, Element *pointer, TensorCoord extent, int thread_idx, TensorCoord threadblock_offset=TensorCoord())
Constructor.
Definition: epilogue/threadblock/predicated_tile_iterator.h:240

CUTLASS_HOST_DEVICE
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89

numeric_types.h
Top-level include for all CUTLASS numeric types.

cutlass::layout::PitchLinearCoord::contiguous
CUTLASS_HOST_DEVICE Index const & contiguous() const
Returns the contiguous dimension.
Definition: pitch_linear.h:89

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator
Definition: epilogue/threadblock/predicated_tile_iterator.h:452

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::ConstTensorRef
typename TensorRef::ConstTensorRef ConstTensorRef
Definition: epilogue/threadblock/predicated_tile_iterator.h:460

static_assert
#define static_assert(__e, __m)
Definition: platform.h:153

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::set_iteration_index
CUTLASS_HOST_DEVICE void set_iteration_index(int iteration)
Overrides the internal iteration index.
Definition: epilogue/threadblock/predicated_tile_iterator.h:672

cutlass::epilogue::threadblock::PredicatedTileIterator::Params::stride
Index stride
stride in bytes between rows
Definition: epilogue/threadblock/predicated_tile_iterator.h:110

cutlass::epilogue::threadblock::PredicatedTileIterator::operator++
CUTLASS_HOST_DEVICE PredicatedTileIterator & operator++()
Advances to the next position to load or store.
Definition: epilogue/threadblock/predicated_tile_iterator.h:387

cutlass::epilogue::threadblock::PredicatedTileIterator::Params::Params
CUTLASS_HOST_DEVICE Params(Layout const &layout)
Definition: epilogue/threadblock/predicated_tile_iterator.h:163

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Params::stride
Index stride
stride in bytes between columns
Definition: epilogue/threadblock/predicated_tile_iterator.h:486

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Params::advance_column
Index advance_column
amount to add to move to the next 'column' position
Definition: epilogue/threadblock/predicated_tile_iterator.h:489

cutlass::epilogue::threadblock::PredicatedTileIterator::Params
Definition: epilogue/threadblock/predicated_tile_iterator.h:104

cutlass::epilogue::threadblock::PredicatedTileIterator::kIterations
static int const kIterations
Definition: epilogue/threadblock/predicated_tile_iterator.h:82

cutlass::epilogue::threadblock::PredicatedTileIterator::Params::advance_tile
Index advance_tile
amount to add to move to the next 'tile'
Definition: epilogue/threadblock/predicated_tile_iterator.h:119

output_tile_thread_map.h
Metaprogram for determining the mapping of output elements to threads for epilogue tiles...

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::operator++
CUTLASS_HOST_DEVICE InterleavedPredicatedTileIterator & operator++()
Advances to the next position to load or store.
Definition: epilogue/threadblock/predicated_tile_iterator.h:679

cutlass::epilogue::threadblock::PredicatedTileIterator::clear_mask
CUTLASS_DEVICE void clear_mask()
Efficiently disables all accesses guarded by mask.
Definition: epilogue/threadblock/predicated_tile_iterator.h:422

cutlass::epilogue::threadblock::PredicatedTileIterator::Params::increment_group
Index increment_group
increment quantity (in bytes) to advance when moving to the next group
Definition: epilogue/threadblock/predicated_tile_iterator.h:113

cutlass::layout::RowMajor
Mapping function for row-major matrices.
Definition: layout/matrix.h:50

cutlass::epilogue::threadblock::PredicatedTileIterator::Index
typename Layout::Index Index
Definition: epilogue/threadblock/predicated_tile_iterator.h:76

cutlass::epilogue::threadblock::PredicatedTileIterator
Definition: epilogue/threadblock/predicated_tile_iterator.h:65

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::LongIndex
typename Layout::LongIndex LongIndex
Definition: epilogue/threadblock/predicated_tile_iterator.h:463

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::set_mask
CUTLASS_DEVICE void set_mask(Mask const &mask)
Definition: epilogue/threadblock/predicated_tile_iterator.h:714

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Params::Params
CUTLASS_HOST_DEVICE Params(Layout const &layout)
Definition: epilogue/threadblock/predicated_tile_iterator.h:515

cutlass::epilogue::threadblock::PredicatedTileIterator::set_mask
CUTLASS_DEVICE void set_mask(Mask const &mask)
Definition: epilogue/threadblock/predicated_tile_iterator.h:437

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::Mask::enable
CUTLASS_DEVICE void enable()
Definition: epilogue/threadblock/predicated_tile_iterator.h:547

cutlass::epilogue::threadblock::PredicatedTileIterator::Params::advance_group
Index advance_group
amount to add to move to the next 'group' position
Definition: epilogue/threadblock/predicated_tile_iterator.h:117

matrix.h
Defines layout functions used by TensorRef and derived classes.

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::add_pointer_offset
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: epilogue/threadblock/predicated_tile_iterator.h:630

cutlass::epilogue::threadblock::PredicatedTileIterator::Params::initialize
CUTLASS_HOST_DEVICE Status initialize(Index stride_)
Definition: epilogue/threadblock/predicated_tile_iterator.h:126

cutlass::Status::kSuccess
Operation was successful.

cutlass::epilogue::threadblock::PredicatedTileIterator::add_pointer_offset
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: epilogue/threadblock/predicated_tile_iterator.h:273

cutlass::epilogue::threadblock::PredicatedTileIterator::Shape
typename ThreadMap::Shape Shape
Definition: epilogue/threadblock/predicated_tile_iterator.h:68

cutlass::layout::ColumnMajorInterleaved
Definition: layout/matrix.h:343

cutlass::epilogue::threadblock::PredicatedTileIterator::TensorCoord
MatrixCoord TensorCoord
Definition: epilogue/threadblock/predicated_tile_iterator.h:78

cutlass::epilogue::threadblock::PredicatedTileIterator::Params::Params
CUTLASS_HOST_DEVICE Params()
Definition: epilogue/threadblock/predicated_tile_iterator.h:158

cutlass::epilogue::threadblock::PredicatedTileIterator::enable_mask
CUTLASS_DEVICE void enable_mask()
Efficiently enables all accesses guarded by mask.
Definition: epilogue/threadblock/predicated_tile_iterator.h:427

cutlass::epilogue::threadblock::PredicatedTileIterator::Params::increment_row
Index increment_row
increment quantity (in bytes) to advance when moving between rows
Definition: epilogue/threadblock/predicated_tile_iterator.h:112

cutlass::epilogue::threadblock::PredicatedTileIterator::Params::increment_cluster
Index increment_cluster
increment quantity (in bytes) to advance when moving to the next cluster
Definition: epilogue/threadblock/predicated_tile_iterator.h:114

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::get_mask
CUTLASS_DEVICE void get_mask(Mask &mask)
Gets the mask.
Definition: epilogue/threadblock/predicated_tile_iterator.h:709

cutlass.h
Basic include for CUTLASS.

cutlass::MatrixCoord
Definition: matrix_coord.h:39

cutlass::epilogue::threadblock::PredicatedTileIterator::Mask::clear
CUTLASS_HOST_DEVICE void clear()
Efficiently disables all accesses guarded by mask.
Definition: epilogue/threadblock/predicated_tile_iterator.h:186

cutlass::layout::PitchLinearCoord::strided
CUTLASS_HOST_DEVICE Index const & strided() const
Returns the strided dimension.
Definition: pitch_linear.h:97

cutlass::Status
Status
Status code returned by CUTLASS operations.
Definition: cutlass.h:39

cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator::enable_mask
CUTLASS_DEVICE void enable_mask()
Efficiently enables all accesses guarded by mask.
Definition: epilogue/threadblock/predicated_tile_iterator.h:704

cutlass::epilogue::threadblock::PredicatedTileIterator::Mask::Mask
CUTLASS_HOST_DEVICE Mask()
Constructs a mask with all accesses enabled.
Definition: epilogue/threadblock/predicated_tile_iterator.h:181

cutlass::epilogue::threadblock::PredicatedTileIterator::kThreads
static int const kThreads
Definition: epilogue/threadblock/predicated_tile_iterator.h:81