58 namespace threadblock {
64 template <
typename Shape,
typename Element,
typename Layout,
int AdvanceRank,
65 typename ThreadMap,
typename AccessType>
72 template <
typename Shape_,
typename Element_,
int AdvanceRank,
73 typename ThreadMap_,
typename AccessType_>
75 AdvanceRank, ThreadMap_, AccessType_> {
78 AdvanceRank == 0 || AdvanceRank == 1,
79 "Specialization for pitch-linear iterator may along advance along the " 80 "contiguous(rank=0) or strided(rank=1) dimension.");
83 using Element = Element_;
85 static int const kAdvanceRank = AdvanceRank;
86 using ThreadMap = ThreadMap_;
99 static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
101 static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
102 "Vectors implied by the thread map must be divisible by the access type.");
104 static int const kPredicatesPerByte = 4;
105 static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
107 static int const kPredicateCount = ThreadMap::Iterations::kCount * kAccessesPerVector;
110 static int const kPredicateByteCount =
111 (kPredicateCount + kPredicatesPerByte - 1) / kPredicatesPerByte;
112 static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
114 static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
116 static_assert(kPredicateWordCount <= 4,
"Too many predicates.");
119 using Mask = Array<uint32_t, kPredicateWordCount>;
143 Params(): stride_(0), inc_strided_(0), inc_next_(0), inc_advance_(0) { }
147 Params(Layout
const &layout) : stride_(layout.stride(0)) {
148 inc_strided_ = (stride_ * ThreadMap::Delta::kStrided) *
160 inc_next_ = inc_advance_ - (ThreadMap::Iterations::kStrided - 1) *
161 ThreadMap::Delta::kStrided * stride_ *
168 using BytePointer =
char *;
176 Params
const ¶ms_;
179 BytePointer pointer_;
182 uint32_t predicates_[kPredicateWordCount];
194 bool is_residue_tile_;
197 int iteration_vector_;
200 int iteration_contiguous_;
203 int iteration_strided_;
208 void compute_predicates_(
212 bool is_steady_state =
false) {
215 for (
int i = 0; i < kPredicateWordCount; ++i) {
219 for (
int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
221 int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
223 int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
225 int c = access_residual / kAccessesPerVector;
226 int v = access_residual % kAccessesPerVector;
228 TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
229 s * ThreadMap::Delta::kStrided);
231 TensorCoord coord = thread_offset_ + iteration_coord;
235 if (is_steady_state) {
236 if (kAdvanceRank == 0) {
237 guard = (coord.strided() < extent.strided());
239 guard = (coord.contiguous() < extent.contiguous());
242 guard = (coord.strided() < extent.strided() &&
243 coord.contiguous() < extent.contiguous());
246 int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
248 int word_idx = pred_idx / kPredicatesPerWord;
249 int residual = pred_idx % kPredicatesPerWord;
250 int byte_idx = residual / kPredicatesPerByte;
251 int bit_idx = residual % kPredicatesPerByte;
253 predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
265 Params
const ¶ms,
275 pointer_(reinterpret_cast<BytePointer>(
278 is_residue_tile_(true) {
283 Index residue_size = (extent_[kAdvanceRank] % Shape::kStrided);
285 residue_size = Shape::kStrided;
288 residue_offset_ =
make_Coord(0, residue_size);
290 extent_.contiguous(),
291 min(threadblock_offset.strided() + residue_offset_.strided(), extent_.strided())
296 Index residue_size = (extent_[kAdvanceRank] % Shape::kContiguous);
298 residue_size = Shape::kContiguous;
300 residue_offset_ =
make_Coord(residue_size, 0);
302 min(extent_.contiguous(), threadblock_offset.contiguous() + residue_offset_.contiguous()),
308 thread_offset_ = threadblock_offset + ThreadMap::initial_offset(thread_id);
311 Layout layout(params_.stride_);
312 add_pointer_offset(layout(thread_offset_));
314 compute_predicates_(residue_extent,
false);
316 set_iteration_index(0);
323 Params
const ¶ms,
337 iteration_vector_ = index % kAccessesPerVector;
338 int residual_access = index / kAccessesPerVector;
340 iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
341 iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
355 if (is_residue_tile_) {
357 thread_offset_ += residue_offset_;
359 Layout layout(params_.stride_);
360 add_pointer_offset(layout(residue_offset_));
362 compute_predicates_(extent_,
true);
365 pointer_ += params_.inc_advance_ * (tile_offset.strided() - 1);
366 pointer_ += Shape::kContiguous * tile_offset.contiguous();
368 pointer_ += params_.inc_advance_ * (tile_offset.contiguous() - 1);
369 pointer_ += Shape::kStrided * tile_offset.strided();
373 pointer_ += params_.inc_advance_ * tile_offset.strided();
374 pointer_ += Shape::kContiguous * tile_offset.contiguous();
376 pointer_ += params_.inc_advance_ * tile_offset.contiguous();
377 pointer_ += Shape::kStrided * tile_offset.strided();
380 is_residue_tile_ =
false;
396 if (iteration_vector_ < kAccessesPerVector) {
400 iteration_vector_ = 0;
401 ++iteration_contiguous_;
403 if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
409 iteration_contiguous_ = 0;
410 ++iteration_strided_;
412 if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
413 pointer_ += params_.inc_strided_;
419 iteration_strided_ = 0;
422 pointer_ += params_.inc_next_;
427 pointer_ -= params_.inc_advance_;
444 for (
int i = 0; i < kPredicateWordCount; ++i) {
454 for (
int i = 0; i < kPredicateWordCount; ++i) {
455 predicates_[i] = 0xffffffff;
463 for (
int i = 0; i < kPredicateWordCount; ++i) {
464 predicates_[i] = mask[i];
473 for (
int i = 0; i < kPredicateWordCount; ++i) {
474 mask[i] = predicates_[i];
484 iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
486 int word_idx = pred_idx / kPredicatesPerWord;
487 int residual = pred_idx % kPredicatesPerWord;
488 int byte_idx = residual / kPredicatesPerByte;
489 int bit_idx = residual % kPredicatesPerByte;
491 bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
508 template <
typename Shape_,
typename Element_,
int AdvanceRank,
509 typename ThreadMap_,
typename AccessType_>
511 AdvanceRank, ThreadMap_, AccessType_> {
514 AdvanceRank == 0 || AdvanceRank == 1,
515 "Specialization for pitch-linear iterator may along advance along the " 516 "contiguous(rank=0) or strided(rank=1) dimension.");
519 using Element = Element_;
521 static int const kAdvanceRank = AdvanceRank;
522 using ThreadMap = ThreadMap_;
540 using Mask =
typename UnderlyingIterator::Mask;
542 static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
550 typename UnderlyingIterator::Params params_;
561 : params_(layout::PitchLinear(layout.stride(0))){};
578 Params
const ¶ms,
587 : iterator_(params.params_, pointer,
588 layout::PitchLinearCoord(extent.row(), extent.column()),
590 layout::PitchLinearCoord(threadblock_offset.row(),
591 threadblock_offset.column())) {}
596 Params
const ¶ms,
611 iterator_.add_pointer_offset(pointer_offset);
618 iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
624 return reinterpret_cast<AccessType *
>(iterator_.get());
671 return iterator_.valid();
684 template <
typename Shape_,
typename Element_,
int AdvanceRank,
685 typename ThreadMap_,
typename AccessType_>
687 AdvanceRank, ThreadMap_, AccessType_> {
690 AdvanceRank == 0 || AdvanceRank == 1,
691 "Specialization for pitch-linear iterator may along advance along the " 692 "contiguous(rank=0) or strided(rank=1) dimension.");
695 using Element = Element_;
697 static int const kAdvanceRank = AdvanceRank;
698 using ThreadMap = ThreadMap_;
715 static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
718 using Mask =
typename UnderlyingIterator::Mask;
726 typename UnderlyingIterator::Params params_;
737 : params_(layout::PitchLinear(layout.stride(0))){};
754 Params
const ¶ms,
763 : iterator_(params.params_, pointer,
764 layout::PitchLinearCoord(extent.column(), extent.row()),
766 layout::PitchLinearCoord(threadblock_offset.column(),
767 threadblock_offset.row())) {}
772 Params
const ¶ms,
787 iterator_.add_pointer_offset(pointer_offset);
794 iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
800 return reinterpret_cast<AccessType *
>(iterator_.get());
847 return iterator_.valid();
862 template <
typename Shape_,
typename Element_,
int AdvanceRank,
863 typename ThreadMap_,
typename AccessType_,
int InterleavedK>
865 layout::ColumnMajorInterleaved<InterleavedK>,
866 AdvanceRank, ThreadMap_, AccessType_> {
869 AdvanceRank == 0 || AdvanceRank == 1,
870 "Specialization for pitch-linear iterator may along advance along the " 871 "contiguous(rank=0) or strided(rank=1) dimension.");
874 using Element = Element_;
875 static int const kInterleavedK = InterleavedK;
877 static int const kAdvanceRank = AdvanceRank;
878 using ThreadMap = ThreadMap_;
893 Shape::kColumn / kInterleavedK>,
897 static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
900 using Mask =
typename UnderlyingIterator::Mask;
908 typename UnderlyingIterator::Params params_;
917 : params_(layout::PitchLinear(layout.stride(0))) {}
934 Params
const ¶ms,
943 : iterator_(params.params_, pointer,
944 layout::PitchLinearCoord(extent.row() * kInterleavedK,
945 extent.column() / kInterleavedK),
947 layout::PitchLinearCoord(
948 threadblock_offset.row() * kInterleavedK,
949 threadblock_offset.column() / kInterleavedK)) {}
954 Params
const ¶ms,
969 iterator_.add_pointer_offset(pointer_offset);
976 iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
982 return reinterpret_cast<AccessType *
>(iterator_.get());
1028 bool valid() {
return iterator_.valid(); }
1041 template <
typename Shape_,
typename Element_,
int AdvanceRank,
1042 typename ThreadMap_,
typename AccessType_,
int InterleavedK>
1044 layout::RowMajorInterleaved<InterleavedK>,
1045 AdvanceRank, ThreadMap_, AccessType_> {
1048 AdvanceRank == 0 || AdvanceRank == 1,
1049 "Specialization for pitch-linear iterator may along advance along the " 1050 "contiguous(rank=0) or strided(rank=1) dimension.");
1053 using Element = Element_;
1054 static int const kInterleavedK = InterleavedK;
1056 static int const kAdvanceRank = AdvanceRank;
1057 using ThreadMap = ThreadMap_;
1072 Shape::kRow / kInterleavedK>,
1077 static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
1080 using Mask =
typename UnderlyingIterator::Mask;
1088 typename UnderlyingIterator::Params params_;
1097 : params_(layout::PitchLinear(layout.stride(0))) {}
1114 Params
const ¶ms,
1123 : iterator_(params.params_, pointer,
1124 layout::PitchLinearCoord(extent.column() * kInterleavedK,
1125 extent.row() / kInterleavedK),
1127 layout::PitchLinearCoord(
1128 threadblock_offset.column() * kInterleavedK,
1129 threadblock_offset.row() / kInterleavedK)) {}
1134 Params
const ¶ms,
1149 iterator_.add_pointer_offset(pointer_offset);
1156 iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
1162 return reinterpret_cast<AccessType *
>(iterator_.get());
1208 bool valid() {
return iterator_.valid(); }
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:62
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:355
Definition: aligned_buffer.h:35
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52
Defines a structure containing strides, bounds, and a pointer to tensor data.
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:249
Mapping function for pitch-linear memory.
Definition: pitch_linear.h:163
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:352
A Coord is a coordinate of arbitrary rank into a tensor or matrix.
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 1-element coordinate.
Definition: coord.h:387
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:154
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:246
Defines a structure containing strides and a pointer to tensor data.
Mapping function for column-major matrices.
Definition: layout/matrix.h:142
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe to use in a union.
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
Defines container classes and iterators for managing a statically sized vector of boolean predicates...
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:59
CUTLASS_HOST_DEVICE half_t & operator++(half_t &lhs)
Definition: half.h:694
int64_t LongIndex
Long index type used for offsets.
Definition: pitch_linear.h:175
Defines a Shape template for matrix tiles.
Defines the size of an element in bits.
Definition: numeric_types.h:42
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
int32_t Index
Index type used for coordinates.
Definition: pitch_linear.h:172
Mapping function for row-major matrices.
Definition: layout/matrix.h:50
Defines layout functions used by TensorRef and derived classes.
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
Definition: layout/matrix.h:343
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:151
Basic include for CUTLASS.
Definition: matrix_coord.h:39
Definition: layout/matrix.h:237