CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
regular_tile_iterator_tensor_op.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Templates implementing storing of tiles from pitch-linear rank=2 tensors.
*/

#pragma once

#include "cutlass/cutlass.h"

#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h"

////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace transform {
namespace threadblock {

////////////////////////////////////////////////////////////////////////////////
/// Tile iterator specialized for congruous arrangements for TensorOps.
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
    Shape_, Element_,
    layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                          int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
                "Specialization for pitch-linear iterator may advance along the "
                "contiguous (rank=0) or strided (rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout =
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element))>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Internal details made public to facilitate introspection
  struct Detail {

    /// This iterator is specialized for an access size that is 128 bits in length.
    static int const kAccessSizeInBits = 128;

    static_assert(
        sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess == kAccessSizeInBits,
        "This iterator requires a policy whose access size is 128 bits.");
  };

 private:

  /// Element type per access
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

 public:

  /// Fragment object to be loaded or stored
  using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;

  /// Underlying iterator used to compute the addresses
  using TileAccessIterator = RegularTileAccessIterator<Shape, Element, Layout,
                                                       kAdvanceRank, ThreadMap>;

 private:

  //
  // Data members
  //

  /// Iterator that computes the address of each access
  TileAccessIterator address_iterator_;

 public:

  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
                      int thread_id   ///< ID of each participating thread
                      )
      : address_iterator_(ref, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    address_iterator_.add_pointer_offset(pointer_offset);
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    address_iterator_.add_tile_offset({0, 1});
    return *this;
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator operator++(int) {
    RegularTileIterator prev(*this);
    this->operator++();

    return prev;
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    address_iterator_.add_tile_offset(coord);
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    address_iterator_.set_iteration_index(0);
    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
        frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset);
        ++address_iterator_;
      }
    }
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    address_iterator_.set_iteration_index(0);
    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
        *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
        ++address_iterator_;
      }
    }
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }
};
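
// Usage sketch (illustrative only, not part of the original header). Assuming
// half_t elements and a warp-raked pitch-linear thread map, a threadblock
// might stage a 128x8 tile through shared memory roughly as follows. The
// shapes and the 32-thread / 8-elements-per-access map are assumptions chosen
// to satisfy the 128-bit access static_assert above, not fixed API:
//
//   using Shape     = cutlass::layout::PitchLinearShape<128, 8>;
//   using Element   = cutlass::half_t;  // 16 bits -> 8 elements per 128-bit access
//   using Layout    = cutlass::layout::TensorOpMultiplicandCongruous<16, 64>;
//   using ThreadMap = cutlass::transform::PitchLinearWarpRakedThreadMap<
//       Shape, 32, cutlass::layout::PitchLinearShape<8, 4>, 8>;
//
//   using Iterator = cutlass::transform::threadblock::RegularTileIterator<
//       Shape, Element, Layout, /*AdvanceRank=*/1, ThreadMap>;
//
//   __shared__ Element smem[Shape::kContiguous * Shape::kStrided];
//   Iterator iter({smem, Layout::packed({Shape::kContiguous, Shape::kStrided})},
//                 threadIdx.x);
//   typename Iterator::Fragment frag;
//   iter.load(frag);   // each thread gathers its 128-bit accesses
//   ++iter;            // advance to the next tile along the strided rank
//   iter.store(frag);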

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for congruous arrangements for TensorOps.
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
    Shape_, Element_,
    layout::ColumnMajorTensorOpMultiplicandCongruous<
        sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
                "Specialization for column-major iterator may advance along the "
                "columns (rank=0) or rows (rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element))>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Underlying iterator type
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element))>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

 public:

  /// Fragment object to be loaded or stored
  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:

  /// Underlying iterator
  UnderlyingIterator iterator_;

 public:

  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
                      int thread_id   ///< ID of each participating thread
                      )
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator operator++(int) {
    RegularTileIterator prev(*this);
    ++iterator_;

    return prev;
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }
};

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for congruous arrangements for TensorOps.
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileIterator<
    Shape_, Element_,
    layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                                  int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
                "Specialization for row-major iterator may advance along the "
                "columns (rank=0) or rows (rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element))>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Underlying iterator type
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element))>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

 public:

  /// Fragment object to be loaded or stored
  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:

  /// Underlying iterator
  UnderlyingIterator iterator_;

 public:

  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
                      int thread_id   ///< ID of each participating thread
                      )
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator operator++(int) {
    RegularTileIterator prev(*this);
    ++iterator_;

    return prev;
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load(Fragment &frag) {
    load_with_pointer_offset(frag, 0);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store(Fragment const &frag) {
    store_with_pointer_offset(frag, 0);
  }
};
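
// Adapter note with a small sketch (illustrative, not from the original
// source): the row-major and column-major specializations own no addressing
// logic of their own; they re-shape the tile into pitch-linear space and
// forward every call to UnderlyingIterator. For a row-major tile the
// contiguous rank is the column, so tile offsets swap coordinates:
//
//   // row-major (row, column) -> pitch-linear (contiguous, strided)
//   iter.add_tile_offset({/*row=*/1, /*column=*/2});
//   // ...is forwarded as iterator_.add_tile_offset({2, 1});
//
// The column-major adapter passes (row, column) through unchanged, since its
// rows are already the contiguous rank.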

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for crosswise arrangements for TensorOps.
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment, int Crosswise>
class RegularTileIterator<Shape_, Element_,
                          layout::TensorOpMultiplicandCrosswise<
                              sizeof_bits<Element_>::value, Crosswise>,
                          AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous (rank=0) or strided (rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout =
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            Crosswise>;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Internal details made public to facilitate introspection
  struct Detail {
    /// This iterator is specialized for an access size that is 128 bits in length.
    static int const kAccessSizeInBits = 128;

    static_assert(sizeof_bits<Element_>::value * ThreadMap::kElementsPerAccess ==
                      kAccessSizeInBits,
                  "This iterator requires a policy whose access size is 128 bits.");
  };

 private:
  /// Element type per access
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

 public:
  /// Fragment object to be loaded or stored
  using Fragment =
      Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;

  /// Underlying iterator used to compute the addresses
  using TileAccessIterator = RegularTileAccessIterator<Shape, Element, Layout,
                                                       kAdvanceRank, ThreadMap>;

 private:
  //
  // Data members
  //

  /// Iterator that computes the address of each access
  TileAccessIterator address_iterator_;

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
                      int thread_id   ///< ID of each participating thread
                      )
      : address_iterator_(ref, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    address_iterator_.add_pointer_offset(pointer_offset);
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    address_iterator_.add_tile_offset({1, 0});
    return *this;
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator operator++(int) {
    RegularTileIterator prev(*this);
    this->operator++();

    return prev;
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    address_iterator_.add_tile_offset(coord);
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    address_iterator_.set_iteration_index(0);
    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
        frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset);
        ++address_iterator_;
      }
    }
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    address_iterator_.set_iteration_index(0);
    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
      CUTLASS_PRAGMA_UNROLL
      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
        int access_idx = c + s * ThreadMap::Iterations::kContiguous;
        *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
        ++address_iterator_;
      }
    }
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};
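
// Note with a small sketch (illustrative, not from the original source): the
// congruous and crosswise specializations differ only in which rank
// operator++ advances. Congruous tiles step along the strided rank,
// crosswise tiles along the contiguous rank:
//
//   ++congruous_iter;   // address_iterator_.add_tile_offset({0, 1})
//   ++crosswise_iter;   // address_iterator_.add_tile_offset({1, 0})
//
// Everything else — the 128-bit access requirement, the Fragment shape, and
// the load/store loops over ThreadMap::Iterations — is identical between the
// two specializations.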

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for crosswise arrangements for TensorOps.
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment, int Crosswise>
class RegularTileIterator<Shape_, Element_,
                          layout::ColumnMajorTensorOpMultiplicandCrosswise<
                              sizeof_bits<Element_>::value, Crosswise>,
                          AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns (rank=0) or rows (rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Underlying iterator type
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            Crosswise>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

 public:
  /// Fragment object to be loaded or stored
  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:
  /// Underlying iterator
  UnderlyingIterator iterator_;

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
                      int thread_id   ///< ID of each participating thread
                      )
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator operator++(int) {
    RegularTileIterator prev(*this);
    ++iterator_;

    return prev;
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};

////////////////////////////////////////////////////////////////////////////////

/// Tile iterator specialized for crosswise arrangements for TensorOps.
///
/// Satisfies: ForwardTileIteratorConcept |
///            ReadableContiguousTileIteratorConcept |
///            WriteableContiguousTileIteratorConcept
///
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment, int Crosswise>
class RegularTileIterator<Shape_, Element_,
                          layout::RowMajorTensorOpMultiplicandCrosswise<
                              sizeof_bits<Element_>::value, Crosswise>,
                          AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns (rank=0) or rows (rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;

  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  using ThreadMap = ThreadMap_;

  /// Underlying iterator type
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            Crosswise>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

 public:
  /// Fragment object to be loaded or stored
  using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;

 private:
  /// Underlying iterator
  UnderlyingIterator iterator_;

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref,  ///< Pointer to start of tensor
                      int thread_id   ///< ID of each participating thread
                      )
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Adds a tile offset
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iterator_;
    return *this;
  }

  /// Advances to the next tile in memory
  CUTLASS_HOST_DEVICE
  RegularTileIterator operator++(int) {
    RegularTileIterator prev(*this);
    ++iterator_;

    return prev;
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory
  CUTLASS_DEVICE
  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
    iterator_.store_with_pointer_offset(frag, pointer_offset);
  }

  /// Stores a fragment to memory
  CUTLASS_DEVICE
  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
};
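
// Selection sketch (illustrative, not from the original source): the matrix
// layout tag alone picks one of the six specializations above. For example, a
// row-major 64x64 half_t tile in a crosswise arrangement (Crosswise = 32 is an
// assumed value, typically tied to the operand's K-block size) might be
// declared as follows, with ThreadMap supplied by the caller:
//
//   using SmemLayout = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
//       cutlass::sizeof_bits<cutlass::half_t>::value, 32>;
//   using SmemIterator = cutlass::transform::threadblock::RegularTileIterator<
//       cutlass::MatrixShape<64, 64>, cutlass::half_t, SmemLayout,
//       /*AdvanceRank=*/0, ThreadMap>;
//
// The alias resolves to the row-major adapter above, which in turn drives the
// pitch-linear crosswise iterator.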

////////////////////////////////////////////////////////////////////////////////

}  // namespace threadblock
}  // namespace transform
}  // namespace cutlass

////////////////////////////////////////////////////////////////////////////////