namespace cutlass {
namespace transform {
namespace threadblock {

/// Regular tile iterator specialized for congruous TensorOp layouts over
/// pitch-linear (contiguous x strided) tiles.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator<
    Shape_, Element_,
    layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                          int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout =
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>;
  using ThreadMap = ThreadMap_;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  /// Internal details made public to facilitate introspection
  struct Detail {
    /// This iterator is specialized for an access size of 128 bits
    static int const kAccessSizeInBits = 128;

    static_assert(sizeof_bits<Element_>::value *
                          ThreadMap::kElementsPerAccess ==
                      kAccessSizeInBits,
                  "This iterator requires a policy whose access size is 128 bits");

    /// Number of base pointers held by the iterator
    static int const kPointerCount =
        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
  };

  /// Vectorized access type
  using AccessType = Array<Element, Layout::kElementsPerAccess>;
 private:

  /// Stride in units of AccessType
  Index stride_;

  /// Base pointers; two are kept when more than one strided iteration is
  /// needed, selected by the low bit of the strided iteration index
  AccessType *pointer_[Detail::kPointerCount];

  /// Byte-level offset applied on top of the base pointers
  Index byte_offset_;

  /// Iteration index in the contiguous dimension
  int iteration_contiguous_;

  /// Iteration index in the strided dimension
  int iteration_strided_;

 public:

  /// Constructs an iterator from a TensorRef and a participating thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : stride_(ref.stride(0) / Layout::kElementsPerAccess),
        byte_offset_(0) {

    layout::PitchLinearCoord thread_offset_base =
        ThreadMap::initial_offset(thread_id);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Detail::kPointerCount; ++i) {

      // Offset of this thread within the threadblock tile for pointer i
      // (units of elements)
      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
          thread_offset_base +
          layout::PitchLinearCoord{
              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};

      pointer_[i] = reinterpret_cast<AccessType *>(
          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
    }

    set_iteration_index(0);
  }

  /// Overrides the internal iteration index
  CUTLASS_HOST_DEVICE
  void set_iteration_index(int index) {
    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
  }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    byte_offset_ += pointer_offset * sizeof(Element);
  }
  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    AccessType *access_ptr = pointer_[iteration_strided_ & 1];

    int stride_idx = (iteration_strided_ & ~1);

    int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
                            ThreadMap::kElementsPerAccess;

    char *access_byte_ptr =
        reinterpret_cast<char *>(access_ptr + access_offset);

    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
  }
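  // Worked example (illustrative numbers, not taken from the original header):
  // assuming ThreadMap::Delta = (64, 8), stride_ = 8 AccessType columns, and
  // ThreadMap::kElementsPerAccess = 8, then iteration_strided_ = 3 and
  // iteration_contiguous_ = 1 select pointer_[3 & 1] = pointer_[1] and add
  //   access_offset = (3 & ~1) * 8 * 8 + 1 * 64 / 8 = 136 AccessType units.
  // The low bit of the strided index picks one of the two precomputed base
  // pointers; the remaining bits advance whole strided rows.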
  /// Advances to the next access within the tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iteration_contiguous_;

    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
      return *this;

    // Contiguous sweep complete; advance the strided iteration
    iteration_contiguous_ = 0;
    ++iteration_strided_;

    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
      return *this;
    }

    // Final strided iteration complete; wrap around to the start of the tile
    iteration_strided_ = 0;

    return *this;
  }
  /// Adds an offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
                       coord.strided() * Shape::kStrided * stride_ *
                           Layout::kElementsPerAccess);
  }
};
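// A minimal standalone sketch (an assumed helper, not part of the original
// header) of the element offset produced by add_tile_offset() above for a
// whole-tile coordinate (c, s): contiguous tile steps advance by
// Shape::kContiguous elements, strided tile steps advance by Shape::kStrided
// rows of stride_ accesses, converted to elements via Layout::kElementsPerAccess.
CUTLASS_HOST_DEVICE
long long congruous_tile_offset_sketch(int c, int s, int shape_contiguous,
                                       int shape_strided, int stride_in_accesses,
                                       int elements_per_access) {
  return (long long)(c) * shape_contiguous +
         (long long)(s) * shape_strided * stride_in_accesses *
             elements_per_access;
}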
/// Rank-2 adapter: tile iterator for column-major congruous TensorOp layouts,
/// implemented in terms of the pitch-linear iterator above.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator<
    Shape_, Element_,
    layout::ColumnMajorTensorOpMultiplicandCongruous<
        sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;
  using ThreadMap = ThreadMap_;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = MatrixCoord;

  /// Underlying pitch-linear iterator performing the address computation
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:

  /// Underlying pitch-linear iterator
  UnderlyingIterator iterator_;

 public:

  /// Constructs an iterator from a TensorRef and a participating thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds an offset in units of whole tiles; rows map to the contiguous
  /// dimension and columns to the strided dimension
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }
};
/// Rank-2 adapter: tile iterator for row-major congruous TensorOp layouts,
/// implemented in terms of the pitch-linear iterator above.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment
>
class RegularTileIterator<
    Shape_, Element_,
    layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                                  int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;
  using ThreadMap = ThreadMap_;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = MatrixCoord;

  /// Underlying pitch-linear iterator performing the address computation
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:

  /// Underlying pitch-linear iterator
  UnderlyingIterator iterator_;

 public:

  /// Constructs an iterator from a TensorRef and a participating thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds an offset in units of whole tiles; columns map to the contiguous
  /// dimension and rows to the strided dimension
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }
};
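// Sketch of the only substantive difference between the two rank-2 adapters
// above (illustrative structs and helpers, not part of this header): both
// forward to the pitch-linear iterator, but they map a MatrixCoord tile
// offset onto (contiguous, strided) coordinates differently.
struct MatrixCoordSketch { int row, column; };
struct PitchLinearCoordSketch { int contiguous, strided; };

// Column-major: rows are contiguous, columns are strided.
CUTLASS_HOST_DEVICE
PitchLinearCoordSketch map_column_major_sketch(MatrixCoordSketch c) {
  return {c.row, c.column};
}

// Row-major: columns are contiguous, rows are strided.
CUTLASS_HOST_DEVICE
PitchLinearCoordSketch map_row_major_sketch(MatrixCoordSketch c) {
  return {c.column, c.row};
}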
/// Regular tile iterator specialized for crosswise TensorOp layouts over
/// pitch-linear (contiguous x strided) tiles.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment,
  int Crosswise
>
class RegularTileIterator<
    Shape_, Element_,
    layout::TensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, Crosswise>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::TensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  using ThreadMap = ThreadMap_;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;
  static int const kCrosswise = Crosswise;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;

  /// Internal details made public to facilitate introspection
  struct Detail {
    /// This iterator is specialized for an access size of 128 bits
    static int const kAccessSizeInBits = 128;

    static_assert(sizeof_bits<Element_>::value *
                          ThreadMap::kElementsPerAccess ==
                      kAccessSizeInBits,
                  "This iterator requires a policy whose access size is 128 bits");

    /// Number of base pointers held by the iterator
    static int const kPointerCount =
        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
  };

  /// Vectorized access type
  using AccessType = Array<Element, Layout::kElementsPerAccess>;
 private:

  /// Total number of crosswise sections
  int sections_;

  /// Number of crosswise sections covered by one stage of the tile
  int sections_per_stage_;

  /// Stride in units of AccessType
  Index stride_;

  /// Base pointers; two are kept when more than one strided iteration is
  /// needed, selected by the low bit of the strided iteration index
  AccessType *pointer_[Detail::kPointerCount];

  /// Byte-level offset applied on top of the base pointers
  Index byte_offset_;

  /// Iteration index in the contiguous dimension
  int iteration_contiguous_;

  /// Iteration index in the strided dimension
  int iteration_strided_;

 public:

  /// Constructs an iterator from a TensorRef and a participating thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : sections_(ref.stride(0) / kCrosswise),
        sections_per_stage_(Shape::kContiguous / kCrosswise),
        stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
        byte_offset_(0) {

    layout::PitchLinearCoord thread_offset_base =
        ThreadMap::initial_offset(thread_id);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Detail::kPointerCount; ++i) {

      // Offset of this thread within the threadblock tile for pointer i
      // (units of elements)
      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
          thread_offset_base +
          layout::PitchLinearCoord{
              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};

      pointer_[i] = reinterpret_cast<AccessType *>(ref.data()) +
                    ref.offset(thread_offset_in_threadblock_tile) /
                        Layout::kElementsPerAccess;
    }

    set_iteration_index(0);
  }
  /// Overrides the internal iteration index
  CUTLASS_HOST_DEVICE
  void set_iteration_index(int index) {
    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
  }
  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    byte_offset_ += pointer_offset * sizeof(Element);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    AccessType *access_ptr = pointer_[iteration_strided_ & 1];

    int stride_idx = (iteration_strided_ & ~1);

    int access_offset =
        stride_idx * ThreadMap::Delta::kStrided * stride_ / Layout::kFactor +
        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
            ThreadMap::kElementsPerAccess;

    char *access_byte_ptr =
        reinterpret_cast<char *>(access_ptr + access_offset);

    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
  }
  /// Advances to the next access within the tile
  CUTLASS_HOST_DEVICE
  RegularTileIterator &operator++() {
    ++iteration_contiguous_;

    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
      return *this;

    // Contiguous sweep complete; advance the strided iteration
    iteration_contiguous_ = 0;
    ++iteration_strided_;

    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
      return *this;
    }

    // Final strided iteration complete; wrap around to the start of the tile
    iteration_strided_ = 0;

    return *this;
  }
  /// Adds an offset in units of whole tiles
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    add_pointer_offset(coord.contiguous() * sections_per_stage_ * stride_ *
                           ThreadMap::kElementsPerAccess / sections_ +
                       coord.strided() * Shape::kStrided * stride_ *
                           Layout::kElementsPerAccess);
  }
};
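// A minimal standalone sketch (an assumed helper, not part of the original
// header) of the element offset produced by the crosswise add_tile_offset()
// above. Unlike the congruous case, a contiguous tile step covers only
// sections_per_stage_ of the sections_ crosswise sections, so that term is
// scaled by sections_per_stage / sections.
CUTLASS_HOST_DEVICE
long long crosswise_tile_offset_sketch(int c, int s, int sections_per_stage,
                                       int sections, int stride_in_accesses,
                                       int shape_strided,
                                       int threadmap_elements_per_access,
                                       int layout_elements_per_access) {
  return (long long)(c) * sections_per_stage * stride_in_accesses *
             threadmap_elements_per_access / sections +
         (long long)(s) * shape_strided * stride_in_accesses *
             layout_elements_per_access;
}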
/// Rank-2 adapter: tile iterator for column-major crosswise TensorOp layouts,
/// implemented in terms of the pitch-linear crosswise iterator above.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment,
  int Crosswise
>
class RegularTileIterator<
    Shape_, Element_,
    layout::ColumnMajorTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, Crosswise>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  using ThreadMap = ThreadMap_;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;
  static int const kCrosswise = Crosswise;

  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = MatrixCoord;

  /// Underlying pitch-linear iterator performing the address computation
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            kCrosswise>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:

  /// Underlying pitch-linear iterator
  UnderlyingIterator iterator_;

 public:

  /// Constructs an iterator from a TensorRef and a participating thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds an offset in units of whole tiles; rows map to the contiguous
  /// dimension and columns to the strided dimension
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }
};
/// Rank-2 adapter: tile iterator for row-major crosswise TensorOp layouts,
/// implemented in terms of the pitch-linear crosswise iterator above.
template <
  typename Shape_,
  typename Element_,
  int AdvanceRank,
  typename ThreadMap_,
  int Alignment,
  int Crosswise
>
class RegularTileIterator<
    Shape_, Element_,
    layout::RowMajorTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, Crosswise>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:

  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  using ThreadMap = ThreadMap_;

  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;
  static int const kCrosswise = Crosswise;

  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = MatrixCoord;

  /// Underlying pitch-linear iterator performing the address computation
  using UnderlyingIterator = RegularTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            kCrosswise>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:

  /// Underlying pitch-linear iterator
  UnderlyingIterator iterator_;

 public:

  /// Constructs an iterator from a TensorRef and a participating thread id
  CUTLASS_HOST_DEVICE
  RegularTileIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds an offset in units of whole tiles; columns map to the contiguous
  /// dimension and rows to the strided dimension
  CUTLASS_HOST_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }
};

} // namespace threadblock
} // namespace transform
} // namespace cutlass
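// A hedged sketch of how a threadblock-scoped consumer might drive one of the
// iterators above to store a tile into shared memory. Everything named here is
// an assumption for illustration; only get() and operator++ are taken from the
// listing above, and the real fragment type and access count come from the
// surrounding CUTLASS definitions.
template <typename Iterator, typename AccessType>
CUTLASS_HOST_DEVICE void store_tile_sketch(Iterator &iter,
                                           AccessType const *frag,
                                           int access_count) {
  CUTLASS_PRAGMA_UNROLL
  for (int i = 0; i < access_count; ++i) {
    *iter.get() = frag[i];  // one 128-bit vectorized access per step
    ++iter;                 // contiguous iterations first, then strided
  }
}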