CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
shared_load_iterator.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
33 #pragma once
34 
35 #include "cutlass/cutlass.h"
36 #include "cutlass/numeric_types.h"
37 #include "cutlass/array.h"
38 #include "cutlass/layout/matrix.h"
39 #include "cutlass/matrix_shape.h"
40 #include "cutlass/tensor_ref.h"
41 
43 
45 
46 namespace cutlass {
47 namespace epilogue {
48 namespace threadblock {
49 
51 
56 template <
57  typename ThreadMap_,
58  typename Element_,
59  int MaxAlignment = ThreadMap_::kElementsPerAccess * sizeof_bits<Element_>::value / 8
60 >
62 public:
63  using ThreadMap = ThreadMap_;
64  using Shape = typename ThreadMap::Shape;
65 
66  using Element = Element_;
67 
71 
72  using Index = typename Layout::Index;
73  using LongIndex = typename Layout::LongIndex;
75 
76  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
77 
78  static int const kMinAlignment = ThreadMap_::kElementsPerAccess * sizeof_bits<Element_>::value / 8;
79 
80  static int const kAlignment = (MaxAlignment < kMinAlignment ? MaxAlignment : kMinAlignment);
81 
82  static int const kThreads = ThreadMap::kThreads;
83 
85  using Fragment = Array<
86  Element,
87  ThreadMap::Iterations::kColumn *
88  ThreadMap::Iterations::kRow *
89  ThreadMap::Iterations::kGroup *
90  ThreadMap::Iterations::kCluster *
91  ThreadMap::kElementsPerAccess>;
92 
94  using AccessType = AlignedArray<
95  Element,
96  ThreadMap::kElementsPerAccess,
97  kAlignment>;
98 
99 private:
100 
101  //
102  // Data members
103  //
104 
106  uint8_t *byte_pointer_;
107 
109  int stride_;
110 
111 public:
112 
113  //
114  // Methods
115  //
116 
118  CUTLASS_DEVICE
120  TensorRef ref,
121  int thread_idx
122  ):
123  byte_pointer_(reinterpret_cast<uint8_t *>(ref.data())),
124  stride_((ref.stride(0) * sizeof_bits<Element>::value) / 8) {
125 
126  TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);
127 
128  // Initialize pointer
129  byte_pointer_ +=
130  thread_offset.row() * stride_ +
131  thread_offset.column() * sizeof(AccessType) / kElementsPerAccess;
132 
133  int byte_offset = thread_offset.row() * stride_ +
134  thread_offset.column() * sizeof(AccessType) / kElementsPerAccess;
135  }
136 
139  void add_pointer_offset(LongIndex pointer_offset) {
140  byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
141  }
142 
143  CUTLASS_DEVICE
144  void add_tile_offset(TensorCoord const &offset) {
145  add_pointer_offset(offset.row() * stride_ / (sizeof_bits<Element>::value / 8) + offset.column() * Shape::kColumn);
146  }
147 
149  CUTLASS_DEVICE
150  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
151 
152  AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
153 
155  for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
156 
158  for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
159 
161  for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
162 
163  uint8_t const *byte_pointer = byte_pointer_ +
164  row * ThreadMap::Delta::kRow * stride_ +
165  group * ThreadMap::Delta::kGroup* stride_ +
166  cluster * ThreadMap::Delta::kCluster * stride_ +
167  pointer_offset * sizeof_bits<Element>::value / 8;
168 
169  int frag_row_idx =
170  (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
171 
172  AccessType const *memory_pointer = reinterpret_cast<AccessType const *>(byte_pointer);
173 
175  for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
176 
177  int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
178 
179  frag_ptr[frag_idx] =
180  memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess];
181  }
182  }
183  }
184  }
185  }
186 
188  CUTLASS_DEVICE
189  void load(Fragment &frag) {
190 
191  load_with_pointer_offset(frag, 0);
192  }
193 };
194 
196 
197 } // namespace threadblock
198 } // namespace epilogue
199 } // namespace cutlass
200 
int64_t LongIndex
Long index type used for offsets.
Definition: layout/matrix.h:62
CUTLASS_HOST_DEVICE Index const & column() const
Returns the column of the coordinate.
Definition: matrix_coord.h:85
Array< Element, ThreadMap::Iterations::kColumn *ThreadMap::Iterations::kRow *ThreadMap::Iterations::kGroup *ThreadMap::Iterations::kCluster *ThreadMap::kElementsPerAccess > Fragment
Fragment object.
Definition: shared_load_iterator.h:91
Definition: aligned_buffer.h:35
static int const value
Definition: numeric_types.h:43
Defines a structure containing strides, bounds, and a pointer to tensor data.
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset)
Loads a fragment from memory.
Definition: shared_load_iterator.h:150
static int const kThreads
Definition: shared_load_iterator.h:82
Aligned array type.
Definition: array.h:511
CUTLASS_DEVICE SharedLoadIterator(TensorRef ref, int thread_idx)
Constructor.
Definition: shared_load_iterator.h:119
CUTLASS_HOST_DEVICE Index const & row() const
Returns the row of the coordinate.
Definition: matrix_coord.h:77
static int const kMinAlignment
Definition: shared_load_iterator.h:78
typename TensorRef::ConstTensorRef ConstTensorRef
Definition: shared_load_iterator.h:70
TensorRef< typename platform::remove_const< Element >::type const, Layout > ConstTensorRef
TensorRef to constant data.
Definition: tensor_ref.h:179
ThreadMap_ ThreadMap
Definition: shared_load_iterator.h:63
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe to use in a union.
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
int32_t Index
Index type used for coordinates.
Definition: layout/matrix.h:59
static int const kAlignment
Definition: shared_load_iterator.h:80
Defines a Shape template for matrix tiles.
Defines the size of an element in bits.
Definition: numeric_types.h:42
AlignedArray< Element, ThreadMap::kElementsPerAccess, kAlignment > AccessType
Memory access size.
Definition: shared_load_iterator.h:97
typename Layout::Index Index
Definition: shared_load_iterator.h:72
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
Top-level include for all CUTLASS numeric types.
Metaprogram for determining the mapping of output elements to threads for epilogue tiles...
Mapping function for row-major matrices.
Definition: layout/matrix.h:50
CUTLASS_DEVICE void load(Fragment &frag)
Loads a fragment.
Definition: shared_load_iterator.h:189
Element_ Element
Definition: shared_load_iterator.h:66
Defines layout functions used by TensorRef and derived classes.
typename ThreadMap::Shape Shape
Definition: shared_load_iterator.h:64
CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)
Adds a pointer offset in units of Element.
Definition: shared_load_iterator.h:139
static int const kElementsPerAccess
Definition: shared_load_iterator.h:76
Definition: shared_load_iterator.h:61
typename Layout::LongIndex LongIndex
Definition: shared_load_iterator.h:73
Basic include for CUTLASS.
Definition: matrix_coord.h:39
CUTLASS_DEVICE void add_tile_offset(TensorCoord const &offset)
Definition: shared_load_iterator.h:144