CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
tensor.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
35 #pragma once
36 #include "assert.h"
37 #include "cutlass/cutlass.h"
38 #include "cutlass/fast_math.h"
39 #include "cutlass/layout/matrix.h"
40 #include "cutlass/coord.h"
41 #include "cutlass/tensor_coord.h"
42 
43 namespace cutlass {
44 namespace layout {
45 
47 //
48 // Defines data layouts of various tensor formats usable by TensorRef and other classes.
49 //
51 
53 class TensorNHWC {
54 public:
56  static int const kRank = 4;
57 
59  static int const kStrideRank = 3;
60 
62  using Index = int32_t;
63 
65  using LongIndex = int64_t;
66 
69 
72 
73 private:
74  //
75  // Data members
76  //
77 
79  Stride stride_;
80 
81 public:
82  //
83  // Methods
84  //
85 
88  TensorNHWC(Stride const &stride = Stride(0)): stride_(stride) { }
89 
92  TensorNHWC(typename Stride::Index c, typename Stride::Index wc, typename Stride::Index hwc): stride_(make_Coord(c, wc, hwc)) { }
93 
96  static TensorNHWC packed(TensorCoord const &extent) {
97  return TensorNHWC(
98  make_Coord(
99  extent.c(),
100  extent.w() * extent.c(),
101  extent.h() * extent.w() * extent.c()
102  )
103  );
104  }
105 
108  LongIndex operator()(TensorCoord const &coord) const {
109  return coord.c() +
110  LongIndex(stride_[0] * coord.w()) +
111  LongIndex(stride_[1] * coord.h()) +
112  LongIndex(stride_[2] * coord.n());
113  }
114 
117  explicit operator RowMajor() {
118  return RowMajor(stride_[0]);
119  }
120 
124 
125  int n = 0, h = 0, w = 0, c = 0;
126 
127  #if defined(__CUDA_ARCH__)
128  int tmp = 0;
129  c = int(index % static_cast<int>(stride_[0]));
130 
131  unsigned int hw_mul, hw_shr, w_mul, w_shr, c_mul, c_shr;
132 
133  find_divisor(hw_mul, hw_shr, stride_[2]);
134  find_divisor(w_mul, w_shr, stride_[1]);
135  find_divisor(c_mul, c_shr, stride_[0]);
136 
137  fast_divmod(n, tmp, index, int(stride_[2]), hw_mul, hw_shr);
138  fast_divmod(h, w, tmp, int(stride_[1]), w_mul, w_shr);
139  fast_divmod(w, tmp, w, int(stride_[0]), c_mul, c_shr);
140  #else
141 
142  n = int(index / (stride_[0] * stride_[1] * stride_[2]));
143  LongIndex residual = index % (stride_[0] * stride_[1] * stride_[2]);
144 
145  h = int(residual / (stride_[0] * stride_[1]));
146  residual = (residual % (stride_[0] * stride_[1]));
147 
148  w = int(residual / stride_[0]);
149  c = int(residual % stride_[0]);
150 
151  #endif
152  return TensorCoord(n, h, w, c);
153  }
154 
157  Stride stride() const {
158  return stride_;
159  }
160 
164  return stride_;
165  }
166 
169  LongIndex capacity(TensorCoord const &extent) const {
170  // it does not make sense if the extent is larger than stride
171  // and we could not rely on the capacity calculation in such cases
172  // we could move this checkers to debug code only
173  if ((extent.c() > stride_[0])
174  || (extent.w() * stride_[0] > stride_[1])
175  || (extent.h() * stride_[1] > stride_[2])) {
176  assert(0);
177  }
178  return extent.n() * stride_[2];
179  }
180 };
181 
182 
184 
186 class TensorNCHW {
187 public:
189  static int const kRank = 4;
190 
192  static int const kStrideRank = 3;
193 
195  using Index = int32_t;
196 
198  using LongIndex = int64_t;
199 
202 
205 
206 private:
207  //
208  // Data members
209  //
210 
212  Stride stride_;
213 
214 public:
215  //
216  // Methods
217  //
218 
221  TensorNCHW(Stride const &stride = Stride(0)): stride_(stride) { }
222 
225  static TensorNCHW packed(TensorCoord const &extent) {
226  return TensorNCHW(
227  make_Coord(
228  extent.w(),
229  extent.w() * extent.h(),
230  extent.h() * extent.w() * extent.c()
231  )
232  );
233  }
234 
237  LongIndex operator()(TensorCoord const &coord) const {
238  return coord.w() +
239  LongIndex(stride_[0] * coord.h()) +
240  LongIndex(stride_[1] * coord.c()) +
241  LongIndex(stride_[2] * coord.n());
242  }
243 
246  Stride stride() const {
247  return stride_;
248  }
249 
253  return stride_;
254  }
255 
258  LongIndex capacity(TensorCoord const &extent) const {
259  return extent.n() * stride_[2];
260  }
261 };
262 
264 
266 template <int Interleave>
268 public:
269 
271  static int const kInterleave = Interleave;
272 
274  static int const kRank = 4;
275 
277  static int const kStrideRank = 3;
278 
280  using Index = int32_t;
281 
283  using LongIndex = int64_t;
284 
287 
290 
291 private:
292  //
293  // Data members
294  //
295 
297  Stride stride_;
298 
299 public:
300  //
301  // Methods
302  //
303 
306  TensorNCxHWx(Stride const &stride = Stride(0)): stride_(stride) { }
307 
310  static TensorNCxHWx packed(TensorCoord const &extent) {
311  return TensorNCxHWx(
312  make_Coord(
313  kInterleave * extent.w(),
314  kInterleave * extent.w() * extent.h(),
315  extent.h() * extent.w() * extent.c()
316  )
317  );
318  }
319 
322  LongIndex operator()(TensorCoord const &coord) const {
323 
324  Index c_minor = (coord.c() % kInterleave);
325  Index c_major = (coord.c() / kInterleave);
326 
327  return c_minor +
328  LongIndex(kInterleave * coord.w()) +
329  LongIndex(stride_[0] * coord.h()) +
330  LongIndex(stride_[1] * c_major) +
331  LongIndex(stride_[2] * coord.n());
332  }
333 
336  Stride stride() const {
337  return stride_;
338  }
339 
343  return stride_;
344  }
345 
348  LongIndex capacity(TensorCoord const &extent) const {
349  return extent.n() * stride_[2];
350  }
351 };
352 
354 
356 template <int Interleave>
358 public:
359 
361  static int const kInterleave = Interleave;
362 
364  static int const kRank = 4;
365 
367  static int const kStrideRank = 3;
368 
370  using Index = int32_t;
371 
373  using LongIndex = int64_t;
374 
377 
380 
381 private:
382  //
383  // Data members
384  //
385 
387  Stride stride_;
388 
389 public:
390  //
391  // Methods
392  //
393 
396  TensorCxRSKx(Stride const &stride = Stride(0)): stride_(stride) { }
397 
400  static TensorCxRSKx packed(TensorCoord const &extent) {
401  return TensorCxRSKx(
402  make_Coord(
403  kInterleave * extent.n(),
404  kInterleave * extent.n() * extent.w(),
405  kInterleave * extent.n() * extent.w() * extent.h()
406  )
407  );
408  }
409 
412  LongIndex operator()(TensorCoord const &coord) const {
413 
414  Index c_minor = (coord.c() % kInterleave);
415  Index c_major = (coord.c() / kInterleave);
416 
417  return c_minor +
418  LongIndex(kInterleave * coord.n()) +
419  LongIndex(stride_[0] * coord.w()) +
420  LongIndex(stride_[1] * coord.h()) +
421  LongIndex(stride_[2] * c_major);
422  }
423 
426  Stride stride() const {
427  return stride_;
428  }
429 
433  return stride_;
434  }
435 
438  LongIndex capacity(TensorCoord const &extent) const {
439  return (extent.c() / kInterleave * stride_[2]);
440  }
441 };
442 
444 
445 } // namespace layout
446 } // namespace cutlass
Coord< kStrideRank > Stride
Stride vector.
Definition: tensor.h:71
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor.h:246
Defines a canonical 4D coordinate used by tensor operations.
Definition: tensor_coord.h:38
CUTLASS_HOST_DEVICE TensorCxRSKx(Stride const &stride=Stride(0))
Constructor.
Definition: tensor.h:396
Definition: aligned_buffer.h:35
CUTLASS_HOST_DEVICE void fast_divmod(int &quo, int &rem, int src, int div, unsigned int mul, unsigned int shr)
Definition: fast_math.h:176
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size...
Definition: tensor.h:348
CUTLASS_HOST_DEVICE TensorNCxHWx(Stride const &stride=Stride(0))
Constructor.
Definition: tensor.h:306
static int const kStrideRank
Rank of stride vector.
Definition: tensor.h:59
static CUTLASS_HOST_DEVICE TensorNCxHWx packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor.h:310
A Coord is a coordinate of arbitrary rank into a tensor or matrix.
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 1-element coordinate.
Definition: coord.h:387
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Returns the offset of a coordinate in linear memory.
Definition: tensor.h:412
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor.h:336
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size...
Definition: tensor.h:169
int32_t Index
Index type used for coordinates.
Definition: tensor.h:62
Tensor4DCoord TensorCoord
Logical coordinate (n, h, w, c)
Definition: tensor.h:68
Mapping function for 4-D NC/xHWx tensors.
Definition: tensor.h:267
int64_t LongIndex
Long index type used for offsets.
Definition: tensor.h:198
CUTLASS_HOST_DEVICE Index const & w() const
Returns the column of the coordinate.
Definition: tensor_coord.h:95
CUTLASS_HOST_DEVICE TensorNHWC(Stride const &stride=Stride(0))
Constructor.
Definition: tensor.h:88
int Index
Index type used to store elements.
Definition: coord.h:55
static CUTLASS_HOST_DEVICE TensorNCHW packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor.h:225
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor.h:163
CUTLASS_HOST_DEVICE TensorNCHW(Stride const &stride=Stride(0))
Constructor.
Definition: tensor.h:221
static int const kRank
Logical rank of tensor.
Definition: tensor.h:56
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor.h:342
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size...
Definition: tensor.h:258
CUTLASS_HOST_DEVICE Index const & c() const
Returns the channel of the coordinate.
Definition: tensor_coord.h:103
int32_t Index
Index type used for coordinates.
Definition: tensor.h:370
CUTLASS_HOST_DEVICE TensorNHWC(typename Stride::Index c, typename Stride::Index wc, typename Stride::Index hwc)
Constructor.
Definition: tensor.h:92
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor.h:432
Defines a canonical coordinate for rank=4 tensors offering named indices.
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor.h:426
Mapping function for 4-D CxRSKx tensors.
Definition: tensor.h:357
int64_t LongIndex
Long index type used for offsets.
Definition: tensor.h:65
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
static CUTLASS_HOST_DEVICE TensorNHWC packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed NHWC tensor.
Definition: tensor.h:96
Mapping function for 4-D NCHW tensors.
Definition: tensor.h:186
CUTLASS_HOST_DEVICE Stride stride() const
Returns the stride of the layout.
Definition: tensor.h:157
CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const
Compute the number of contiguous elements needed to store a tensor with the given size...
Definition: tensor.h:438
static CUTLASS_HOST_DEVICE TensorCxRSKx packed(TensorCoord const &extent)
Helper returns a layout to a tightly packed tensor.
Definition: tensor.h:400
int32_t Index
Index type used for coordinates.
Definition: tensor.h:195
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Returns the offset of a coordinate in linear memory.
Definition: tensor.h:237
Mapping function for row-major matrices.
Definition: layout/matrix.h:50
CUTLASS_HOST_DEVICE Index const & n() const
Returns the batch of the coordinate.
Definition: tensor_coord.h:79
CUTLASS_HOST_DEVICE void find_divisor(unsigned int &mul, unsigned int &shr, unsigned int denom)
Definition: fast_math.h:159
int64_t LongIndex
Long index type used for offsets.
Definition: tensor.h:373
CUTLASS_HOST_DEVICE Stride & stride()
Returns the stride of the layout.
Definition: tensor.h:252
Defines layout functions used by TensorRef and derived classes.
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Returns the offset of a coordinate in linear memory.
Definition: tensor.h:322
Math utilities.
int64_t LongIndex
Long index type used for offsets.
Definition: tensor.h:283
CUTLASS_HOST_DEVICE Index const & h() const
Returns the row of the coordinate.
Definition: tensor_coord.h:87
Mapping function for 4-D NHWC tensors.
Definition: tensor.h:53
int32_t Index
Index type used for coordinates.
Definition: tensor.h:280
CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex index) const
Returns the logical coordinate (n, h, w, c) from a given offset in linear memory. ...
Definition: tensor.h:123
Basic include for CUTLASS.
CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const
Returns the offset of a coordinate (n, h, w, c) in linear memory.
Definition: tensor.h:108