cub/thread/thread_load.cuh

File members: cub/thread/thread_load.cuh

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/type_traits>

CUB_NAMESPACE_BEGIN

//-----------------------------------------------------------------------------
// Tags and constants
//-----------------------------------------------------------------------------

enum CacheLoadModifier
{
  LOAD_DEFAULT,
  LOAD_CA,
  LOAD_CG,
  LOAD_CS,
  LOAD_CV,
  LOAD_LDG,
  LOAD_VOLATILE,
};

template <CacheLoadModifier MODIFIER, typename RandomAccessIterator>
_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t<RandomAccessIterator> ThreadLoad(RandomAccessIterator itr);

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document

template <int COUNT, int MAX>
struct IterateThreadLoad
{
  template <CacheLoadModifier MODIFIER, typename T>
  CCCL_DEPRECATED_BECAUSE("Use UnrolledThreadLoad() instead")
  static _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T const* ptr, T* vals)
  {
    vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
    IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
  }

  template <typename RandomAccessIterator, typename T>
  CCCL_DEPRECATED_BECAUSE("Use UnrolledCopy() instead")
  static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(RandomAccessIterator itr, T* vals)
  {
    vals[COUNT] = itr[COUNT];
    IterateThreadLoad<COUNT + 1, MAX>::Dereference(itr, vals);
  }
};

template <int MAX>
struct IterateThreadLoad<MAX, MAX>
{
  template <CacheLoadModifier MODIFIER, typename T>
  static _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T const* /*ptr*/, T* /*vals*/)
  {}

  template <typename RandomAccessIterator, typename T>
  static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(RandomAccessIterator /*itr*/, T* /*vals*/)
  {}
};

namespace detail
{
template <CacheLoadModifier MODIFIER, typename T, int... Is>
_CCCL_DEVICE _CCCL_FORCEINLINE void
UnrolledThreadLoadImpl(T const* src, T* dst, ::cuda::std::integer_sequence<int, Is...>)
{
  // TODO(bgruber): replace by fold over comma in C++17
  int dummy[] = {(dst[Is] = ThreadLoad<MODIFIER>(src + Is), 0)...};
  (void) dummy;
}

template <typename RandomAccessIterator, typename T, int... Is>
_CCCL_DEVICE _CCCL_FORCEINLINE void
UnrolledCopyImpl(RandomAccessIterator src, T* dst, ::cuda::std::integer_sequence<int, Is...>)
{
  // TODO(bgruber): replace by fold over comma in C++17
  int dummy[] = {(dst[Is] = src[Is], 0)...};
  (void) dummy;
}
} // namespace detail

template <int Count, CacheLoadModifier MODIFIER, typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE void UnrolledThreadLoad(T const* src, T* dst)
{
  detail::UnrolledThreadLoadImpl<MODIFIER>(src, dst, ::cuda::std::make_integer_sequence<int, Count>{});
}

template <int Count, typename RandomAccessIterator, typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE void UnrolledCopy(RandomAccessIterator src, T* dst)
{
  detail::UnrolledCopyImpl(src, dst, ::cuda::std::make_integer_sequence<int, Count>{});
}

#  define _CUB_LOAD_16(cub_modifier, ptx_modifier)                                                               \
    template <>                                                                                                  \
    _CCCL_DEVICE _CCCL_FORCEINLINE uint4 ThreadLoad<cub_modifier, uint4 const*>(uint4 const* ptr)                \
    {                                                                                                            \
      uint4 retval;                                                                                              \
      asm volatile("ld." #ptx_modifier ".v4.u32 {%0, %1, %2, %3}, [%4];"                                         \
                   : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w)                              \
                   : "l"(ptr));                                                                                  \
      return retval;                                                                                             \
    }                                                                                                            \
    template <>                                                                                                  \
    _CCCL_DEVICE _CCCL_FORCEINLINE ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const*>(ulonglong2 const* ptr) \
    {                                                                                                            \
      ulonglong2 retval;                                                                                         \
      asm volatile("ld." #ptx_modifier ".v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr));   \
      return retval;                                                                                             \
    }

#  define _CUB_LOAD_8(cub_modifier, ptx_modifier)                                                              \
    template <>                                                                                                \
    _CCCL_DEVICE _CCCL_FORCEINLINE ushort4 ThreadLoad<cub_modifier, ushort4 const*>(ushort4 const* ptr)        \
    {                                                                                                          \
      ushort4 retval;                                                                                          \
      asm volatile("ld." #ptx_modifier ".v4.u16 {%0, %1, %2, %3}, [%4];"                                       \
                   : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w)                            \
                   : "l"(ptr));                                                                                \
      return retval;                                                                                           \
    }                                                                                                          \
    template <>                                                                                                \
    _CCCL_DEVICE _CCCL_FORCEINLINE uint2 ThreadLoad<cub_modifier, uint2 const*>(uint2 const* ptr)              \
    {                                                                                                          \
      uint2 retval;                                                                                            \
      asm volatile("ld." #ptx_modifier ".v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr)); \
      return retval;                                                                                           \
    }                                                                                                          \
    template <>                                                                                                \
    _CCCL_DEVICE _CCCL_FORCEINLINE unsigned long long ThreadLoad<cub_modifier, unsigned long long const*>(     \
      unsigned long long const* ptr)                                                                           \
    {                                                                                                          \
      unsigned long long retval;                                                                               \
      asm volatile("ld." #ptx_modifier ".u64 %0, [%1];" : "=l"(retval) : "l"(ptr));                            \
      return retval;                                                                                           \
    }

#  define _CUB_LOAD_4(cub_modifier, ptx_modifier)                                                                      \
    template <>                                                                                                        \
    _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int ThreadLoad<cub_modifier, unsigned int const*>(unsigned int const* ptr) \
    {                                                                                                                  \
      unsigned int retval;                                                                                             \
      asm volatile("ld." #ptx_modifier ".u32 %0, [%1];" : "=r"(retval) : "l"(ptr));                                    \
      return retval;                                                                                                   \
    }

#  define _CUB_LOAD_2(cub_modifier, ptx_modifier)                                                  \
    template <>                                                                                    \
    _CCCL_DEVICE _CCCL_FORCEINLINE unsigned short ThreadLoad<cub_modifier, unsigned short const*>( \
      unsigned short const* ptr)                                                                   \
    {                                                                                              \
      unsigned short retval;                                                                       \
      asm volatile("ld." #ptx_modifier ".u16 %0, [%1];" : "=h"(retval) : "l"(ptr));                \
      return retval;                                                                               \
    }

#  define _CUB_LOAD_1(cub_modifier, ptx_modifier)                                                \
    template <>                                                                                  \
    _CCCL_DEVICE _CCCL_FORCEINLINE unsigned char ThreadLoad<cub_modifier, unsigned char const*>( \
      unsigned char const* ptr)                                                                  \
    {                                                                                            \
      unsigned short retval;                                                                     \
      asm volatile(                                                                              \
        "{"                                                                                      \
        "   .reg .u8 datum;"                                                                     \
        "    ld." #ptx_modifier ".u8 datum, [%1];"                                               \
        "    cvt.u16.u8 %0, datum;"                                                              \
        "}"                                                                                      \
        : "=h"(retval)                                                                           \
        : "l"(ptr));                                                                             \
      return (unsigned char) retval;                                                             \
    }

#  define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \
    _CUB_LOAD_16(cub_modifier, ptx_modifier)        \
    _CUB_LOAD_8(cub_modifier, ptx_modifier)         \
    _CUB_LOAD_4(cub_modifier, ptx_modifier)         \
    _CUB_LOAD_2(cub_modifier, ptx_modifier)         \
    _CUB_LOAD_1(cub_modifier, ptx_modifier)

_CUB_LOAD_ALL(LOAD_CA, ca)
_CUB_LOAD_ALL(LOAD_CG, cg)
_CUB_LOAD_ALL(LOAD_CS, cs)
_CUB_LOAD_ALL(LOAD_CV, cv)
_CUB_LOAD_ALL(LOAD_LDG, global.nc)

// Macro cleanup
#  undef _CUB_LOAD_ALL
#  undef _CUB_LOAD_1
#  undef _CUB_LOAD_2
#  undef _CUB_LOAD_4
#  undef _CUB_LOAD_8
#  undef _CUB_LOAD_16

template <typename RandomAccessIterator>
_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t<RandomAccessIterator>
ThreadLoad(RandomAccessIterator itr, Int2Type<LOAD_DEFAULT> /*modifier*/, Int2Type<false> /*is_pointer*/)
{
  return *itr;
}

template <typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE T
ThreadLoad(const T* ptr, Int2Type<LOAD_DEFAULT> /*modifier*/, Int2Type<true> /*is_pointer*/)
{
  return *ptr;
}

template <typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(const T* ptr, Int2Type<true> /*is_primitive*/)
{
  T retval = *reinterpret_cast<const volatile T*>(ptr);
  return retval;
}

template <typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(const T* ptr, Int2Type<false> /*is_primitive*/)
{
  // Word type for memcpying
  using VolatileWord              = typename UnitWord<T>::VolatileWord;
  constexpr int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);

  T retval;
  VolatileWord* words = reinterpret_cast<VolatileWord*>(&retval);
  UnrolledCopy<VOLATILE_MULTIPLE>(reinterpret_cast<const volatile VolatileWord*>(ptr), words);
  return retval;
}

template <typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE T
ThreadLoad(const T* ptr, Int2Type<LOAD_VOLATILE> /*modifier*/, Int2Type<true> /*is_pointer*/)
{
  return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
}

template <typename T, int MODIFIER>
_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T const* ptr, Int2Type<MODIFIER> /*modifier*/, Int2Type<true> /*is_pointer*/)
{
  using DeviceWord              = typename UnitWord<T>::DeviceWord;
  constexpr int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);

  DeviceWord words[DEVICE_MULTIPLE];
  UnrolledThreadLoad<DEVICE_MULTIPLE, CacheLoadModifier(MODIFIER)>(reinterpret_cast<const DeviceWord*>(ptr), words);
  return *reinterpret_cast<T*>(words);
}

template <CacheLoadModifier MODIFIER, typename RandomAccessIterator>
_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t<RandomAccessIterator> ThreadLoad(RandomAccessIterator itr)
{
  return ThreadLoad(itr, Int2Type<MODIFIER>(), Int2Type<::cuda::std::is_pointer<RandomAccessIterator>::value>());
}

#endif // _CCCL_DOXYGEN_INVOKED

CUB_NAMESPACE_END