cub/thread/thread_load.cuh
File members: cub/thread/thread_load.cuh
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include <cub/config.cuh>
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>
#include <cuda/std/type_traits>
CUB_NAMESPACE_BEGIN
//-----------------------------------------------------------------------------
// Tags and constants
//-----------------------------------------------------------------------------
enum CacheLoadModifier
{
LOAD_DEFAULT,
LOAD_CA,
LOAD_CG,
LOAD_CS,
LOAD_CV,
LOAD_LDG,
LOAD_VOLATILE,
};
template <CacheLoadModifier MODIFIER, typename RandomAccessIterator>
_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t<RandomAccessIterator> ThreadLoad(RandomAccessIterator itr);
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
template <int COUNT, int MAX>
struct IterateThreadLoad
{
template <CacheLoadModifier MODIFIER, typename T>
CCCL_DEPRECATED_BECAUSE("Use UnrolledThreadLoad() instead")
static _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T const* ptr, T* vals)
{
vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
}
template <typename RandomAccessIterator, typename T>
CCCL_DEPRECATED_BECAUSE("Use UnrolledCopy() instead")
static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(RandomAccessIterator itr, T* vals)
{
vals[COUNT] = itr[COUNT];
IterateThreadLoad<COUNT + 1, MAX>::Dereference(itr, vals);
}
};
template <int MAX>
struct IterateThreadLoad<MAX, MAX>
{
template <CacheLoadModifier MODIFIER, typename T>
static _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T const* /*ptr*/, T* /*vals*/)
{}
template <typename RandomAccessIterator, typename T>
static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(RandomAccessIterator /*itr*/, T* /*vals*/)
{}
};
namespace detail
{
template <CacheLoadModifier MODIFIER, typename T, int... Is>
_CCCL_DEVICE _CCCL_FORCEINLINE void
UnrolledThreadLoadImpl(T const* src, T* dst, ::cuda::std::integer_sequence<int, Is...>)
{
// TODO(bgruber): replace by fold over comma in C++17
int dummy[] = {(dst[Is] = ThreadLoad<MODIFIER>(src + Is), 0)...};
(void) dummy;
}
template <typename RandomAccessIterator, typename T, int... Is>
_CCCL_DEVICE _CCCL_FORCEINLINE void
UnrolledCopyImpl(RandomAccessIterator src, T* dst, ::cuda::std::integer_sequence<int, Is...>)
{
// TODO(bgruber): replace by fold over comma in C++17
int dummy[] = {(dst[Is] = src[Is], 0)...};
(void) dummy;
}
} // namespace detail
template <int Count, CacheLoadModifier MODIFIER, typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE void UnrolledThreadLoad(T const* src, T* dst)
{
detail::UnrolledThreadLoadImpl<MODIFIER>(src, dst, ::cuda::std::make_integer_sequence<int, Count>{});
}
template <int Count, typename RandomAccessIterator, typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE void UnrolledCopy(RandomAccessIterator src, T* dst)
{
detail::UnrolledCopyImpl(src, dst, ::cuda::std::make_integer_sequence<int, Count>{});
}
# define _CUB_LOAD_16(cub_modifier, ptx_modifier) \
template <> \
_CCCL_DEVICE _CCCL_FORCEINLINE uint4 ThreadLoad<cub_modifier, uint4 const*>(uint4 const* ptr) \
{ \
uint4 retval; \
asm volatile("ld." #ptx_modifier ".v4.u32 {%0, %1, %2, %3}, [%4];" \
: "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) \
: "l"(ptr)); \
return retval; \
} \
template <> \
_CCCL_DEVICE _CCCL_FORCEINLINE ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const*>(ulonglong2 const* ptr) \
{ \
ulonglong2 retval; \
asm volatile("ld." #ptx_modifier ".v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr)); \
return retval; \
}
# define _CUB_LOAD_8(cub_modifier, ptx_modifier) \
template <> \
_CCCL_DEVICE _CCCL_FORCEINLINE ushort4 ThreadLoad<cub_modifier, ushort4 const*>(ushort4 const* ptr) \
{ \
ushort4 retval; \
asm volatile("ld." #ptx_modifier ".v4.u16 {%0, %1, %2, %3}, [%4];" \
: "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) \
: "l"(ptr)); \
return retval; \
} \
template <> \
_CCCL_DEVICE _CCCL_FORCEINLINE uint2 ThreadLoad<cub_modifier, uint2 const*>(uint2 const* ptr) \
{ \
uint2 retval; \
asm volatile("ld." #ptx_modifier ".v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr)); \
return retval; \
} \
template <> \
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned long long ThreadLoad<cub_modifier, unsigned long long const*>( \
unsigned long long const* ptr) \
{ \
unsigned long long retval; \
asm volatile("ld." #ptx_modifier ".u64 %0, [%1];" : "=l"(retval) : "l"(ptr)); \
return retval; \
}
# define _CUB_LOAD_4(cub_modifier, ptx_modifier) \
template <> \
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int ThreadLoad<cub_modifier, unsigned int const*>(unsigned int const* ptr) \
{ \
unsigned int retval; \
asm volatile("ld." #ptx_modifier ".u32 %0, [%1];" : "=r"(retval) : "l"(ptr)); \
return retval; \
}
# define _CUB_LOAD_2(cub_modifier, ptx_modifier) \
template <> \
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned short ThreadLoad<cub_modifier, unsigned short const*>( \
unsigned short const* ptr) \
{ \
unsigned short retval; \
asm volatile("ld." #ptx_modifier ".u16 %0, [%1];" : "=h"(retval) : "l"(ptr)); \
return retval; \
}
# define _CUB_LOAD_1(cub_modifier, ptx_modifier) \
template <> \
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned char ThreadLoad<cub_modifier, unsigned char const*>( \
unsigned char const* ptr) \
{ \
unsigned short retval; \
asm volatile( \
"{" \
" .reg .u8 datum;" \
" ld." #ptx_modifier ".u8 datum, [%1];" \
" cvt.u16.u8 %0, datum;" \
"}" \
: "=h"(retval) \
: "l"(ptr)); \
return (unsigned char) retval; \
}
# define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \
_CUB_LOAD_16(cub_modifier, ptx_modifier) \
_CUB_LOAD_8(cub_modifier, ptx_modifier) \
_CUB_LOAD_4(cub_modifier, ptx_modifier) \
_CUB_LOAD_2(cub_modifier, ptx_modifier) \
_CUB_LOAD_1(cub_modifier, ptx_modifier)
_CUB_LOAD_ALL(LOAD_CA, ca)
_CUB_LOAD_ALL(LOAD_CG, cg)
_CUB_LOAD_ALL(LOAD_CS, cs)
_CUB_LOAD_ALL(LOAD_CV, cv)
_CUB_LOAD_ALL(LOAD_LDG, global.nc)
// Macro cleanup
# undef _CUB_LOAD_ALL
# undef _CUB_LOAD_1
# undef _CUB_LOAD_2
# undef _CUB_LOAD_4
# undef _CUB_LOAD_8
# undef _CUB_LOAD_16
template <typename RandomAccessIterator>
_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t<RandomAccessIterator>
ThreadLoad(RandomAccessIterator itr, Int2Type<LOAD_DEFAULT> /*modifier*/, Int2Type<false> /*is_pointer*/)
{
return *itr;
}
template <typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE T
ThreadLoad(const T* ptr, Int2Type<LOAD_DEFAULT> /*modifier*/, Int2Type<true> /*is_pointer*/)
{
return *ptr;
}
template <typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(const T* ptr, Int2Type<true> /*is_primitive*/)
{
T retval = *reinterpret_cast<const volatile T*>(ptr);
return retval;
}
template <typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(const T* ptr, Int2Type<false> /*is_primitive*/)
{
// Word type for memcpying
using VolatileWord = typename UnitWord<T>::VolatileWord;
constexpr int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
T retval;
VolatileWord* words = reinterpret_cast<VolatileWord*>(&retval);
UnrolledCopy<VOLATILE_MULTIPLE>(reinterpret_cast<const volatile VolatileWord*>(ptr), words);
return retval;
}
template <typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE T
ThreadLoad(const T* ptr, Int2Type<LOAD_VOLATILE> /*modifier*/, Int2Type<true> /*is_pointer*/)
{
return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
}
template <typename T, int MODIFIER>
_CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T const* ptr, Int2Type<MODIFIER> /*modifier*/, Int2Type<true> /*is_pointer*/)
{
using DeviceWord = typename UnitWord<T>::DeviceWord;
constexpr int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
DeviceWord words[DEVICE_MULTIPLE];
UnrolledThreadLoad<DEVICE_MULTIPLE, CacheLoadModifier(MODIFIER)>(reinterpret_cast<const DeviceWord*>(ptr), words);
return *reinterpret_cast<T*>(words);
}
template <CacheLoadModifier MODIFIER, typename RandomAccessIterator>
_CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t<RandomAccessIterator> ThreadLoad(RandomAccessIterator itr)
{
return ThreadLoad(itr, Int2Type<MODIFIER>(), Int2Type<::cuda::std::is_pointer<RandomAccessIterator>::value>());
}
#endif // _CCCL_DOXYGEN_INVOKED
CUB_NAMESPACE_END