cub::WarpLoad
Defined in cub/warp/warp_load.cuh
template<typename InputT, int ITEMS_PER_THREAD, WarpLoadAlgorithm ALGORITHM = WARP_LOAD_DIRECT, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
class WarpLoad

The WarpLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread warp.
Overview
The WarpLoad class provides a single data movement abstraction that can be specialized to implement different cub::WarpLoadAlgorithm strategies. This facilitates different performance policies for different architectures, data types, granularity sizes, etc.
WarpLoad can be optionally specialized by different data movement strategies:
cub::WARP_LOAD_DIRECT: a blocked arrangement of data is read directly from memory.
cub::WARP_LOAD_STRIPED: a striped arrangement of data is read directly from memory.
cub::WARP_LOAD_VECTORIZE: a blocked arrangement of data is read directly from memory using CUDA’s built-in vectorized loads as a coalescing optimization.
cub::WARP_LOAD_TRANSPOSE: a striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement.
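As a rough illustration of the difference between the two arrangements (a sketch for exposition, not taken from this documentation), consider 4 threads loading 8 items, 2 per thread:

// Blocked arrangement (the output of WARP_LOAD_DIRECT, WARP_LOAD_VECTORIZE,
// and WARP_LOAD_TRANSPOSE): each thread owns consecutive items.
//   thread 0: {0,1}   thread 1: {2,3}   thread 2: {4,5}   thread 3: {6,7}
//
// Striped arrangement (the output of WARP_LOAD_STRIPED): items are strided
// across threads by the logical warp width.
//   thread 0: {0,4}   thread 1: {1,5}   thread 2: {2,6}   thread 3: {3,7}

Striped reads coalesce naturally; WARP_LOAD_TRANSPOSE pays for a shared-memory transpose to deliver the blocked arrangement while still reading memory in a coalesced, warp-striped pattern.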
A Simple Example
The code snippet below illustrates the loading of a linear segment of 64 integers into a “blocked” arrangement across 16 threads where each thread owns 4 consecutive items. The load is specialized for WARP_LOAD_TRANSPOSE, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).

#include <cub/cub.cuh>   // or equivalently <cub/warp/warp_load.cuh>

__global__ void ExampleKernel(int *d_data, ...)
{
    constexpr int warp_threads     = 16;
    constexpr int block_threads    = 256;
    constexpr int items_per_thread = 4;

    // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each
    using WarpLoadT = WarpLoad<int,
                               items_per_thread,
                               cub::WARP_LOAD_TRANSPOSE,
                               warp_threads>;

    constexpr int warps_in_block = block_threads / warp_threads;
    constexpr int tile_size      = items_per_thread * warp_threads;
    const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;

    // Allocate shared memory for WarpLoad
    __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];

    // Load a segment of consecutive items that are blocked across threads
    int thread_data[items_per_thread];
    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
                                          thread_data);
}
Suppose the input d_data is 0, 1, 2, 3, 4, 5, .... The set of thread_data across the first logical warp of threads will be: { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }.
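For completeness, a minimal host-side launch sketch (not part of the CUB documentation; it assumes the kernel above is declared with only the int* parameter, i.e. the elided "..." arguments are dropped):

#include <cuda_runtime.h>
#include <numeric>
#include <vector>

int main()
{
    constexpr int block_threads    = 256;
    constexpr int items_per_thread = 4;
    constexpr int total_items      = block_threads * items_per_thread;  // one tile per logical warp

    // Fill 0, 1, 2, 3, ... to match the example above
    std::vector<int> h_data(total_items);
    std::iota(h_data.begin(), h_data.end(), 0);

    int *d_data;
    cudaMalloc(&d_data, total_items * sizeof(int));
    cudaMemcpy(d_data, h_data.data(), total_items * sizeof(int), cudaMemcpyHostToDevice);

    // One block of 256 threads = 16 logical warps of 16 threads each
    ExampleKernel<<<1, block_threads>>>(d_data);
    cudaDeviceSynchronize();

    cudaFree(d_data);
    return 0;
}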
Template Parameters

InputT – The data type to read into (which must be convertible from the input iterator’s value type).
ITEMS_PER_THREAD – The number of consecutive items partitioned onto each thread.
ALGORITHM – [optional] cub::WarpLoadAlgorithm tuning policy. Default: cub::WARP_LOAD_DIRECT.
LOGICAL_WARP_THREADS – [optional] The number of threads per “logical” warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a power of two.
LEGACY_PTX_ARCH – Unused.
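A few illustrative specializations of these parameters (assumptions for exposition, not taken from the documentation):

// Defaults: WARP_LOAD_DIRECT algorithm, full hardware-warp width
using DefaultLoadT = cub::WarpLoad<float, 4>;

// Logical warp of 8 threads (LOGICAL_WARP_THREADS must be a power of two)
using SmallWarpLoadT = cub::WarpLoad<float, 4, cub::WARP_LOAD_DIRECT, 8>;

// Vectorized loads as a coalescing optimization for the blocked arrangement
using VectorizedLoadT = cub::WarpLoad<int, 4, cub::WARP_LOAD_VECTORIZE>;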
Collective constructors
inline WarpLoad()
Collective constructor using a private static allocation of shared memory as temporary storage.
inline WarpLoad(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
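A brief sketch of the two constructor forms (assuming the WarpLoadT specialization and per-warp temp_storage array from the example above):

// Collective constructor with caller-provided storage (the form used in the
// examples here, so that each logical warp in the block has its own storage)
WarpLoadT loader(temp_storage[warp_id]);

// Collective constructor with a private static shared-memory allocation
WarpLoadT default_loader;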
Data movement
template<typename InputIteratorT>
inline void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD])

Load a linear segment of items from memory.
A subsequent __syncwarp() warp-wide barrier should be invoked after calling this method if the collective’s temporary storage (e.g., temp_storage) is to be reused or repurposed (a reuse sketch follows this method’s parameter list).

Snippet
#include <cub/cub.cuh>   // or equivalently <cub/warp/warp_load.cuh>

__global__ void ExampleKernel(int *d_data, ...)
{
    constexpr int warp_threads     = 16;
    constexpr int block_threads    = 256;
    constexpr int items_per_thread = 4;

    // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each
    using WarpLoadT = WarpLoad<int,
                               items_per_thread,
                               cub::WARP_LOAD_TRANSPOSE,
                               warp_threads>;

    constexpr int warps_in_block = block_threads / warp_threads;
    constexpr int tile_size      = items_per_thread * warp_threads;
    const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;

    // Allocate shared memory for WarpLoad
    __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];

    // Load a segment of consecutive items that are blocked across threads
    int thread_data[items_per_thread];
    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
                                          thread_data);
}
Suppose the input d_data is 0, 1, 2, 3, 4, 5, .... The set of thread_data across the first logical warp of threads will be: { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }.

Parameters
block_itr – [in] The thread block’s base input iterator for loading from
items – [out] Data to load
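A sketch of the reuse pattern that the __syncwarp() note above refers to (reusing the names from the snippet; the second tile offset is an assumption for illustration):

WarpLoadT loader(temp_storage[warp_id]);

// First tile
loader.Load(d_data + warp_id * tile_size, thread_data);

// Barrier before the warp’s temporary storage is reused
__syncwarp();

// Second round of tiles through the same temp_storage[warp_id]
loader.Load(d_data + (warps_in_block + warp_id) * tile_size, thread_data);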
template<typename InputIteratorT>
inline void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items)

Load a linear segment of items from memory, guarded by range.
A subsequent __syncwarp() warp-wide barrier should be invoked after calling this method if the collective’s temporary storage (e.g., temp_storage) is to be reused or repurposed.

Snippet
#include <cub/cub.cuh>   // or equivalently <cub/warp/warp_load.cuh>

__global__ void ExampleKernel(int *d_data, int valid_items, ...)
{
    constexpr int warp_threads     = 16;
    constexpr int block_threads    = 256;
    constexpr int items_per_thread = 4;

    // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each
    using WarpLoadT = WarpLoad<int,
                               items_per_thread,
                               cub::WARP_LOAD_TRANSPOSE,
                               warp_threads>;

    constexpr int warps_in_block = block_threads / warp_threads;
    constexpr int tile_size      = items_per_thread * warp_threads;
    const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;

    // Allocate shared memory for WarpLoad
    __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];

    // Load a segment of consecutive items that are blocked across threads
    int thread_data[items_per_thread];
    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
                                          thread_data,
                                          valid_items);
}
Suppose the input d_data is 0, 1, 2, 3, 4, 5, ... and valid_items is 5. The set of thread_data across the first logical warp of threads will be: { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads being unmasked to load portions of valid data (and the other items remaining unassigned).

Parameters
block_itr – [in] The thread block’s base input iterator for loading from
items – [out] Data to load
valid_items – [in] Number of valid items to load
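When the total element count is not a multiple of the tile size, each logical warp can clamp its own valid_items before the guarded load; a hedged sketch reusing the names from the snippet above (num_total_items is a hypothetical kernel parameter):

// Clamp this warp’s tile against the (hypothetical) total element count
const int tile_begin = warp_id * tile_size;
int warp_valid_items = num_total_items - tile_begin;   // may be negative past the end
if (warp_valid_items < 0)         { warp_valid_items = 0; }
if (warp_valid_items > tile_size) { warp_valid_items = tile_size; }

WarpLoadT(temp_storage[warp_id]).Load(d_data + tile_begin,
                                      thread_data,
                                      warp_valid_items);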
template<typename InputIteratorT, typename DefaultT>
inline void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)

Load a linear segment of items from memory, guarded by range.
A subsequent __syncwarp() warp-wide barrier should be invoked after calling this method if the collective’s temporary storage (e.g., temp_storage) is to be reused or repurposed.

Snippet
#include <cub/cub.cuh>   // or equivalently <cub/warp/warp_load.cuh>

__global__ void ExampleKernel(int *d_data, int valid_items, ...)
{
    constexpr int warp_threads     = 16;
    constexpr int block_threads    = 256;
    constexpr int items_per_thread = 4;

    // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each
    using WarpLoadT = WarpLoad<int,
                               items_per_thread,
                               cub::WARP_LOAD_TRANSPOSE,
                               warp_threads>;

    constexpr int warps_in_block = block_threads / warp_threads;
    constexpr int tile_size      = items_per_thread * warp_threads;
    const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;

    // Allocate shared memory for WarpLoad
    __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];

    // Load a segment of consecutive items that are blocked across threads
    int thread_data[items_per_thread];
    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
                                          thread_data,
                                          valid_items,
                                          -1);
}
Suppose the input d_data is 0, 1, 2, 3, 4, 5, ..., valid_items is 5, and the out-of-bounds default is -1. The set of thread_data across the first logical warp of threads will be: { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads being unmasked to load portions of valid data (and the other items assigned -1).

Parameters
block_itr – [in] The thread block’s base input iterator for loading from
items – [out] Data to load
valid_items – [in] Number of valid items to load
oob_default – [in] Default value to assign out-of-bound items
struct TempStorage : public Uninitialized<_TempStorage>
The operations exposed by WarpLoad require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union’d with other storage allocation types to facilitate memory reuse.
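A sketch of the union pattern described above, pairing WarpLoad with its storing counterpart cub::WarpStore (from <cub/warp/warp_store.cuh>); the specialization parameters mirror the earlier examples:

#include <cub/cub.cuh>   // provides cub::WarpLoad and cub::WarpStore

using WarpLoadT  = cub::WarpLoad<int, items_per_thread, cub::WARP_LOAD_TRANSPOSE, warp_threads>;
using WarpStoreT = cub::WarpStore<int, items_per_thread, cub::WARP_STORE_TRANSPOSE, warp_threads>;

// union’d storage: the load and store phases reuse the same shared memory
__shared__ union TempStorageT
{
    typename WarpLoadT::TempStorage  load;
    typename WarpStoreT::TempStorage store;
} temp_storage[warps_in_block];

// ...
// WarpLoadT(temp_storage[warp_id].load).Load(...);
// __syncwarp();   // barrier before the storage is repurposed
// WarpStoreT(temp_storage[warp_id].store).Store(...);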