cuda::experimental::stf::data_place#

class data_place#

Designates where data will be stored (CPU memory vs. on device 0 (first GPU), device 1 (second GPU), …).

This typed enum is aligned with CUDA device ordinals but does not implicitly convert to int. See device_ordinal below.

Public Functions

data_place() = default#

Default constructor.

The object is initialized as invalid.

inline bool operator==(const data_place &rhs) const#
inline bool operator!=(const data_place &rhs) const#
inline bool is_composite() const#

Checks if this data place is a composite data place.

inline bool is_extension() const#

Checks if this data place has an extension (green context, etc.).

inline bool is_invalid() const#
inline bool is_host() const#
inline bool is_managed() const#
inline bool is_affine() const#
inline bool is_device() const#

Checks if this data place corresponds to a specific device.

inline bool is_device_auto() const#
inline ::std::string to_string() const#
inline const exec_place_grid &get_grid() const#
inline const get_executor_func_t &get_partitioner() const#
inline exec_place get_affine_exec_place() const#
inline decorated_stream getDataStream(
async_resources_handle &async_resources
) const#
inline bool has_extension() const#

Check if this data place has a custom extension.

inline const ::std::shared_ptr<data_place_extension> &get_extension(
) const#

Get the extension (may be nullptr for standard place types)

inline CUresult mem_create(
CUmemGenericAllocationHandle *handle,
size_t size
) const#

Create a physical memory allocation for this place (VMM API)

This method is used by localized arrays (composite_slice) to create physical memory segments that are then mapped into a contiguous virtual address space. It delegates to the extension’s mem_create if present (enabling custom place types to override memory allocation), otherwise creates a standard pinned allocation on this place’s device or host.

Managed memory is not supported by the VMM API.

See also

allocate() for regular memory allocation

Note

For regular memory allocation (not VMM-based), use the allocate() method instead, which provides stream-ordered allocation via cudaMallocAsync.

Parameters:
  • handle – Output parameter for the allocation handle

  • size – Size of the allocation in bytes

Returns:

CUresult indicating success or failure

inline void *allocate(
::std::ptrdiff_t size,
cudaStream_t stream = nullptr
) const#

Allocate memory at this data place (raw allocation)

This is the low-level allocation interface that handles all place types:

  • For extensions: delegates to extension->allocate()

  • For host: uses cudaMallocHost (immediate, stream ignored)

  • For managed: uses cudaMallocManaged (immediate, stream ignored)

  • For device: uses cudaMallocAsync (stream-ordered)

Parameters:
  • size – Size of the allocation in bytes

  • stream – CUDA stream for stream-ordered allocations (ignored for immediate allocations, defaults to nullptr)

Returns:

Pointer to allocated memory

inline void deallocate(
void *ptr,
size_t size,
cudaStream_t stream = nullptr
) const#

Deallocate memory at this data place (raw deallocation)

For immediate deallocations (host, managed), the stream is ignored. Note that cudaFree (used for managed memory) may introduce implicit synchronization.

Parameters:
  • ptr – Pointer to memory to deallocate

  • size – Size of the allocation

  • stream – CUDA stream for stream-ordered deallocations (ignored for immediate deallocations, defaults to nullptr)

inline bool allocation_is_stream_ordered() const#

Returns true if allocation/deallocation is stream-ordered.

When this returns true, the allocation uses stream-ordered APIs like cudaMallocAsync, and allocators should use stream_async_op to synchronize prerequisites before allocation.

When this returns false, the allocation is immediate (like cudaMallocHost) and the stream parameter is ignored. Note that immediate deallocations (e.g., cudaFree) may introduce implicit synchronization.

Public Static Functions

static inline data_place invalid()#

Represents an invalid data_place object.

static inline data_place host()#

Represents the host CPU as the data_place (pinned host memory, or memory which should be pinned by CUDASTF).

static inline data_place managed()#

Represents a managed memory location as the data_place.

static inline data_place affine()#

This actually does not define a data_place, but means that we should use the data place affine to the execution place.

static inline data_place device_auto()#

Constant representing a placeholder that lets the library automatically select a GPU device as the data_place.

static inline data_place device(int dev_id = 0)#

Data is placed on device with index dev_id.

Two relaxations are allowed: -1 can be passed to create a placeholder for the host, and -2 can be used to create a placeholder for a managed memory location.

static inline data_place current_device()#

Select the embedded memory of the current device as data_place.

template<typename partitioner_t>
static data_place composite(
partitioner_t p,
const exec_place_grid &g
)#
static inline data_place composite(
get_executor_func_t f,
const exec_place_grid &grid
)#
static inline data_place green_ctx(const green_ctx_view &gc_view)#
static inline data_place green_ctx(
::std::shared_ptr<green_ctx_view> gc_view_ptr
)#
static inline data_place from_extension(
::std::shared_ptr<data_place_extension> ext
)#

Create a data_place from an extension.

This factory method allows custom place types to be created from data_place_extension implementations.

Friends

inline friend size_t to_index(const data_place &p)#

Returns an index guaranteed to be >= 0 (0 for managed CPU, 1 for pinned CPU, 2 for device 0, 3 for device 1, …).

Requires that p is initialized and different from data_place::invalid().

inline friend int device_ordinal(const data_place &p)#

Returns the device ordinal (0 = first GPU, 1 = second GPU, …; by convention the CPU is -1). Requires that p is initialized.

class composite_state#

Public Functions

composite_state() = default#
inline composite_state(
exec_place_grid grid,
get_executor_func_t partitioner_func
)#
inline const exec_place_grid &get_grid() const#
inline const get_executor_func_t &get_partitioner() const#