include/cuda/experimental/__device/attributes.cuh

File members: include/cuda/experimental/__device/attributes.cuh

//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDAX__DEVICE_ATTRIBUTES_
#define _CUDAX__DEVICE_ATTRIBUTES_

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cuda/std/__cccl/attributes.h>
#include <cuda/std/__cuda/api_wrapper.h>

#include <cuda/experimental/__device/device_ref.cuh>

namespace cuda::experimental
{

namespace detail
{

template <::cudaDeviceAttr _Attr, typename _Type>
struct __dev_attr_impl
{
  using type = _Type;

  _CCCL_NODISCARD constexpr operator ::cudaDeviceAttr() const noexcept
  {
    return _Attr;
  }

  _CCCL_NODISCARD type operator()(device_ref __dev_id) const
  {
    int __value = 0;
    _CCCL_TRY_CUDA_API(::cudaDeviceGetAttribute, "failed to get device attribute", &__value, _Attr, __dev_id.get());
    return static_cast<type>(__value);
  }
};

template <::cudaDeviceAttr _Attr>
struct __dev_attr : __dev_attr_impl<_Attr, int>
{};

// TODO: give this a strong type for kilohertz
template <>
struct __dev_attr<::cudaDevAttrClockRate> //
    : __dev_attr_impl<::cudaDevAttrClockRate, int>
{};
template <>
struct __dev_attr<::cudaDevAttrGpuOverlap> //
    : __dev_attr_impl<::cudaDevAttrGpuOverlap, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrKernelExecTimeout> //
    : __dev_attr_impl<::cudaDevAttrKernelExecTimeout, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrIntegrated> //
    : __dev_attr_impl<::cudaDevAttrIntegrated, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrCanMapHostMemory> //
    : __dev_attr_impl<::cudaDevAttrCanMapHostMemory, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrComputeMode> //
    : __dev_attr_impl<::cudaDevAttrComputeMode, ::cudaComputeMode>
{
  static constexpr type default_mode           = cudaComputeModeDefault;
  static constexpr type prohibited_mode        = cudaComputeModeProhibited;
  static constexpr type exclusive_process_mode = cudaComputeModeExclusiveProcess;
};
template <>
struct __dev_attr<::cudaDevAttrConcurrentKernels> //
    : __dev_attr_impl<::cudaDevAttrConcurrentKernels, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrEccEnabled> //
    : __dev_attr_impl<::cudaDevAttrEccEnabled, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrTccDriver> //
    : __dev_attr_impl<::cudaDevAttrTccDriver, bool>
{};
// TODO: give this a strong type for kilohertz
template <>
struct __dev_attr<::cudaDevAttrMemoryClockRate> //
    : __dev_attr_impl<::cudaDevAttrMemoryClockRate, int>
{};
// TODO: give this a strong type for bits
template <>
struct __dev_attr<::cudaDevAttrGlobalMemoryBusWidth> //
    : __dev_attr_impl<::cudaDevAttrGlobalMemoryBusWidth, int>
{};
// TODO: give this a strong type for bytes
template <>
struct __dev_attr<::cudaDevAttrL2CacheSize> //
    : __dev_attr_impl<::cudaDevAttrL2CacheSize, int>
{};
template <>
struct __dev_attr<::cudaDevAttrUnifiedAddressing> //
    : __dev_attr_impl<::cudaDevAttrUnifiedAddressing, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrStreamPrioritiesSupported> //
    : __dev_attr_impl<::cudaDevAttrStreamPrioritiesSupported, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrGlobalL1CacheSupported> //
    : __dev_attr_impl<::cudaDevAttrGlobalL1CacheSupported, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrLocalL1CacheSupported> //
    : __dev_attr_impl<::cudaDevAttrLocalL1CacheSupported, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrManagedMemory> //
    : __dev_attr_impl<::cudaDevAttrManagedMemory, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrIsMultiGpuBoard> //
    : __dev_attr_impl<::cudaDevAttrIsMultiGpuBoard, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrHostNativeAtomicSupported> //
    : __dev_attr_impl<::cudaDevAttrHostNativeAtomicSupported, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrPageableMemoryAccess> //
    : __dev_attr_impl<::cudaDevAttrPageableMemoryAccess, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrConcurrentManagedAccess> //
    : __dev_attr_impl<::cudaDevAttrConcurrentManagedAccess, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrComputePreemptionSupported> //
    : __dev_attr_impl<::cudaDevAttrComputePreemptionSupported, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrCanUseHostPointerForRegisteredMem> //
    : __dev_attr_impl<::cudaDevAttrCanUseHostPointerForRegisteredMem, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrCooperativeLaunch> //
    : __dev_attr_impl<::cudaDevAttrCooperativeLaunch, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrCooperativeMultiDeviceLaunch> //
    : __dev_attr_impl<::cudaDevAttrCooperativeMultiDeviceLaunch, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrCanFlushRemoteWrites> //
    : __dev_attr_impl<::cudaDevAttrCanFlushRemoteWrites, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrHostRegisterSupported> //
    : __dev_attr_impl<::cudaDevAttrHostRegisterSupported, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost> //
    : __dev_attr_impl<::cudaDevAttrDirectManagedMemAccessFromHost, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrSparseCudaArraySupported> //
    : __dev_attr_impl<::cudaDevAttrSparseCudaArraySupported, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrMemoryPoolsSupported> //
    : __dev_attr_impl<::cudaDevAttrMemoryPoolsSupported, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrGPUDirectRDMASupported> //
    : __dev_attr_impl<::cudaDevAttrGPUDirectRDMASupported, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrDeferredMappingCudaArraySupported> //
    : __dev_attr_impl<::cudaDevAttrDeferredMappingCudaArraySupported, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrIpcEventSupport> //
    : __dev_attr_impl<::cudaDevAttrIpcEventSupport, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables>
    : __dev_attr_impl<::cudaDevAttrPageableMemoryAccessUsesHostPageTables, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrHostRegisterReadOnlySupported> //
    : __dev_attr_impl<::cudaDevAttrHostRegisterReadOnlySupported, bool>
{};
template <>
struct __dev_attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions> //
    : __dev_attr_impl<::cudaDevAttrGPUDirectRDMAFlushWritesOptions, ::cudaFlushGPUDirectRDMAWritesOptions>
{
  static constexpr type host    = ::cudaFlushGPUDirectRDMAWritesOptionHost;
  static constexpr type mem_ops = ::cudaFlushGPUDirectRDMAWritesOptionMemOps;
};
template <>
struct __dev_attr<::cudaDevAttrGPUDirectRDMAWritesOrdering> //
    : __dev_attr_impl<::cudaDevAttrGPUDirectRDMAWritesOrdering, ::cudaGPUDirectRDMAWritesOrdering>
{
  static constexpr type none        = ::cudaGPUDirectRDMAWritesOrderingNone;
  static constexpr type owner       = ::cudaGPUDirectRDMAWritesOrderingOwner;
  static constexpr type all_devices = ::cudaGPUDirectRDMAWritesOrderingAllDevices;
};
template <>
struct __dev_attr<::cudaDevAttrMemoryPoolSupportedHandleTypes> //
    : __dev_attr_impl<::cudaDevAttrMemoryPoolSupportedHandleTypes, ::cudaMemAllocationHandleType>
{
  static constexpr type none                  = ::cudaMemHandleTypeNone;
  static constexpr type posix_file_descriptor = ::cudaMemHandleTypePosixFileDescriptor;
  static constexpr type win32                 = ::cudaMemHandleTypeWin32;
  static constexpr type win32_kmt             = ::cudaMemHandleTypeWin32Kmt;
#if CUDART_VERSION >= 12040
  static constexpr type fabric = ::cudaMemHandleTypeFabric;
#else
  static constexpr type fabric = static_cast<::cudaMemAllocationHandleType>(0x8);
#endif
};
#if CUDART_VERSION >= 12020
template <>
struct __dev_attr<::cudaDevAttrNumaConfig> //
    : __dev_attr_impl<::cudaDevAttrNumaConfig, ::cudaDeviceNumaConfig>
{
  static constexpr type none      = ::cudaDeviceNumaConfigNone;
  static constexpr type numa_node = ::cudaDeviceNumaConfigNumaNode;
};
#endif

struct __device_attrs
{
  // Maximum number of threads per block
  using max_threads_per_block_t = detail::__dev_attr<::cudaDevAttrMaxThreadsPerBlock>;
  static constexpr max_threads_per_block_t max_threads_per_block{};

  // Maximum x-dimension of a block
  using max_block_dim_x_t = detail::__dev_attr<::cudaDevAttrMaxBlockDimX>;
  static constexpr max_block_dim_x_t max_block_dim_x{};

  // Maximum y-dimension of a block
  using max_block_dim_y_t = detail::__dev_attr<::cudaDevAttrMaxBlockDimY>;
  static constexpr max_block_dim_y_t max_block_dim_y{};

  // Maximum z-dimension of a block
  using max_block_dim_z_t = detail::__dev_attr<::cudaDevAttrMaxBlockDimZ>;
  static constexpr max_block_dim_z_t max_block_dim_z{};

  // Maximum x-dimension of a grid
  using max_grid_dim_x_t = detail::__dev_attr<::cudaDevAttrMaxGridDimX>;
  static constexpr max_grid_dim_x_t max_grid_dim_x{};

  // Maximum y-dimension of a grid
  using max_grid_dim_y_t = detail::__dev_attr<::cudaDevAttrMaxGridDimY>;
  static constexpr max_grid_dim_y_t max_grid_dim_y{};

  // Maximum z-dimension of a grid
  using max_grid_dim_z_t = detail::__dev_attr<::cudaDevAttrMaxGridDimZ>;
  static constexpr max_grid_dim_z_t max_grid_dim_z{};

  // Maximum amount of shared memory available to a thread block in bytes
  using max_shared_memory_per_block_t = detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerBlock>;
  static constexpr max_shared_memory_per_block_t max_shared_memory_per_block{};

  // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
  using total_constant_memory_t = detail::__dev_attr<::cudaDevAttrTotalConstantMemory>;
  static constexpr total_constant_memory_t total_constant_memory{};

  // Warp size in threads
  using warp_size_t = detail::__dev_attr<::cudaDevAttrWarpSize>;
  static constexpr warp_size_t warp_size{};

  // Maximum pitch in bytes allowed by the memory copy functions that involve
  // memory regions allocated through cudaMallocPitch()
  using max_pitch_t = detail::__dev_attr<::cudaDevAttrMaxPitch>;
  static constexpr max_pitch_t max_pitch{};

  // Maximum 1D texture width
  using max_texture_1d_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DWidth>;
  static constexpr max_texture_1d_width_t max_texture_1d_width{};

  // Maximum width for a 1D texture bound to linear memory
  using max_texture_1d_linear_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DLinearWidth>;
  static constexpr max_texture_1d_linear_width_t max_texture_1d_linear_width{};

  // Maximum mipmapped 1D texture width
  using max_texture_1d_mipmapped_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DMipmappedWidth>;
  static constexpr max_texture_1d_mipmapped_width_t max_texture_1d_mipmapped_width{};

  // Maximum 2D texture width
  using max_texture_2d_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DWidth>;
  static constexpr max_texture_2d_width_t max_texture_2d_width{};

  // Maximum 2D texture height
  using max_texture_2d_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DHeight>;
  static constexpr max_texture_2d_height_t max_texture_2d_height{};

  // Maximum width for a 2D texture bound to linear memory
  using max_texture_2d_linear_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearWidth>;
  static constexpr max_texture_2d_linear_width_t max_texture_2d_linear_width{};

  // Maximum height for a 2D texture bound to linear memory
  using max_texture_2d_linear_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearHeight>;
  static constexpr max_texture_2d_linear_height_t max_texture_2d_linear_height{};

  // Maximum pitch in bytes for a 2D texture bound to linear memory
  using max_texture_2d_linear_pitch_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearPitch>;
  static constexpr max_texture_2d_linear_pitch_t max_texture_2d_linear_pitch{};

  // Maximum mipmapped 2D texture width
  using max_texture_2d_mipmapped_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DMipmappedWidth>;
  static constexpr max_texture_2d_mipmapped_width_t max_texture_2d_mipmapped_width{};

  // Maximum mipmapped 2D texture height
  using max_texture_2d_mipmapped_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DMipmappedHeight>;
  static constexpr max_texture_2d_mipmapped_height_t max_texture_2d_mipmapped_height{};

  // Maximum 3D texture width
  using max_texture_3d_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DWidth>;
  static constexpr max_texture_3d_width_t max_texture_3d_width{};

  // Maximum 3D texture height
  using max_texture_3d_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DHeight>;
  static constexpr max_texture_3d_height_t max_texture_3d_height{};

  // Maximum 3D texture depth
  using max_texture_3d_depth_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DDepth>;
  static constexpr max_texture_3d_depth_t max_texture_3d_depth{};

  // Alternate maximum 3D texture width, 0 if no alternate maximum 3D texture size is supported
  using max_texture_3d_width_alt_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DWidthAlt>;
  static constexpr max_texture_3d_width_alt_t max_texture_3d_width_alt{};

  // Alternate maximum 3D texture height, 0 if no alternate maximum 3D texture size is supported
  using max_texture_3d_height_alt_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DHeightAlt>;
  static constexpr max_texture_3d_height_alt_t max_texture_3d_height_alt{};

  // Alternate maximum 3D texture depth, 0 if no alternate maximum 3D texture size is supported
  using max_texture_3d_depth_alt_t = detail::__dev_attr<::cudaDevAttrMaxTexture3DDepthAlt>;
  static constexpr max_texture_3d_depth_alt_t max_texture_3d_depth_alt{};

  // Maximum cubemap texture width or height
  using max_texture_cubemap_width_t = detail::__dev_attr<::cudaDevAttrMaxTextureCubemapWidth>;
  static constexpr max_texture_cubemap_width_t max_texture_cubemap_width{};

  // Maximum 1D layered texture width
  using max_texture_1d_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DLayeredWidth>;
  static constexpr max_texture_1d_layered_width_t max_texture_1d_layered_width{};

  // Maximum layers in a 1D layered texture
  using max_texture_1d_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxTexture1DLayeredLayers>;
  static constexpr max_texture_1d_layered_layers_t max_texture_1d_layered_layers{};

  // Maximum 2D layered texture width
  using max_texture_2d_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredWidth>;
  static constexpr max_texture_2d_layered_width_t max_texture_2d_layered_width{};

  // Maximum 2D layered texture height
  using max_texture_2d_layered_height_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredHeight>;
  static constexpr max_texture_2d_layered_height_t max_texture_2d_layered_height{};

  // Maximum layers in a 2D layered texture
  using max_texture_2d_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredLayers>;
  static constexpr max_texture_2d_layered_layers_t max_texture_2d_layered_layers{};

  // Maximum cubemap layered texture width or height
  using max_texture_cubemap_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxTextureCubemapLayeredWidth>;
  static constexpr max_texture_cubemap_layered_width_t max_texture_cubemap_layered_width{};

  // Maximum layers in a cubemap layered texture
  using max_texture_cubemap_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxTextureCubemapLayeredLayers>;
  static constexpr max_texture_cubemap_layered_layers_t max_texture_cubemap_layered_layers{};

  // Maximum 1D surface width
  using max_surface_1d_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface1DWidth>;
  static constexpr max_surface_1d_width_t max_surface_1d_width{};

  // Maximum 2D surface width
  using max_surface_2d_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DWidth>;
  static constexpr max_surface_2d_width_t max_surface_2d_width{};

  // Maximum 2D surface height
  using max_surface_2d_height_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DHeight>;
  static constexpr max_surface_2d_height_t max_surface_2d_height{};

  // Maximum 3D surface width
  using max_surface_3d_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface3DWidth>;
  static constexpr max_surface_3d_width_t max_surface_3d_width{};

  // Maximum 3D surface height
  using max_surface_3d_height_t = detail::__dev_attr<::cudaDevAttrMaxSurface3DHeight>;
  static constexpr max_surface_3d_height_t max_surface_3d_height{};

  // Maximum 3D surface depth
  using max_surface_3d_depth_t = detail::__dev_attr<::cudaDevAttrMaxSurface3DDepth>;
  static constexpr max_surface_3d_depth_t max_surface_3d_depth{};

  // Maximum 1D layered surface width
  using max_surface_1d_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface1DLayeredWidth>;
  static constexpr max_surface_1d_layered_width_t max_surface_1d_layered_width{};

  // Maximum layers in a 1D layered surface
  using max_surface_1d_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxSurface1DLayeredLayers>;
  static constexpr max_surface_1d_layered_layers_t max_surface_1d_layered_layers{};

  // Maximum 2D layered surface width
  using max_surface_2d_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredWidth>;
  static constexpr max_surface_2d_layered_width_t max_surface_2d_layered_width{};

  // Maximum 2D layered surface height
  using max_surface_2d_layered_height_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredHeight>;
  static constexpr max_surface_2d_layered_height_t max_surface_2d_layered_height{};

  // Maximum layers in a 2D layered surface
  using max_surface_2d_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredLayers>;
  static constexpr max_surface_2d_layered_layers_t max_surface_2d_layered_layers{};

  // Maximum cubemap surface width
  using max_surface_cubemap_width_t = detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapWidth>;
  static constexpr max_surface_cubemap_width_t max_surface_cubemap_width{};

  // Maximum cubemap layered surface width
  using max_surface_cubemap_layered_width_t = detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredWidth>;
  static constexpr max_surface_cubemap_layered_width_t max_surface_cubemap_layered_width{};

  // Maximum layers in a cubemap layered surface
  using max_surface_cubemap_layered_layers_t = detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredLayers>;
  static constexpr max_surface_cubemap_layered_layers_t max_surface_cubemap_layered_layers{};

  // Maximum number of 32-bit registers available to a thread block
  using max_registers_per_block_t = detail::__dev_attr<::cudaDevAttrMaxRegistersPerBlock>;
  static constexpr max_registers_per_block_t max_registers_per_block{};

  // Peak clock frequency in kilohertz
  using clock_rate_t = detail::__dev_attr<::cudaDevAttrClockRate>;
  static constexpr clock_rate_t clock_rate{};

  // Alignment requirement; texture base addresses aligned to textureAlign bytes
  // do not need an offset applied to texture fetches
  using texture_alignment_t = detail::__dev_attr<::cudaDevAttrTextureAlignment>;
  static constexpr texture_alignment_t texture_alignment{};

  // Pitch alignment requirement for 2D texture references bound to pitched memory
  using texture_pitch_alignment_t = detail::__dev_attr<::cudaDevAttrTexturePitchAlignment>;
  static constexpr texture_pitch_alignment_t texture_pitch_alignment{};

  // true if the device can concurrently copy memory between host and device
  // while executing a kernel, or false if not
  using gpu_overlap_t = detail::__dev_attr<::cudaDevAttrGpuOverlap>;
  static constexpr gpu_overlap_t gpu_overlap{};

  // Number of multiprocessors on the device
  using multiprocessor_count_t = detail::__dev_attr<::cudaDevAttrMultiProcessorCount>;
  static constexpr multiprocessor_count_t multiprocessor_count{};

  // true if there is a run time limit for kernels executed on the device, or
  // false if not
  using kernel_exec_timeout_t = detail::__dev_attr<::cudaDevAttrKernelExecTimeout>;
  static constexpr kernel_exec_timeout_t kernel_exec_timeout{};

  // true if the device is integrated with the memory subsystem, or false if not
  using integrated_t = detail::__dev_attr<::cudaDevAttrIntegrated>;
  static constexpr integrated_t integrated{};

  // true if the device can map host memory into CUDA address space
  using can_map_host_memory_t = detail::__dev_attr<::cudaDevAttrCanMapHostMemory>;
  static constexpr can_map_host_memory_t can_map_host_memory{};

  // Compute mode is the compute mode that the device is currently in.
  using compute_mode_t = detail::__dev_attr<::cudaDevAttrComputeMode>;
  static constexpr compute_mode_t compute_mode{};

  // true if the device supports executing multiple kernels within the same
  // context simultaneously, or false if not. It is not guaranteed that multiple
  // kernels will be resident on the device concurrently so this feature should
  // not be relied upon for correctness.
  using concurrent_kernels_t = detail::__dev_attr<::cudaDevAttrConcurrentKernels>;
  static constexpr concurrent_kernels_t concurrent_kernels{};

  // true if error correction is enabled on the device, 0 if error correction is
  // disabled or not supported by the device
  using ecc_enabled_t = detail::__dev_attr<::cudaDevAttrEccEnabled>;
  static constexpr ecc_enabled_t ecc_enabled{};

  // PCI bus identifier of the device
  using pci_bus_id_t = detail::__dev_attr<::cudaDevAttrPciBusId>;
  static constexpr pci_bus_id_t pci_bus_id{};

  // PCI device (also known as slot) identifier of the device
  using pci_device_id_t = detail::__dev_attr<::cudaDevAttrPciDeviceId>;
  static constexpr pci_device_id_t pci_device_id{};

  // true if the device is using a TCC driver. TCC is only available on Tesla
  // hardware running Windows Vista or later.
  using tcc_driver_t = detail::__dev_attr<::cudaDevAttrTccDriver>;
  static constexpr tcc_driver_t tcc_driver{};

  // Peak memory clock frequency in kilohertz
  using memory_clock_rate_t = detail::__dev_attr<::cudaDevAttrMemoryClockRate>;
  static constexpr memory_clock_rate_t memory_clock_rate{};

  // Global memory bus width in bits
  using global_memory_bus_width_t = detail::__dev_attr<::cudaDevAttrGlobalMemoryBusWidth>;
  static constexpr global_memory_bus_width_t global_memory_bus_width{};

  // Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
  using l2_cache_size_t = detail::__dev_attr<::cudaDevAttrL2CacheSize>;
  static constexpr l2_cache_size_t l2_cache_size{};

  // Maximum resident threads per multiprocessor
  using max_threads_per_multiprocessor_t = detail::__dev_attr<::cudaDevAttrMaxThreadsPerMultiProcessor>;
  static constexpr max_threads_per_multiprocessor_t max_threads_per_multiprocessor{};

  // true if the device shares a unified address space with the host, or false
  // if not
  using unified_addressing_t = detail::__dev_attr<::cudaDevAttrUnifiedAddressing>;
  static constexpr unified_addressing_t unified_addressing{};

  // Major compute capability version number
  using compute_capability_major_t = detail::__dev_attr<::cudaDevAttrComputeCapabilityMajor>;
  static constexpr compute_capability_major_t compute_capability_major{};

  // Minor compute capability version number
  using compute_capability_minor_t = detail::__dev_attr<::cudaDevAttrComputeCapabilityMinor>;
  static constexpr compute_capability_minor_t compute_capability_minor{};

  // true if the device supports stream priorities, or false if not
  using stream_priorities_supported_t = detail::__dev_attr<::cudaDevAttrStreamPrioritiesSupported>;
  static constexpr stream_priorities_supported_t stream_priorities_supported{};

  // true if device supports caching globals in L1 cache, false if not
  using global_l1_cache_supported_t = detail::__dev_attr<::cudaDevAttrGlobalL1CacheSupported>;
  static constexpr global_l1_cache_supported_t global_l1_cache_supported{};

  // true if device supports caching locals in L1 cache, false if not
  using local_l1_cache_supported_t = detail::__dev_attr<::cudaDevAttrLocalL1CacheSupported>;
  static constexpr local_l1_cache_supported_t local_l1_cache_supported{};

  // Maximum amount of shared memory available to a multiprocessor in bytes;
  // this amount is shared by all thread blocks simultaneously resident on a
  // multiprocessor
  using max_shared_memory_per_multiprocessor_t = detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor>;
  static constexpr max_shared_memory_per_multiprocessor_t max_shared_memory_per_multiprocessor{};

  // Maximum number of 32-bit registers available to a multiprocessor; this
  // number is shared by all thread blocks simultaneously resident on a
  // multiprocessor
  using max_registers_per_multiprocessor_t = detail::__dev_attr<::cudaDevAttrMaxRegistersPerMultiprocessor>;
  static constexpr max_registers_per_multiprocessor_t max_registers_per_multiprocessor{};

  // true if device supports allocating managed memory, false if not
  using managed_memory_t = detail::__dev_attr<::cudaDevAttrManagedMemory>;
  static constexpr managed_memory_t managed_memory{};

  // true if device is on a multi-GPU board, false if not
  using is_multi_gpu_board_t = detail::__dev_attr<::cudaDevAttrIsMultiGpuBoard>;
  static constexpr is_multi_gpu_board_t is_multi_gpu_board{};

  // Unique identifier for a group of devices on the same multi-GPU board
  using multi_gpu_board_group_id_t = detail::__dev_attr<::cudaDevAttrMultiGpuBoardGroupID>;
  static constexpr multi_gpu_board_group_id_t multi_gpu_board_group_id{};

  // true if the link between the device and the host supports native atomic
  // operations
  using host_native_atomic_supported_t = detail::__dev_attr<::cudaDevAttrHostNativeAtomicSupported>;
  static constexpr host_native_atomic_supported_t host_native_atomic_supported{};

  // Ratio of single precision performance (in floating-point operations per
  // second) to double precision performance
  using single_to_double_precision_perf_ratio_t = detail::__dev_attr<::cudaDevAttrSingleToDoublePrecisionPerfRatio>;
  static constexpr single_to_double_precision_perf_ratio_t single_to_double_precision_perf_ratio{};

  // true if the device supports coherently accessing pageable memory without
  // calling cudaHostRegister on it, and false otherwise
  using pageable_memory_access_t = detail::__dev_attr<::cudaDevAttrPageableMemoryAccess>;
  static constexpr pageable_memory_access_t pageable_memory_access{};

  // true if the device can coherently access managed memory concurrently with
  // the CPU, and false otherwise
  using concurrent_managed_access_t = detail::__dev_attr<::cudaDevAttrConcurrentManagedAccess>;
  static constexpr concurrent_managed_access_t concurrent_managed_access{};

  // true if the device supports Compute Preemption, false if not
  using compute_preemption_supported_t = detail::__dev_attr<::cudaDevAttrComputePreemptionSupported>;
  static constexpr compute_preemption_supported_t compute_preemption_supported{};

  // true if the device can access host registered memory at the same virtual
  // address as the CPU, and false otherwise
  using can_use_host_pointer_for_registered_mem_t = detail::__dev_attr<::cudaDevAttrCanUseHostPointerForRegisteredMem>;
  static constexpr can_use_host_pointer_for_registered_mem_t can_use_host_pointer_for_registered_mem{};

  // true if the device supports launching cooperative kernels via
  // cudaLaunchCooperativeKernel, and false otherwise
  using cooperative_launch_t = detail::__dev_attr<::cudaDevAttrCooperativeLaunch>;
  static constexpr cooperative_launch_t cooperative_launch{};

  // true if the device supports launching cooperative kernels via
  // cudaLaunchCooperativeKernelMultiDevice, and false otherwise
  using cooperative_multi_device_launch_t = detail::__dev_attr<::cudaDevAttrCooperativeMultiDeviceLaunch>;
  static constexpr cooperative_multi_device_launch_t cooperative_multi_device_launch{};

  // true if the device supports flushing of outstanding remote writes, and
  // false otherwise
  using can_flush_remote_writes_t = detail::__dev_attr<::cudaDevAttrCanFlushRemoteWrites>;
  static constexpr can_flush_remote_writes_t can_flush_remote_writes{};

  // true if the device supports host memory registration via cudaHostRegister,
  // and false otherwise
  using host_register_supported_t = detail::__dev_attr<::cudaDevAttrHostRegisterSupported>;
  static constexpr host_register_supported_t host_register_supported{};

  // true if the device accesses pageable memory via the host's page tables, and
  // false otherwise
  using pageable_memory_access_uses_host_page_tables_t =
    detail::__dev_attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables>;
  static constexpr pageable_memory_access_uses_host_page_tables_t pageable_memory_access_uses_host_page_tables{};

  // true if the host can directly access managed memory on the device without
  // migration, and false otherwise
  using direct_managed_mem_access_from_host_t = detail::__dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost>;
  static constexpr direct_managed_mem_access_from_host_t direct_managed_mem_access_from_host{};

  // Maximum per block shared memory size on the device. This value can be opted
  // into when using dynamic_shared_memory with NonPortableSize set to true
  using max_shared_memory_per_block_optin_t = detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin>;
  static constexpr max_shared_memory_per_block_optin_t max_shared_memory_per_block_optin{};

  // Maximum number of thread blocks that can reside on a multiprocessor
  using max_blocks_per_multiprocessor_t = detail::__dev_attr<::cudaDevAttrMaxBlocksPerMultiprocessor>;
  static constexpr max_blocks_per_multiprocessor_t max_blocks_per_multiprocessor{};

  // Maximum L2 persisting lines capacity setting in bytes
  using max_persisting_l2_cache_size_t = detail::__dev_attr<::cudaDevAttrMaxPersistingL2CacheSize>;
  static constexpr max_persisting_l2_cache_size_t max_persisting_l2_cache_size{};

  // Maximum value of cudaAccessPolicyWindow::num_bytes
  using max_access_policy_window_size_t = detail::__dev_attr<::cudaDevAttrMaxAccessPolicyWindowSize>;
  static constexpr max_access_policy_window_size_t max_access_policy_window_size{};

  // Shared memory reserved by CUDA driver per block in bytes
  using reserved_shared_memory_per_block_t = detail::__dev_attr<::cudaDevAttrReservedSharedMemoryPerBlock>;
  static constexpr reserved_shared_memory_per_block_t reserved_shared_memory_per_block{};

  // true if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.
  using sparse_cuda_array_supported_t = detail::__dev_attr<::cudaDevAttrSparseCudaArraySupported>;
  static constexpr sparse_cuda_array_supported_t sparse_cuda_array_supported{};

  // Device supports using the cudaHostRegister flag cudaHostRegisterReadOnly to
  // register memory that must be mapped as read-only to the GPU
  using host_register_read_only_supported_t = detail::__dev_attr<::cudaDevAttrHostRegisterReadOnlySupported>;
  static constexpr host_register_read_only_supported_t host_register_read_only_supported{};

  // true if the device supports using the cudaMallocAsync and cudaMemPool
  // family of APIs, and false otherwise
  using memory_pools_supported_t = detail::__dev_attr<::cudaDevAttrMemoryPoolsSupported>;
  static constexpr memory_pools_supported_t memory_pools_supported{};

  // true if the device supports GPUDirect RDMA APIs, and false otherwise
  using gpu_direct_rdma_supported_t = detail::__dev_attr<::cudaDevAttrGPUDirectRDMASupported>;
  static constexpr gpu_direct_rdma_supported_t gpu_direct_rdma_supported{};

  // bitmask to be interpreted according to the
  // cudaFlushGPUDirectRDMAWritesOptions enum
  using gpu_direct_rdma_flush_writes_options_t = detail::__dev_attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions>;
  static constexpr gpu_direct_rdma_flush_writes_options_t gpu_direct_rdma_flush_writes_options{};

  // see the cudaGPUDirectRDMAWritesOrdering enum for numerical values
  using gpu_direct_rdma_writes_ordering_t = detail::__dev_attr<::cudaDevAttrGPUDirectRDMAWritesOrdering>;
  static constexpr gpu_direct_rdma_writes_ordering_t gpu_direct_rdma_writes_ordering{};

  // Bitmask of handle types supported with mempool based IPC
  using memory_pool_supported_handle_types_t = detail::__dev_attr<::cudaDevAttrMemoryPoolSupportedHandleTypes>;
  static constexpr memory_pool_supported_handle_types_t memory_pool_supported_handle_types{};

  // true if the device supports deferred mapping CUDA arrays and CUDA mipmapped
  // arrays.
  using deferred_mapping_cuda_array_supported_t = detail::__dev_attr<::cudaDevAttrDeferredMappingCudaArraySupported>;
  static constexpr deferred_mapping_cuda_array_supported_t deferred_mapping_cuda_array_supported{};

  // true if the device supports IPC Events, false otherwise.
  using ipc_event_support_t = detail::__dev_attr<::cudaDevAttrIpcEventSupport>;
  static constexpr ipc_event_support_t ipc_event_support{};

#if CUDART_VERSION >= 12020

  // NUMA configuration of a device: value is of type cudaDeviceNumaConfig enum
  using numa_config_t = detail::__dev_attr<::cudaDevAttrNumaConfig>;
  static constexpr numa_config_t numa_config{};

  // NUMA node ID of the GPU memory
  using numa_id_t = detail::__dev_attr<::cudaDevAttrNumaId>;
  static constexpr numa_id_t numa_id{};

#endif // CUDART_VERSION >= 12020

  // Combines major and minor compute capability in a 100 * major + 10 * minor format, allows to query full compute
  // capability in a single query
  struct compute_capability_t
  {
    _CCCL_NODISCARD int operator()(device_ref __dev_id) const
    {
      return 100 * compute_capability_major(__dev_id) + 10 * compute_capability_minor(__dev_id);
    }
  };
  static constexpr compute_capability_t compute_capability{};
};

} // namespace detail

} // namespace cuda::experimental

#endif // _CUDAX__DEVICE_ATTRIBUTES_