util_device.cuh

File members: /home/runner/work/cccl/cccl/cub/cub/util_device.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2020, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cub/detail/device_synchronize.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_type.cuh>
// for backward compatibility
#include <cub/util_temporary_storage.cuh>

#include <cuda/std/type_traits>
#include <cuda/std/utility>

#include <array>
#include <atomic>
#include <cassert>

#include <nv/target>

CUB_NAMESPACE_BEGIN

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

namespace detail
{
template <typename PolicyT, int BLOCK_THREADS_, int ITEMS_PER_THREAD_ = PolicyT::ITEMS_PER_THREAD>
struct policy_wrapper_t : PolicyT
{
  static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_;
  static constexpr int BLOCK_THREADS    = BLOCK_THREADS_;
  static constexpr int ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD;
};
} // namespace detail

template <typename T>
CUB_DETAIL_KERNEL_ATTRIBUTES void EmptyKernel()
{}

#endif // DOXYGEN_SHOULD_SKIP_THIS

CUB_RUNTIME_FUNCTION inline int CurrentDevice()
{
  int device = -1;
  if (CubDebug(cudaGetDevice(&device)))
  {
    return -1;
  }
  return device;
}

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

struct SwitchDevice
{
private:
  int const old_device;
  bool const needs_reset;

public:
  _CCCL_HOST inline SwitchDevice(int new_device)
      : old_device(CurrentDevice())
      , needs_reset(old_device != new_device)
  {
    if (needs_reset)
    {
      CubDebug(cudaSetDevice(new_device));
    }
  }

  _CCCL_HOST inline ~SwitchDevice()
  {
    if (needs_reset)
    {
      CubDebug(cudaSetDevice(old_device));
    }
  }
};
#endif // DOXYGEN_SHOULD_SKIP_THIS

CUB_RUNTIME_FUNCTION inline int DeviceCountUncached()
{
  int count = -1;
  if (CubDebug(cudaGetDeviceCount(&count)))
  {
    // CUDA makes no guarantees about the state of the output parameter if
    // `cudaGetDeviceCount` fails; in practice, they don't, but out of
    // paranoia we'll reset `count` to `-1`.
    count = -1;
  }
  return count;
}

template <typename T, T (*Function)()>
struct ValueCache
{
  T const value;

  _CCCL_HOST inline ValueCache()
      : value(Function())
  {}
};

// Host code, only safely usable in C++11 or newer, where thread-safe
// initialization of static locals is guaranteed.  This is a separate function
// to avoid defining a local static in a host/device function.
_CCCL_HOST inline int DeviceCountCachedValue()
{
  static ValueCache<int, DeviceCountUncached> cache;
  return cache.value;
}

CUB_RUNTIME_FUNCTION inline int DeviceCount()
{
  int result = -1;

  NV_IF_TARGET(NV_IS_HOST, (result = DeviceCountCachedValue();), (result = DeviceCountUncached();));

  return result;
}

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

struct PerDeviceAttributeCache
{
  struct DevicePayload
  {
    int attribute;
    cudaError_t error;
  };

  // Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the
  // `DeviceEntryInitializing` state, and then proceeds to the
  // `DeviceEntryReady` state. These are the only state transitions allowed;
  // e.g. a linear sequence of transitions.
  enum DeviceEntryStatus
  {
    DeviceEntryEmpty = 0,
    DeviceEntryInitializing,
    DeviceEntryReady
  };

  struct DeviceEntry
  {
    std::atomic<DeviceEntryStatus> flag;
    DevicePayload payload;
  };

private:
  std::array<DeviceEntry, CUB_MAX_DEVICES> entries_;

public:
  _CCCL_HOST inline PerDeviceAttributeCache()
      : entries_()
  {
    assert(DeviceCount() <= CUB_MAX_DEVICES);
  }

  template <typename Invocable>
  _CCCL_HOST DevicePayload operator()(Invocable&& f, int device)
  {
    if (device >= DeviceCount() || device < 0)
    {
      return DevicePayload{0, cudaErrorInvalidDevice};
    }

    auto& entry   = entries_[device];
    auto& flag    = entry.flag;
    auto& payload = entry.payload;

    DeviceEntryStatus old_status = DeviceEntryEmpty;

    // First, check for the common case of the entry being ready.
    if (flag.load(std::memory_order_acquire) != DeviceEntryReady)
    {
      // Assume the entry is empty and attempt to lock it so we can fill
      // it by trying to set the state from `DeviceEntryReady` to
      // `DeviceEntryInitializing`.
      if (flag.compare_exchange_strong(
            old_status, DeviceEntryInitializing, std::memory_order_acq_rel, std::memory_order_acquire))
      {
        // We successfully set the state to `DeviceEntryInitializing`;
        // we have the lock and it's our job to initialize this entry
        // and then release it.

        // We don't use `CubDebug` here because we let the user code
        // decide whether or not errors are hard errors.
        payload.error = ::cuda::std::forward<Invocable>(f)(payload.attribute);
        if (payload.error)
        {
          // Clear the global CUDA error state which may have been
          // set by the last call. Otherwise, errors may "leak" to
          // unrelated kernel launches.
          cudaGetLastError();
        }

        // Release the lock by setting the state to `DeviceEntryReady`.
        flag.store(DeviceEntryReady, std::memory_order_release);
      }

      // If the `compare_exchange_weak` failed, then `old_status` has
      // been updated with the value of `flag` that it observed.

      else if (old_status == DeviceEntryInitializing)
      {
        // Another execution agent is initializing this entry; we need
        // to wait for them to finish; we'll know they're done when we
        // observe the entry status as `DeviceEntryReady`.
        do
        {
          old_status = flag.load(std::memory_order_acquire);
        } while (old_status != DeviceEntryReady);
        // FIXME: Use `atomic::wait` instead when we have access to
        // host-side C++20 atomics. We could use libcu++, but it only
        // supports atomics for SM60 and up, even if you're only using
        // them in host code.
      }
    }

    // We now know that the state of our entry is `DeviceEntryReady`, so
    // just return the entry's payload.
    return entry.payload;
  }
};
#endif // DOXYGEN_SHOULD_SKIP_THIS

CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version)
{
  // Instantiate `EmptyKernel<void>` in both host and device code to ensure
  // it can be called.
  using EmptyKernelPtr        = void (*)();
  EmptyKernelPtr empty_kernel = EmptyKernel<void>;

  // This is necessary for unused variable warnings in host compilers. The
  // usual syntax of (void)empty_kernel; was not sufficient on MSVC2015.
  (void) reinterpret_cast<void*>(empty_kernel);

  // Define a temporary macro that expands to the current target ptx version
  // in device code.
  // <nv/target> may provide an abstraction for this eventually. For now,
  // we have to keep this usage of __CUDA_ARCH__.
#if defined(_NVHPC_CUDA)
#  define CUB_TEMP_GET_PTX __builtin_current_device_sm()
#else
#  define CUB_TEMP_GET_PTX __CUDA_ARCH__
#endif

  cudaError_t result = cudaSuccess;
  NV_IF_TARGET(
    NV_IS_HOST,
    (cudaFuncAttributes empty_kernel_attrs;

     result = CubDebug(cudaFuncGetAttributes(&empty_kernel_attrs, reinterpret_cast<void*>(empty_kernel)));

     ptx_version = empty_kernel_attrs.ptxVersion * 10;),
    // NV_IS_DEVICE
    (
      // This is necessary to ensure instantiation of EmptyKernel in device
      // code. The `reinterpret_cast` is necessary to suppress a
      // set-but-unused warnings. This is a meme now:
      // https://twitter.com/blelbach/status/1222391615576100864
      (void) reinterpret_cast<EmptyKernelPtr>(empty_kernel);

      ptx_version = CUB_TEMP_GET_PTX;));

#undef CUB_TEMP_GET_PTX

  return result;
}

_CCCL_HOST inline cudaError_t PtxVersionUncached(int& ptx_version, int device)
{
  SwitchDevice sd(device);
  (void) sd;
  return PtxVersionUncached(ptx_version);
}

template <typename Tag>
_CCCL_HOST inline PerDeviceAttributeCache& GetPerDeviceAttributeCache()
{
  // C++11 guarantees that initialization of static locals is thread safe.
  static PerDeviceAttributeCache cache;
  return cache;
}

struct PtxVersionCacheTag
{};
struct SmVersionCacheTag
{};

_CCCL_HOST inline cudaError_t PtxVersion(int& ptx_version, int device)
{
  auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
    // If this call fails, then we get the error code back in the payload,
    // which we check with `CubDebug` below.
    [=](int& pv) {
      return PtxVersionUncached(pv, device);
    },
    device);

  if (!CubDebug(payload.error))
  {
    ptx_version = payload.attribute;
  }

  return payload.error;
}

CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int& ptx_version)
{
  cudaError_t result = cudaErrorUnknown;
  NV_IF_TARGET(
    NV_IS_HOST,
    (auto const device  = CurrentDevice();
     auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
       // If this call fails, then we get the error code back in the payload,
       // which we check with `CubDebug` below.
       [=](int& pv) {
         return PtxVersionUncached(pv, device);
       },
       device);

     if (!CubDebug(payload.error)) { ptx_version = payload.attribute; }

     result = payload.error;),
    ( // NV_IS_DEVICE:
      result = PtxVersionUncached(ptx_version);));

  return result;
}

CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice())
{
  cudaError_t error = cudaSuccess;
  do
  {
    int major = 0, minor = 0;
    error = CubDebug(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
    if (cudaSuccess != error)
    {
      break;
    }

    error = CubDebug(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
    if (cudaSuccess != error)
    {
      break;
    }
    sm_version = major * 100 + minor * 10;
  } while (0);

  return error;
}

CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = CurrentDevice())
{
  cudaError_t result = cudaErrorUnknown;

  NV_IF_TARGET(
    NV_IS_HOST,
    (auto const payload = GetPerDeviceAttributeCache<SmVersionCacheTag>()(
       // If this call fails, then we get the error code back in
       // the payload, which we check with `CubDebug` below.
       [=](int& pv) {
         return SmVersionUncached(pv, device);
       },
       device);

     if (!CubDebug(payload.error)) { sm_version = payload.attribute; };

     result = payload.error;),
    ( // NV_IS_DEVICE
      result = SmVersionUncached(sm_version, device);));

  return result;
}

CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream)
{
  cudaError_t result = cudaErrorNotSupported;

  NV_IF_TARGET(NV_IS_HOST,
               (result = CubDebug(cudaStreamSynchronize(stream));),
               ((void) stream; result = CubDebug(cub::detail::device_synchronize());));

  return result;
}

namespace detail
{

CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream)
{
#ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC

  (void) stream;
  return cudaSuccess;

#else // CUB_DETAIL_DEBUG_ENABLE_SYNC:

#  define CUB_TMP_SYNC_AVAILABLE         \
    _CubLog("%s\n", "Synchronizing..."); \
    return SyncStream(stream)

#  define CUB_TMP_DEVICE_SYNC_UNAVAILABLE                                        \
    (void) stream;                                                               \
    _CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \
            "device-side sync requires <sm_90, RDC, and CDPv1");                 \
    return cudaSuccess

#  ifdef CUB_DETAIL_CDPv1

  // Can sync everywhere but SM_90+
  NV_IF_TARGET(NV_PROVIDES_SM_90, (CUB_TMP_DEVICE_SYNC_UNAVAILABLE;), (CUB_TMP_SYNC_AVAILABLE;));

#  else // CDPv2 or no CDP:

  // Can only sync on host
  NV_IF_TARGET(NV_IS_HOST, (CUB_TMP_SYNC_AVAILABLE;), (CUB_TMP_DEVICE_SYNC_UNAVAILABLE;));

#  endif // CDP version

#  undef CUB_TMP_DEVICE_SYNC_UNAVAILABLE
#  undef CUB_TMP_SYNC_AVAILABLE

#endif // CUB_DETAIL_DEBUG_ENABLE_SYNC
}

CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva)
{
  has_uva           = false;
  cudaError_t error = cudaSuccess;
  int device        = -1;
  error             = CubDebug(cudaGetDevice(&device));
  if (cudaSuccess != error)
  {
    return error;
  }

  int uva = 0;
  error   = CubDebug(cudaDeviceGetAttribute(&uva, cudaDevAttrUnifiedAddressing, device));
  if (cudaSuccess != error)
  {
    return error;
  }
  has_uva = uva == 1;
  return error;
}

} // namespace detail

template <typename KernelPtr>
_CCCL_VISIBILITY_HIDDEN CUB_RUNTIME_FUNCTION inline cudaError_t
MaxSmOccupancy(int& max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes = 0)
{
  return CubDebug(
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_sm_occupancy, kernel_ptr, block_threads, dynamic_smem_bytes));
}

/******************************************************************************
 * Policy management
 ******************************************************************************/

struct KernelConfig
{
  int block_threads;
  int items_per_thread;
  int tile_size;
  int sm_occupancy;

  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE KernelConfig()
      : block_threads(0)
      , items_per_thread(0)
      , tile_size(0)
      , sm_occupancy(0)
  {}

  template <typename AgentPolicyT, typename KernelPtrT>
  CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t Init(KernelPtrT kernel_ptr)
  {
    block_threads      = AgentPolicyT::BLOCK_THREADS;
    items_per_thread   = AgentPolicyT::ITEMS_PER_THREAD;
    tile_size          = block_threads * items_per_thread;
    cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
    return retval;
  }
};

template <int PolicyPtxVersion, typename PolicyT, typename PrevPolicyT>
struct ChainedPolicy
{
  using ActivePolicy =
    cub::detail::conditional_t<(CUB_PTX_ARCH < PolicyPtxVersion), typename PrevPolicyT::ActivePolicy, PolicyT>;

  template <typename FunctorT>
  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int device_ptx_version, FunctorT& op)
  {
    if (device_ptx_version < PolicyPtxVersion)
    {
      return PrevPolicyT::Invoke(device_ptx_version, op);
    }
    return op.template Invoke<PolicyT>();
  }
};

template <int PTX_VERSION, typename PolicyT>
struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
{
  using ActivePolicy = PolicyT;

  template <typename FunctorT>
  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op)
  {
    return op.template Invoke<PolicyT>();
  }
};

CUB_NAMESPACE_END