include/cuda/experimental/__stf/stream/reduction.cuh
File members: include/cuda/experimental/__stf/stream/reduction.cuh
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
#pragma once
#include <cuda/__cccl_config>
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header
#include <cuda/experimental/__stf/internal/logical_data.cuh>
#include <cuda/experimental/__stf/stream/internal/event_types.cuh>
namespace cuda::experimental::stf
{
/**
 * @brief Type-erased base class for reduction operators that run on a CUDA stream.
 *
 * Concrete operators implement `stream_redux_op` and `stream_init_op`. The
 * `op_untyped` / `init_op_untyped` overrides defined here obtain a stream from
 * the destination data place, wrap the user-provided call in a
 * `stream_async_op` created from the incoming prerequisites, and finally
 * overwrite `prereqs` with whatever `stream_async_op::end` returns.
 */
class stream_reduction_operator_untyped : public reduction_operator_base
{
public:
  /// @param is_commutative forwarded unchanged to `reduction_operator_base`.
  stream_reduction_operator_untyped(bool is_commutative = true)
      : reduction_operator_base(is_commutative)
  {}

  /**
   * @brief Reduction step, to be implemented by derived classes.
   *
   * @param d                 logical data whose instances are involved
   * @param inout_memory_node data place of the in/out instance
   * @param inout_instance_id identifier of the in/out instance
   * @param in_memory_node    data place of the input instance
   * @param in_instance_id    identifier of the input instance
   * @param e                 execution place handed over by the caller
   * @param s                 CUDA stream handed over by the caller
   */
  virtual void stream_redux_op(
    logical_data_untyped& d,
    const data_place& inout_memory_node,
    instance_id_t inout_instance_id,
    const data_place& in_memory_node,
    instance_id_t in_instance_id,
    const exec_place& e,
    cudaStream_t s) = 0;

  /**
   * @brief Initialization step, to be implemented by derived classes.
   *
   * @param d               logical data whose instance is involved
   * @param out_memory_node data place of the output instance
   * @param out_instance_id identifier of the output instance
   * @param e               execution place handed over by the caller
   * @param s               CUDA stream handed over by the caller
   */
  virtual void stream_init_op(
    logical_data_untyped& d,
    const data_place& out_memory_node,
    instance_id_t out_instance_id,
    const exec_place& e,
    cudaStream_t s) = 0;

#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen has issues with this code
  /**
   * @brief Event-list based entry point: runs `stream_redux_op` on a stream
   * taken from `inout_memory_node`.
   *
   * @param prereqs in/out: consumed when the async operation is created, then
   *                replaced by the result of `stream_async_op::end`.
   */
  void op_untyped(
    logical_data_untyped& d,
    const data_place& inout_memory_node,
    instance_id_t inout_instance_id,
    const data_place& in_memory_node,
    instance_id_t in_instance_id,
    const exec_place& ep,
    event_list& prereqs) override
  {
    // Forwarding reference: binds to whatever get_ctx() yields without changing
    // its constness or value category as seen by the callees below.
    auto&& ctx = d.get_ctx();

    // Stream obtained from the data place holding the in/out instance.
    auto place_stream = inout_memory_node.getDataStream(ctx.async_resources());
    auto reduce_step  = stream_async_op(ctx, place_stream, prereqs);

    if (ctx.generate_event_symbols())
    {
      reduce_step.set_symbol("redux op " + d.get_symbol());
    }

#  ifdef REDUCTION_DEBUG
    // NOTE(review): "%d" assumes instance_id_t is passed as an int - confirm against its declaration.
    fprintf(stderr, "stream_redux_op inout %d in %d\n", inout_instance_id, in_instance_id);
#  endif

    stream_redux_op(d, inout_memory_node, inout_instance_id, in_memory_node, in_instance_id, ep, place_stream.stream);

    // Hand the resulting events back to the caller through the in/out argument.
    prereqs = reduce_step.end(ctx);
  }

  /**
   * @brief Event-list based entry point: runs `stream_init_op` on a stream
   * taken from `out_memory_node`.
   *
   * @param prereqs in/out: consumed when the async operation is created, then
   *                replaced by the result of `stream_async_op::end`.
   */
  void init_op_untyped(logical_data_untyped& d,
                       const data_place& out_memory_node,
                       instance_id_t out_instance_id,
                       const exec_place& ep,
                       event_list& prereqs) override
  {
    // Forwarding reference: binds to whatever get_ctx() yields without changing
    // its constness or value category as seen by the callees below.
    auto&& ctx = d.get_ctx();

    // Stream obtained from the data place holding the output instance.
    auto place_stream = out_memory_node.getDataStream(ctx.async_resources());
    auto init_step    = stream_async_op(ctx, place_stream, prereqs);

    if (ctx.generate_event_symbols())
    {
      init_step.set_symbol("redux init op " + d.get_symbol());
    }

#  ifdef REDUCTION_DEBUG
    // NOTE(review): "%d" assumes instance_id_t is passed as an int - confirm against its declaration.
    fprintf(stderr, "stream_init_op out %d\n", out_instance_id);
#  endif

    stream_init_op(d, out_memory_node, out_instance_id, ep, place_stream.stream);

    // Hand the resulting events back to the caller through the in/out argument.
    prereqs = init_step.end(ctx);
  }
#endif // DOXYGEN_SHOULD_SKIP_THIS // doxygen has issues with this code
};
/**
 * @brief Typed adapter on top of `stream_reduction_operator_untyped`.
 *
 * Implements the untyped `stream_redux_op` / `stream_init_op` hooks by looking
 * up the `T` instances of the logical data (`d.instance<T>(id)`) and forwarding
 * them to the typed `op` / `init_op` virtuals that concrete operators provide.
 *
 * All members are declared without an access specifier and are therefore
 * private; derived classes can still override them, and the hooks remain
 * reachable through the base-class interface.
 *
 * @tparam T type of the data instances handed to `op` and `init_op`
 */
template <typename T>
class stream_reduction_operator : public stream_reduction_operator_untyped
{
  /**
   * @brief Typed reduction step supplied by the concrete operator.
   *
   * @param in    instance looked up with the input instance id (read-only)
   * @param inout instance looked up with the in/out instance id
   * @param e     execution place forwarded from the untyped hook
   * @param s     CUDA stream forwarded from the untyped hook
   */
  virtual void op(const T& in, T& inout, const exec_place& e, cudaStream_t s) = 0;

  /**
   * @brief Typed initialization step supplied by the concrete operator.
   *
   * @param out instance looked up with the output instance id
   * @param e   execution place forwarded from the untyped hook
   * @param s   CUDA stream forwarded from the untyped hook
   */
  virtual void init_op(T& out, const exec_place& e, cudaStream_t s) = 0;

  // `override` makes the compiler verify that this signature matches the pure
  // virtual in stream_reduction_operator_untyped; without it a mismatch would
  // silently introduce an unrelated overload and leave the class abstract.
  void stream_redux_op(
    logical_data_untyped& d,
    const data_place& /*inout_memory_node*/,
    instance_id_t inout_instance_id,
    const data_place& /*in_memory_node*/,
    instance_id_t in_instance_id,
    const exec_place& e,
    cudaStream_t s) override
  {
    // The data places are not needed here: instances are addressed by id only.
    const auto& in_instance = d.instance<T>(in_instance_id);
    auto& inout_instance    = d.instance<T>(inout_instance_id);
    op(in_instance, inout_instance, e, s);
  }

  // See the note on stream_redux_op regarding `override`.
  void stream_init_op(
    logical_data_untyped& d,
    const data_place& /*out_memory_node*/,
    instance_id_t out_instance_id,
    const exec_place& e,
    cudaStream_t s) override
  {
    auto& out_instance = d.instance<T>(out_instance_id);
    init_op(out_instance, e, s);
  }
};
} // end namespace cuda::experimental::stf