/// Device kernel: compares two device buffers element-by-element and clears
/// *equal (to 0) on the first mismatch this thread observes.
///
/// Expected launch: 1-D grid, 1-D blocks; *equal must be initialized to 1 by
/// the host before launch. Multiple threads may store to *equal without
/// synchronization, but every store writes the same value (0), so the race is
/// benign.
template <typename Element>
__global__ void BlockCompareEqual(
  int *equal,                ///< device flag: preset to 1, cleared on mismatch
  Element const *ptr_A,      ///< device pointer to first buffer
  Element const *ptr_B,      ///< device pointer to second buffer
  size_t capacity) {         ///< number of elements to compare

  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;

  // Grid-stride loop: correct for any grid/block configuration.
  for (; idx < capacity; idx += gridDim.x * blockDim.x) {

    if (ptr_A[idx] != ptr_B[idx]) {
      *equal = 0;
      return;
    }
  }
}
/// Device kernel: compares two device buffers with a relative-error tolerance
/// (via relatively_equal, defined elsewhere in CUTLASS) and clears *equal
/// (to 0) on the first element this thread finds outside tolerance.
///
/// Expected launch: 1-D grid, 1-D blocks; *equal must be initialized to 1 by
/// the host before launch. Concurrent stores to *equal all write 0, so the
/// unsynchronized write is benign.
template <typename Element>
__global__ void BlockCompareRelativelyEqual(
  int *equal,                ///< device flag: preset to 1, cleared on mismatch
  Element const *ptr_A,      ///< device pointer to first buffer
  Element const *ptr_B,      ///< device pointer to second buffer
  size_t capacity,           ///< number of elements to compare
  Element epsilon,           ///< relative-error tolerance
  Element nonzero_floor) {   ///< magnitude below which values are treated as zero

  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;

  // Grid-stride loop: correct for any grid/block configuration.
  for (; idx < capacity; idx += gridDim.x * blockDim.x) {

    Element a = ptr_A[idx];
    Element b = ptr_B[idx];

    if (!relatively_equal(a, b, epsilon, nonzero_floor)) {
      *equal = 0;
      return;
    }
  }
}
/// Host-side wrapper: returns true when two device buffers of `capacity`
/// elements compare equal element-by-element (Element::operator!=).
///
/// Allocates a single int flag on the device, preset to 1; the kernel clears
/// it to 0 on any mismatch. When grid_size or block_size is zero, the launch
/// configuration is chosen with the CUDA occupancy API.
///
/// Throws std::runtime_error on any CUDA API failure. The device flag is
/// freed on every path, including the throwing ones.
template <typename Element>
bool BlockCompareEqual(
  Element const *ptr_A,       ///< device pointer to first buffer
  Element const *ptr_B,       ///< device pointer to second buffer
  size_t capacity,            ///< number of elements to compare
  int grid_size = 0,          ///< grid size (0 => query occupancy API)
  int block_size = 0) {       ///< block size (0 => query occupancy API)

  int equal_flag = 1;
  int *device_equal_flag = nullptr;

  if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
    throw std::runtime_error("Failed to allocate device flag.");
  }

  if (cudaMemcpy(
        device_equal_flag,
        &equal_flag,
        sizeof(int),
        cudaMemcpyHostToDevice) != cudaSuccess) {

    cudaFree(device_equal_flag);  // don't leak the flag on failure
    throw std::runtime_error("Failed to copy equality flag to device.");
  }

  if (!grid_size || !block_size) {

    // If either launch parameter is zero, ask the occupancy API for a
    // configuration that maximizes potential occupancy.
    cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
      &grid_size,
      &block_size,
      reinterpret_cast<void const *>(kernel::BlockCompareEqual<Element>));

    if (result != cudaSuccess) {
      cudaFree(device_equal_flag);  // don't leak the flag on failure
      throw std::runtime_error("Failed to query occupancy.");
    }

    // Limit block size: each thread then covers more elements via the
    // grid-stride loop, reducing per-thread initialization overhead.
    block_size = (block_size < 128 ? block_size : 128);
  }

  dim3 grid(grid_size, 1, 1);
  dim3 block(block_size, 1, 1);

  kernel::BlockCompareEqual<Element><<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity);

  // Surface launch-configuration errors instead of silently reading a
  // stale flag.
  if (cudaGetLastError() != cudaSuccess) {
    cudaFree(device_equal_flag);
    throw std::runtime_error("Failed to launch comparison kernel.");
  }

  // This blocking copy also synchronizes with the kernel.
  if (cudaMemcpy(
        &equal_flag,
        device_equal_flag,
        sizeof(int),
        cudaMemcpyDeviceToHost) != cudaSuccess) {

    cudaFree(device_equal_flag);
    throw std::runtime_error("Failed to copy equality flag from device.");
  }

  cudaFree(device_equal_flag);

  return equal_flag != 0;
}
/// Host-side wrapper: returns true when two device buffers of `capacity`
/// elements are relatively equal within `epsilon`, treating magnitudes below
/// `nonzero_floor` as zero (tolerance semantics come from relatively_equal,
/// defined elsewhere in CUTLASS).
///
/// Allocates a single int flag on the device, preset to 1; the kernel clears
/// it to 0 on any out-of-tolerance element. When grid_size or block_size is
/// zero, the launch configuration is chosen with the CUDA occupancy API.
///
/// Throws std::runtime_error on any CUDA API failure. The device flag is
/// freed on every path, including the throwing ones.
template <typename Element>
bool BlockCompareRelativelyEqual(
  Element const *ptr_A,       ///< device pointer to first buffer
  Element const *ptr_B,       ///< device pointer to second buffer
  size_t capacity,            ///< number of elements to compare
  Element epsilon,            ///< relative-error tolerance
  Element nonzero_floor,      ///< magnitude below which values are treated as zero
  int grid_size = 0,          ///< grid size (0 => query occupancy API)
  int block_size = 0) {       ///< block size (0 => query occupancy API)

  int equal_flag = 1;
  int *device_equal_flag = nullptr;

  if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
    throw std::runtime_error("Failed to allocate device flag.");
  }

  if (cudaMemcpy(
        device_equal_flag,
        &equal_flag,
        sizeof(int),
        cudaMemcpyHostToDevice) != cudaSuccess) {

    cudaFree(device_equal_flag);  // don't leak the flag on failure
    throw std::runtime_error("Failed to copy equality flag to device.");
  }

  if (!grid_size || !block_size) {

    // If either launch parameter is zero, ask the occupancy API for a
    // configuration that maximizes potential occupancy.
    cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
      &grid_size,
      &block_size,
      reinterpret_cast<void const *>(kernel::BlockCompareRelativelyEqual<Element>));

    if (result != cudaSuccess) {
      cudaFree(device_equal_flag);  // don't leak the flag on failure
      throw std::runtime_error("Failed to query occupancy.");
    }

    // Limit block size: each thread then covers more elements via the
    // grid-stride loop, reducing per-thread initialization overhead.
    block_size = (block_size < 128 ? block_size : 128);
  }

  dim3 grid(grid_size, 1, 1);
  dim3 block(block_size, 1, 1);

  kernel::BlockCompareRelativelyEqual<Element><<< grid, block >>>(
    device_equal_flag, ptr_A, ptr_B, capacity, epsilon, nonzero_floor);

  // Surface launch-configuration errors instead of silently reading a
  // stale flag.
  if (cudaGetLastError() != cudaSuccess) {
    cudaFree(device_equal_flag);
    throw std::runtime_error("Failed to launch comparison kernel.");
  }

  // This blocking copy also synchronizes with the kernel.
  if (cudaMemcpy(
        &equal_flag,
        device_equal_flag,
        sizeof(int),
        cudaMemcpyDeviceToHost) != cudaSuccess) {

    cudaFree(device_equal_flag);
    throw std::runtime_error("Failed to copy equality flag from device.");
  }

  cudaFree(device_equal_flag);

  return equal_flag != 0;
}
// Cross-references preserved from the documentation listing:
// - aligned_buffer.h:35 (AlignedBuffer definition)
// - kernel::BlockCompareEqual — device/tensor_compare.h:50:
//     __global__ void BlockCompareEqual(int *equal, Element const *ptr_A,
//                                       Element const *ptr_B, size_t capacity)
// - kernel::BlockCompareRelativelyEqual — device/tensor_compare.h:67:
//     __global__ void BlockCompareRelativelyEqual(int *equal, Element const *ptr_A,
//                                                 Element const *ptr_B, size_t capacity,
//                                                 Element epsilon, Element nonzero_floor)
// - CUTLASS_HOST_DEVICE bool relatively_equal(T a, T b, T epsilon, T nonzero_floor)
// This header depends on a class to parametrize a statistical distribution
// function, and on the basic include for CUTLASS.