CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
device/tensor_compare.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Defines host-side functions that launch device kernels to compare two blocks of device
    memory for elementwise equality.
*/

#pragma once
// Standard Library includes
#include <stdexcept>
#include <utility>

// Cutlass includes
#include "cutlass/cutlass.h"
#include "cutlass/relatively_equal.h"

#include "cutlass/util/distribution.h"

#include "tensor_foreach.h"

namespace cutlass {
namespace reference {
namespace device {

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace kernel {

/// Device kernel that clears *equal if any pair of corresponding elements differs.
template <typename Element>
__global__ void BlockCompareEqual(
  int *equal,
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity) {

  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;

  for (; idx < capacity; idx += gridDim.x * blockDim.x) {
    if (ptr_A[idx] != ptr_B[idx]) {
      *equal = 0;
      return;
    }
  }
}

/// Device kernel that clears *equal if any pair of corresponding elements is not relatively
/// equal within the given epsilon and nonzero floor.
template <typename Element>
__global__ void BlockCompareRelativelyEqual(
  int *equal,
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity,
  Element epsilon,
  Element nonzero_floor) {

  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;

  for (; idx < capacity; idx += gridDim.x * blockDim.x) {

    Element a = ptr_A[idx];
    Element b = ptr_B[idx];

    if (!relatively_equal(a, b, epsilon, nonzero_floor)) {
      *equal = 0;
      return;
    }
  }
}

} // namespace kernel

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Performs an elementwise equality check between two blocks of device memory; returns true if
/// every element of ptr_A matches the corresponding element of ptr_B.
template <typename Element>
bool BlockCompareEqual(
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity,
  int grid_size = 0,
  int block_size = 0) {

  int equal_flag = 1;
  int *device_equal_flag = nullptr;

  if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
    throw std::runtime_error("Failed to allocate device flag.");
  }

  if (cudaMemcpy(
      device_equal_flag,
      &equal_flag,
      sizeof(int),
      cudaMemcpyHostToDevice) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag to device.");
  }

  if (!grid_size || !block_size) {

    // If grid_size or block_size are zero, query occupancy using the CUDA Occupancy API.
    cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
      &grid_size,
      &block_size,
      reinterpret_cast<void const *>(kernel::BlockCompareEqual<Element>));

    if (result != cudaSuccess) {
      cudaFree(device_equal_flag);
      throw std::runtime_error("Failed to query occupancy.");
    }

    // Limit block size. This has the effect of increasing the number of items processed by a
    // single thread and reduces the impact of initialization overhead.
    block_size = (block_size < 128 ? block_size : 128);
  }

  dim3 grid(grid_size, 1, 1);
  dim3 block(block_size, 1, 1);

  kernel::BlockCompareEqual<Element><<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity);

  if (cudaMemcpy(
      &equal_flag,
      device_equal_flag,
      sizeof(int),
      cudaMemcpyDeviceToHost) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag from device.");
  }

  cudaFree(device_equal_flag);

  return equal_flag;
}

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Performs an elementwise relative-equality check between two blocks of device memory; returns
/// true if every pair of elements is relatively equal within the given epsilon and nonzero floor.
template <typename Element>
bool BlockCompareRelativelyEqual(
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity,
  Element epsilon,
  Element nonzero_floor,
  int grid_size = 0,
  int block_size = 0) {

  int equal_flag = 1;
  int *device_equal_flag = nullptr;

  if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
    throw std::runtime_error("Failed to allocate device flag.");
  }

  if (cudaMemcpy(
      device_equal_flag,
      &equal_flag,
      sizeof(int),
      cudaMemcpyHostToDevice) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag to device.");
  }

  if (!grid_size || !block_size) {

    // If grid_size or block_size are zero, query occupancy using the CUDA Occupancy API.
    cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
      &grid_size,
      &block_size,
      reinterpret_cast<void const *>(kernel::BlockCompareRelativelyEqual<Element>));

    if (result != cudaSuccess) {
      cudaFree(device_equal_flag);
      throw std::runtime_error("Failed to query occupancy.");
    }

    // Limit block size. This has the effect of increasing the number of items processed by a
    // single thread and reduces the impact of initialization overhead.
    block_size = (block_size < 128 ? block_size : 128);
  }

  dim3 grid(grid_size, 1, 1);
  dim3 block(block_size, 1, 1);

  kernel::BlockCompareRelativelyEqual<Element><<< grid, block >>>(
    device_equal_flag,
    ptr_A,
    ptr_B,
    capacity,
    epsilon,
    nonzero_floor
  );

  if (cudaMemcpy(
      &equal_flag,
      device_equal_flag,
      sizeof(int),
      cudaMemcpyDeviceToHost) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag from device.");
  }

  cudaFree(device_equal_flag);

  return equal_flag;
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // device
} // reference
} // cutlass
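
As a usage illustration (not part of the header), here is a minimal sketch of how the two host-side entry points might be called on a pair of device buffers. It assumes the header is reachable on the include path as "cutlass/util/reference/device/tensor_compare.h", that the file is compiled with nvcc, and that the buffer size and tolerance values are arbitrary choices for the example.

// example_tensor_compare.cu -- hypothetical usage sketch, not part of the CUTLASS sources.
#include <iostream>
#include <vector>

#include "cutlass/util/reference/device/tensor_compare.h"  // assumed include path

int main() {
  size_t const capacity = 1 << 20;

  // Fill two identical host buffers and copy them to the device.
  std::vector<float> host_data(capacity, 1.0f);

  float *ptr_A = nullptr;
  float *ptr_B = nullptr;
  cudaMalloc(&ptr_A, capacity * sizeof(float));
  cudaMalloc(&ptr_B, capacity * sizeof(float));

  cudaMemcpy(ptr_A, host_data.data(), capacity * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(ptr_B, host_data.data(), capacity * sizeof(float), cudaMemcpyHostToDevice);

  // Bitwise comparison: true only if every element matches exactly.
  bool exact = cutlass::reference::device::BlockCompareEqual<float>(ptr_A, ptr_B, capacity);

  // Relative comparison: tolerates small differences controlled by epsilon and nonzero_floor
  // (the tolerance values below are arbitrary examples).
  bool approx = cutlass::reference::device::BlockCompareRelativelyEqual<float>(
      ptr_A, ptr_B, capacity, /*epsilon=*/1e-5f, /*nonzero_floor=*/1e-7f);

  std::cout << "exact: " << exact << ", approx: " << approx << std::endl;

  cudaFree(ptr_A);
  cudaFree(ptr_B);

  return 0;
}

Both entry points use cudaOccupancyMaxPotentialBlockSize to pick a launch configuration when grid_size and block_size are left at zero, so the caller normally does not need to supply them.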