CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
device_memory.h
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are not permitted.
6  *
7  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
8  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
9  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
10  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
11  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
12  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
13  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
14  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
15  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
16  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
17  *
18  ******************************************************************************/
19 
20 #pragma once
21 
27 #include <memory>
28 
30 #include "cutlass/numeric_types.h"
31 #include "exceptions.h"
32 
33 namespace cutlass {
34 namespace device_memory {
35 
36 /******************************************************************************
37  * Allocation lifetime
38  ******************************************************************************/
39 
41 template <typename T>
42 T* allocate(size_t count = 1) {
43  T* ptr = 0;
44  size_t bytes = sizeof(T) * count;
45 
46  cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes);
47  if (cuda_error != cudaSuccess) {
48  throw cuda_exception("Failed to allocate memory", cuda_error);
49  }
50 
51  return ptr;
52 }
53 
55 template <typename T>
56 void free(T* ptr) {
57  if (ptr) {
58  cudaError_t cuda_error = (cudaFree(ptr));
59  if (cuda_error != cudaSuccess) {
60  throw cuda_exception("Failed to free device memory", cuda_error);
61  }
62  }
63 }
64 
65 /******************************************************************************
66  * Data movement
67  ******************************************************************************/
68 
69 template <typename T>
70 void copy(T* dst, T const* src, size_t count, cudaMemcpyKind kind) {
71  size_t bytes = count * sizeof_bits<T>::value / 8;
72  if (bytes == 0 && count > 0)
73  bytes = 1;
74  cudaError_t cuda_error = (cudaMemcpy(dst, src, bytes, kind));
75  if (cuda_error != cudaSuccess) {
76  throw cuda_exception("cudaMemcpy() failed", cuda_error);
77  }
78 }
79 
80 template <typename T>
81 void copy_to_device(T* dst, T const* src, size_t count = 1) {
82  copy(dst, src, count, cudaMemcpyHostToDevice);
83 }
84 
85 template <typename T>
86 void copy_to_host(T* dst, T const* src, size_t count = 1) {
87  copy(dst, src, count, cudaMemcpyDeviceToHost);
88 }
89 
90 template <typename T>
91 void copy_device_to_device(T* dst, T const* src, size_t count = 1) {
92  copy(dst, src, count, cudaMemcpyDeviceToDevice);
93 }
94 
95 template <typename T>
96 void copy_host_to_host(T* dst, T const* src, size_t count = 1) {
97  copy(dst, src, count, cudaMemcpyHostToHost);
98 }
99 
101 template <typename OutputIterator, typename T>
102 void insert_to_host(OutputIterator begin, OutputIterator end, T const* device_begin) {
103  size_t elements = end - begin;
104  copy_to_host(&*begin, device_begin, elements);
105 }
106 
108 template <typename T, typename InputIterator>
109 void insert_to_device(T* device_begin, InputIterator begin, InputIterator end) {
110  size_t elements = end - begin;
111  copy_to_device(device_begin, &*begin, elements);
112 }
113 
114 /******************************************************************************
115  * "Smart" device memory allocation
116  ******************************************************************************/
117 
119 template <typename T>
120 struct allocation {
122  struct deleter {
123  void operator()(T* ptr) {
124  cudaError_t cuda_error = (cudaFree(ptr));
125  if (cuda_error != cudaSuccess) {
126  // noexcept
127  // throw cuda_exception("cudaFree() failed", cuda_error);
128  return;
129  }
130  }
131  };
132 
133  //
134  // Data members
135  //
136 
138  size_t capacity;
139 
142 
143  //
144  // Methods
145  //
146 
148  allocation() : capacity(0) {}
149 
151  allocation(size_t _capacity) : smart_ptr(allocate<T>(_capacity)), capacity(_capacity) {}
152 
154  allocation(allocation const &p): smart_ptr(allocate<T>(p.capacity)), capacity(p.capacity) {
155  copy_device_to_device(smart_ptr.get(), p.get(), capacity);
156  }
157 
160 
162  T* get() const { return smart_ptr.get(); }
163 
165  T* release() {
166  capacity = 0;
167  return smart_ptr.release();
168  }
169 
171  void reset() {
172  capacity = 0;
173  smart_ptr.reset();
174  }
175 
177  void reset(T* _ptr, size_t _capacity) {
178  smart_ptr.reset(_ptr);
179  capacity = _capacity;
180  }
181 
183  T* operator->() const { return smart_ptr.get(); }
184 
186  deleter& get_deleter() { return smart_ptr.get_deleter(); }
187 
189  const deleter& get_deleter() const { return smart_ptr.get_deleter(); }
190 
193  if (capacity != p.capacity) {
194  smart_ptr.reset(allocate<T>(p.capacity));
195  capacity = p.capacity;
196  }
197  copy_device_to_device(smart_ptr.get(), p.get(), capacity);
198  return *this;
199  }
200 };
201 
202 } // namespace device_memory
203 } // namespace cutlass
Definition: aligned_buffer.h:35
allocation(size_t _capacity)
Constructor: allocates capacity elements on the current CUDA device.
Definition: device_memory.h:151
void insert_to_device(T *device_begin, InputIterator begin, InputIterator end)
Copies elements to device memory from host-side range.
Definition: device_memory.h:109
deleter & get_deleter()
Returns the deleter object which would be used for destruction of the managed object.
Definition: device_memory.h:186
void copy_to_device(T *dst, T const *src, size_t count=1)
Definition: device_memory.h:81
void copy(T *dst, T const *src, size_t count, cudaMemcpyKind kind)
Definition: device_memory.h:70
void operator()(T *ptr)
Definition: device_memory.h:123
T * get() const
Returns a pointer to the managed object.
Definition: device_memory.h:162
void reset()
Deletes the managed object and resets capacity to zero.
Definition: device_memory.h:171
C++ features that may be otherwise unimplemented for CUDA device functions.
Delete functor for CUDA device memory.
Definition: device_memory.h:122
std::unique_ptr
Definition: platform.h:712
T * release()
Releases the ownership of the managed object (without deleting) and resets capacity to zero...
Definition: device_memory.h:165
pointer get() const noexcept
Returns a pointer to the managed object or nullptr if no object is owned.
Definition: platform.h:735
T * allocate(size_t count=1)
Allocate a buffer of count elements of type T on the current CUDA device.
Definition: device_memory.h:42
platform::unique_ptr< T, deleter > smart_ptr
Smart pointer.
Definition: device_memory.h:141
size_t capacity
Number of elements of T allocated on the current CUDA device.
Definition: device_memory.h:138
Defines the size of an element in bits.
Definition: numeric_types.h:42
void copy_host_to_host(T *dst, T const *src, size_t count=1)
Definition: device_memory.h:96
C++ exception semantics for CUDA error codes.
Top-level include for all CUTLASS numeric types.
Deleter & get_deleter() noexcept
Returns the deleter object.
Definition: platform.h:757
T * operator->() const
Returns a pointer to the object owned by *this.
Definition: device_memory.h:183
void reset(pointer p=pointer()) noexcept
Replaces the managed object, deleting the old object.
Definition: platform.h:745
void copy_to_host(T *dst, T const *src, size_t count=1)
Definition: device_memory.h:86
~allocation()
Destructor.
Definition: device_memory.h:159
const deleter & get_deleter() const
Returns the deleter object which would be used for destruction of the managed object (const) ...
Definition: device_memory.h:189
C++ exception wrapper for CUDA cudaError_t.
Definition: exceptions.h:36
allocation & operator=(allocation const &p)
Copies a device-side memory allocation.
Definition: device_memory.h:192
allocation()
Constructor: allocates no memory.
Definition: device_memory.h:148
void reset(T *_ptr, size_t _capacity)
Deletes managed object, if owned, and replaces its reference with a given pointer and capacity...
Definition: device_memory.h:177
void free(T *ptr)
Free the buffer pointed to by ptr.
Definition: device_memory.h:56
void insert_to_host(OutputIterator begin, OutputIterator end, T const *device_begin)
Copies elements from device memory to host-side range.
Definition: device_memory.h:102
Device allocation abstraction that tracks size and capacity.
Definition: device_memory.h:120
pointer release() noexcept
Releases ownership of the managed object, if any.
Definition: platform.h:738
allocation(allocation const &p)
Copy constructor.
Definition: device_memory.h:154
void copy_device_to_device(T *dst, T const *src, size_t count=1)
Definition: device_memory.h:91