Use of unique_ptr and cudaMalloc
I've been thinking about playing around with using std::unique_ptr with device pointers in CUDA. What I was wondering is if the current c++11 unique_ptr can be used in conjunction with cudaMalloc. I know it can be used with normal malloc (Is it possible to use a C++ smart pointers together with C's malloc?), but cudaMalloc doesn't return the pointer in the function's return statement. Instead, it returns an error code, and the allocated pointer is handed back through a pointer-to-pointer output parameter.
This blog post recommends the following technique:
auto deleter=[&](float* ptr){ cudaFree(ptr); };
std::unique_ptr<float[], decltype(deleter)> d_in(new float[size], deleter);
cudaMalloc((void **) &d_in, size * sizeof(float));
Question: However, I'm concerned that this creates host memory (the `new float[size]` passed to d_in's constructor) that never gets deleted — unless `new float[size]` doesn't actually allocate host memory, or is somehow overridden. If the above doesn't in fact work, could defining my own cudaMalloc wrapper — one that returns the pointer so it can be passed directly to the unique_ptr constructor — work?
Something like:
// Wrapper around cudaMalloc that returns the device pointer directly, so it
// can be passed straight to a unique_ptr constructor.
// Fixes vs. the original sketch: the allocation size now uses the `mySize`
// parameter (the original referenced an undeclared `size`), and the macro's
// closing parenthesis is placed correctly (was `);)`).
void* myCudaMalloc(size_t mySize){
    void* p = nullptr;  // nullptr so a failed allocation is detectable by the caller
    checkCUDAerrorMacro(cudaMalloc((void**) &p, mySize));
    return p;
}
...
auto deleter=[](float* ptr){ cudaFree(ptr); };
std::unique_ptr<float[], decltype(deleter)> d_in(myCudaMalloc(size_t mySize), deleter);
After some work I figured out how to test 3 versions of it - tl;dr the blog post's version (v1) does indeed leak, but can be tweaked so that it doesn't (v2) and improved (v3):
common code:
template <typename Deleter>
using unique_p = std::unique_ptr<float[], Deleter>;
constexpr int length = 20;
v1: (what is recommended in the blog post)
void version1(){
auto deleter = [](float* ptr) { cudaFree(ptr); std::cout<<"\nDeleted1\n"; };
unique_p<decltype(deleter)> d_in(new float[length],deleter);
cudaMalloc((void **) &d_in, length * sizeof(float));
...
}
v2: (similar to above, but initializes d_in with nullptr)
void version2(){
auto deleter = [](float* ptr) { cudaFree(ptr); std::cout<<"\nDeleted2\n"; };
unique_p<decltype(deleter)> d_in(nullptr,deleter);
cudaMalloc((void **) &d_in, length * sizeof(float));
...
}
v3: (d_in "adopts" pointer initialized with cudaMalloc)
void version3(){
auto myCudaMalloc = [](size_t mySize) { void* ptr; cudaMalloc((void**)&ptr, mySize); return ptr; };
auto deleter = [](float* ptr) { cudaFree(ptr); std::cout<<"\nDeleted3\n"; };
unique_p<decltype(deleter)> d_in((float*)myCudaMalloc(length*sizeof(float)),deleter);
...
}
All 3 create proper device pointers. However, version 1 definitely leaks host memory (tested using valgrind with the cuda warnings suppressed: Valgrind and CUDA: Are reported leaks real?). Neither v2 nor v3 leak host memory. cuda-memcheck also confirmed that there were no device-side memory leaks for any of the versions.
Between version 2 and 3, I prefer version 3 as it makes it more clear that unique_ptr owns the pointer and it follows the idiom of new
and malloc
in the unique_ptr constructor. You also only have to define the constructing function/lambda once and then can use it over and over again, so it is fewer lines of code.
========================
Full test code (compiled with nvcc -std=c++14):
#include <cuda_runtime.h>
#include <memory>
#include <iostream>
template <typename Deleter>
using unique_p = std::unique_ptr<float[], Deleter>;
// Debug kernel: prints the first `num` elements of d_in, tab-separated,
// followed by a newline. Meant for a <<<1,1>>> launch so every thread does
// not repeat the full printout and the output stays ordered.
__global__ void printArray(float * d_in, int num){
    int idx = 0;
    while(idx < num){
        printf("%f\t", d_in[idx]);
        ++idx;
    }
    printf("\n");
}
// Stateless functor alternative to the per-version lambda deleters: frees a
// device allocation via cudaFree and logs that it ran. (Referenced only in
// the commented-out line of version3.)
struct myDeleter{
    void operator()(float* ptr){
        cudaFree(ptr);
        std::cout << "\nDeleted\n";
    }
};
constexpr int length = 20;
// v1 -- the blog post's recommendation. Kept as-is on purpose: it demonstrates
// the host-memory leak discussed above. The array from `new float[length]` is
// installed in d_in, but the cudaMalloc call below overwrites the pointer
// stored inside d_in in place, so that host array is never freed (and the
// deleter would call cudaFree on it rather than delete[] anyway).
void version1(){
auto deleter = [](float* ptr) { cudaFree(ptr); std::cout<<"\nDeleted1\n"; };
unique_p<decltype(deleter)> d_in(new float[length],deleter);
// NOTE(review): `(void **) &d_in` writes through the address of the
// unique_ptr object itself; this happens to work when a stateless deleter
// makes the unique_ptr pointer-sized, but that layout is not guaranteed by
// the standard.
cudaMalloc((void **) &d_in, length * sizeof(float));
// Stage 0..length-1 on the host and copy it into the device buffer.
std::unique_ptr<float[]> h_out(new float[length]);
for(int i = 0; i < length; i++){ h_out[i] = i; }
cudaMemcpy(d_in.get(), h_out.get(),length*sizeof(float),cudaMemcpyHostToDevice);
printArray<<<1,1>>>(d_in.get(),length);
// d_in's destructor cudaFree()s the device buffer on scope exit.
}
// v2 -- start the unique_ptr empty (nullptr) and let cudaMalloc install the
// managed pointer afterwards. No host allocation is made, so nothing leaks;
// the deleter cudaFree()s the device buffer on scope exit.
void version2(){
    auto freeOnDevice = [](float* p) { cudaFree(p); std::cout<<"\nDeleted2\n"; };
    unique_p<decltype(freeOnDevice)> deviceBuf(nullptr, freeOnDevice);
    // Same &unique_ptr aliasing trick as v1, but with no prior allocation to lose.
    cudaMalloc((void **) &deviceBuf, length * sizeof(float));
    // Stage 0..length-1 on the host, then push it to the device and print it.
    std::unique_ptr<float[]> staging(new float[length]);
    for(int idx = 0; idx != length; ++idx){
        staging[idx] = idx;
    }
    cudaMemcpy(deviceBuf.get(), staging.get(), length * sizeof(float), cudaMemcpyHostToDevice);
    printArray<<<1,1>>>(deviceBuf.get(), length);
}
// v3 -- the unique_ptr adopts a pointer that was already allocated with
// cudaMalloc, mirroring the familiar `unique_ptr<T>(new T...)` idiom.
// Fix vs. the original: the allocator lambda initialized nothing and ignored
// cudaMalloc's return code, so on failure it returned an indeterminate
// pointer (which the deleter would later cudaFree). It now returns nullptr
// on failure and reports the error.
void version3(){
    // Returns a device allocation of mySize bytes, or nullptr on failure.
    auto myCudaMalloc = [](size_t mySize) -> void* {
        void* ptr = nullptr;
        cudaError_t err = cudaMalloc(&ptr, mySize);
        if (err != cudaSuccess) {
            std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << "\n";
            return nullptr;
        }
        return ptr;
    };
    auto deleter = [](float* ptr) { cudaFree(ptr); std::cout<<"\nDeleted3\n"; };
    // unique_ptr owns the device pointer from the moment it is constructed.
    unique_p<decltype(deleter)> d_in((float*)myCudaMalloc(length*sizeof(float)),deleter);
    //unique_p<myDeleter> d_in((float*)myCudaMalloc(20*sizeof(float)));
    // Stage 0..length-1 on the host, copy to the device, and print from a kernel.
    std::unique_ptr<float[]> h_out(new float[length]);
    for(int i = 0; i < length; i++){ h_out[i] = i; }
    cudaMemcpy(d_in.get(), h_out.get(),length*sizeof(float),cudaMemcpyHostToDevice);
    printArray<<<1,1>>>(d_in.get(),length);
}
// Runs the three variants back to back; each one launches printArray and
// relies on its unique_ptr deleter to cudaFree the device buffer on scope exit.
int main(){
version1();
version2();
version3();
// Flushes device-side printf output and tears down the CUDA context; also
// serves as the final synchronization point before process exit.
cudaDeviceReset();
return 0;
}
This pattern has worked for me pretty well:
int main(){
float* deviceArray_raw;
gpuErrchk(cudaMalloc((void**)&deviceArray_raw, 100 * sizeof(float)));
auto deleter = [](float* ptr) { gpuErrchk(cudaFree(ptr)); };
std::unique_ptr<float[], decltype(deleter)> deviceArray(deviceArray_raw, deleter);
...
...
return 0;
}
Apart from host memory leaks, one also needs to be careful about device memory leaks.
Wrapping CUDA API calls in gpuErrchk
helps with this; I was able to catch some otherwise-silent weird behavior using it.