In this part 2 post I want to illustrate the difference in technique between the common 'device copy' method and the 'unified memory' method, which is better suited to unified memory architectures such as NVIDIA's Tegra K1/X1 processors used on the NVIDIA Jetson development kits. I'll show an example using just a CUDA kernel, as well as an example using OpenCV's gpu:: functions.
1. CUDA kernels: Device Copy method
For this example, I've written a simple CUDA kernel that takes a fixed-size matrix (640x480) of depth values (delivered by the Xbox 360 Kinect) and converts it to XYZ coordinates while simultaneously rotating the points. This example only computes the Y dimension, but a full XYZ version is sketched below; the math is fairly simple. The code may seem a bit intense, but try not to think about what's inside the CUDA kernel for now.
Kernel Code:
__global__ void cudaCalcXYZ_R2( float *dst, float *src, float *M, float heightCenter, float widthCenter, float scaleFactor, float minDistance)
{
    __shared__ float shM[3];
    float nx, ny, nz, nzpminD, jFactor;
    int index;

    //One thread per block loads the Y row of the rotation matrix into shared memory
    if(threadIdx.x == 0)
    {
        shM[0] = M[4];
        shM[1] = M[5];
        shM[2] = M[6];
    }

    index = blockIdx.x*blockDim.x + threadIdx.x;
    nz = src[index];
    jFactor = ((float)blockIdx.x - heightCenter)*scaleFactor;
    nzpminD = nz + minDistance;
    nx = ((float)threadIdx.x - widthCenter)*(nzpminD)*scaleFactor;
    ny = (jFactor)*(nzpminD);

    //Solve for only the Y matrix (height values)
    __syncthreads();
    dst[index] = nx*shM[0] + ny*shM[1] + nz*shM[2];
}
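The full XYZ version mentioned above just uses the other two rows of M. Here is a minimal sketch of what it could look like, assuming three separate output planes dstX, dstY and dstZ (hypothetical names, not from the original code) instead of the single dst buffer:

__global__ void cudaCalcXYZ_full( float *dstX, float *dstY, float *dstZ, float *src, float *M, float heightCenter, float widthCenter, float scaleFactor, float minDistance)
{
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    //Back-project the depth value, exactly as in the Y-only kernel
    float nz = src[index];
    float nzpminD = nz + minDistance;
    float nx = ((float)threadIdx.x - widthCenter)*nzpminD*scaleFactor;
    float ny = ((float)blockIdx.x - heightCenter)*nzpminD*scaleFactor;

    //Rotate with the first three rows of the 4x4 row-major matrix M
    dstX[index] = nx*M[0] + ny*M[1] + nz*M[2];
    dstY[index] = nx*M[4] + ny*M[5] + nz*M[6];
    dstZ[index] = nx*M[8] + ny*M[9] + nz*M[10];
}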
Basically, a float pointer is passed in as src (the depth data); the kernel manipulates it to produce the 'Y' values, which are stored in another float* dst. In a device copy implementation of the CUDA kernel, the data pointed to by src must first be copied to device memory with cudaMemcpy(). Below is an example of how to do this (an 'h_' prefix means host (CPU) memory, while 'd_' means device (GPU) memory):
{
    int rows = 480;
    int cols = 640;
    float *h_src, *h_dst;   //Host matrices
    float *d_src, *d_dst;   //Device matrices
    float *h_m, *d_m;       //4x4 rotation matrix (host/device)

    //Allocate device copies using cudaMalloc
    cudaMalloc( (void **)&d_src, sizeof(float)*rows*cols);
    cudaMalloc( (void **)&d_dst, sizeof(float)*rows*cols);
    cudaMalloc( (void **)&d_m, sizeof(float)*16);

    //Allocate host pointers
    h_src = (float*)malloc(sizeof(float)*rows*cols);
    h_dst = (float*)malloc(sizeof(float)*rows*cols);
    h_m = (float*)malloc(sizeof(float)*4*4);

    //Copy the input matrices from host to device
    cudaMemcpy( d_src, h_src, sizeof(float)*rows*cols, cudaMemcpyHostToDevice);
    cudaMemcpy( d_m, h_m, sizeof(float)*16, cudaMemcpyHostToDevice);

    //Run the kernel
    cudaCalcXYZ_R2<<< rows, cols >>>(d_dst, d_src, d_m, 240, 320, 0.0021, -10);

    //Wait for the GPU to finish
    cudaDeviceSynchronize();

    //Copy the result back to host memory
    cudaMemcpy( h_dst, d_dst, sizeof(float)*rows*cols, cudaMemcpyDeviceToHost);
}
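One thing none of these listings do is check return codes. On an embedded board it is easy to run out of memory or mis-configure a kernel launch and never notice. A small wrapper macro like the sketch below (the CUDA_CHECK name is mine, not part of the original code) makes such failures visible:

#include <cstdio>
#include <cuda_runtime.h>

//Wrap every CUDA runtime call so failures are reported immediately
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err = (call);                                         \
        if (err != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
                    cudaGetErrorString(err), __FILE__, __LINE__);         \
        }                                                                 \
    } while (0)

//Example usage:
//CUDA_CHECK(cudaMemcpy(d_src, h_src, sizeof(float)*rows*cols, cudaMemcpyHostToDevice));
//cudaCalcXYZ_R2<<< rows, cols >>>(d_dst, d_src, d_m, 240, 320, 0.0021, -10);
//CUDA_CHECK(cudaGetLastError());   //catches bad launch configurations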
2. CUDA kernels: Unified Memory method
Here we are going to use the same kernel as in the above example, but this time we avoid the memory copies altogether by using CUDA unified memory. Instead of cudaMalloc(), we allocate with cudaMallocManaged():
{
    cudaSetDeviceFlags(cudaDeviceMapHost); //Support for mapped pinned allocations

    int rows = 480;
    int cols = 640;
    float *h_src, *h_dst;   //Src and Dst matrices
    float *h_m;             //4x4 rotation matrix

    //Allocate with cudaMallocManaged(). No need to allocate host and device copies separately
    cudaMallocManaged(&h_src, sizeof(float)*rows*cols);
    cudaMallocManaged(&h_m, sizeof(float)*4*4);
    cudaMallocManaged(&h_dst, sizeof(float)*rows*cols);

    //Run the kernel directly on the managed pointers
    cudaCalcXYZ_R2<<< rows, cols >>>(h_dst, h_src, h_m, 240, 320, 0.0021, -10);

    //Wait for the GPU to finish
    cudaDeviceSynchronize();

    //Done, h_dst now contains the results
}
So now we have completely eliminated copying over 1.2MB (640x480x4 bytes) to the device before running the kernel, as well as copying another 1.2MB back after the kernel has finished. Imagine trying to achieve real-time performance on a robot reading a Kinect sensor at 30FPS, needlessly copying roughly 74MB a second into the same physical RAM!
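If you want to see what those copies cost on your own board, CUDA events give a quick measurement. Below is a minimal sketch, assuming the pointers and sizes from the device copy listing above:

cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);

//Time the host-to-device copy of one 640x480 float matrix
cudaEventRecord(start);
cudaMemcpy( d_src, h_src, sizeof(float)*rows*cols, cudaMemcpyHostToDevice);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);
printf("Host-to-device copy: %.3f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);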
[Note]: This technique only pays off on a unified memory architecture such as NVIDIA's Tegra K1/X1 processors (as used on the Jetson boards), where the CPU and GPU share the same physical RAM. On the discrete GPU in your laptop or desktop the managed allocations still work, but the driver migrates the data across the PCIe bus behind the scenes, so you won't see the same gain.
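As an aside, the cudaSetDeviceFlags(cudaDeviceMapHost) call in the listing is really the flag that enables mapped ('zero-copy') pinned allocations. If you prefer that route over cudaMallocManaged() on Tegra, a roughly equivalent allocation looks like this sketch (not taken from the original listings):

float *h_src = NULL, *d_src = NULL;

//Allocate pinned host memory that is mapped into the GPU's address space
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaHostAlloc( (void **)&h_src, sizeof(float)*rows*cols, cudaHostAllocMapped);

//Fetch the device-side alias of the same allocation
cudaHostGetDevicePointer( (void **)&d_src, h_src, 0);

//The kernel can now read d_src while the CPU fills h_src -- no cudaMemcpy needed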
3. OpenCV GPU functions: Device Copy method
There is a module in OpenCV called gpu, written in CUDA, for those who want GPU acceleration of various OpenCV functions. There is plenty of documentation online covering OpenCV's CUDA module in depth, so I will only go over the very basics. The example we will use is the per-element multiplication of two matrices a and b, where the result is stored in c. Using the 'device copy' method, here is how to do so with OpenCV's gpu::multiply():
{
    //variables/pointers
    int rows = 480;
    int cols = 640;
    float *h_a, *h_b, *h_c;
    float *d_a, *d_b, *d_c;

    //Allocate memory for host pointers
    h_a = (float*)malloc(sizeof(float)*rows*cols);
    h_b = (float*)malloc(sizeof(float)*rows*cols);
    h_c = (float*)malloc(sizeof(float)*rows*cols);

    //Allocate memory for device pointers
    cudaMalloc( (void **)&d_a, sizeof(float)*rows*cols);
    cudaMalloc( (void **)&d_b, sizeof(float)*rows*cols);
    cudaMalloc( (void **)&d_c, sizeof(float)*rows*cols);

    //Mats (declared on top of the host pointers)
    Mat hmat_a(cvSize(cols, rows), CV_32F, h_a);
    Mat hmat_b(cvSize(cols, rows), CV_32F, h_b);
    Mat hmat_c(cvSize(cols, rows), CV_32F, h_c);

    //GpuMats (declared on top of the device pointers)
    gpu::GpuMat dmat_a(cvSize(cols, rows), CV_32F, d_a);
    gpu::GpuMat dmat_b(cvSize(cols, rows), CV_32F, d_b);
    gpu::GpuMat dmat_c(cvSize(cols, rows), CV_32F, d_c);

    //Let's assume our host matrices are filled with actual data, then copy them to the device matrices
    dmat_a.upload(hmat_a);
    dmat_b.upload(hmat_b);

    //Run gpu::multiply()
    gpu::multiply(dmat_a, dmat_b, dmat_c);

    //Copy the result back to the host
    dmat_c.download(hmat_c);

    //Result is now in hmat_c, but it required copying matrices a, b and c...
}
You'll notice that in the above example I've been allocating memory for my images myself and handing the pointers to OpenCV, rather than letting OpenCV allocate memory when a Mat or GpuMat is declared. This is required for the next section, where GpuMats are used without having to upload and download data to and from GPU memory on chips such as the Jetson's. There is another, less obvious reason I use this method: for real-time performance on embedded processors, it is more efficient to allocate memory for objects once, early on, before any operations that run cyclically. As long as you can spare the memory, this becomes an effective way to increase performance (granted, the trade-off is that some RAM stays permanently allocated). If you find yourself needing to free these allocations dynamically, look into cudaFree() and cudaFreeHost().
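For reference, releasing those pre-allocated buffers at shutdown would look something like the sketch below, using the pointer names from the device copy example above (cudaFreeHost() applies only if the host buffers were allocated with cudaMallocHost()/cudaHostAlloc() rather than malloc()):

//Device buffers allocated with cudaMalloc() (or cudaMallocManaged())
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

//Host buffers: free() for malloc(), cudaFreeHost() for cudaMallocHost()/cudaHostAlloc()
free(h_a);
free(h_b);
free(h_c);

//Mats and GpuMats built on top of external pointers do not own the data,
//so they will not try to free it again when they go out of scope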
4. OpenCV GPU functions: Unified Memory method
Now for eliminating the download() and upload() OpenCV calls altogether.
{
    cudaSetDeviceFlags(cudaDeviceMapHost); //Support for mapped pinned allocations

    //variables/pointers
    int rows = 480;
    int cols = 640;
    float *h_a, *h_b, *h_c;

    //Allocate unified (managed) memory, shared between host and device
    cudaMallocManaged(&h_a, sizeof(float)*rows*cols);
    cudaMallocManaged(&h_b, sizeof(float)*rows*cols);
    cudaMallocManaged(&h_c, sizeof(float)*rows*cols);

    //Mats (declared on top of the managed pointers)
    Mat hmat_a(cvSize(cols, rows), CV_32F, h_a);
    Mat hmat_b(cvSize(cols, rows), CV_32F, h_b);
    Mat hmat_c(cvSize(cols, rows), CV_32F, h_c);

    //GpuMats (declared on top of the very same pointers!)
    gpu::GpuMat dmat_a(cvSize(cols, rows), CV_32F, h_a);
    gpu::GpuMat dmat_b(cvSize(cols, rows), CV_32F, h_b);
    gpu::GpuMat dmat_c(cvSize(cols, rows), CV_32F, h_c);

    //Run gpu::multiply()
    gpu::multiply(dmat_a, dmat_b, dmat_c);

    //Make sure the GPU has finished before the CPU touches the data
    cudaDeviceSynchronize();

    //Result is now in hmat_c, no copying required!
}
Much like the CUDA unified memory example, this method only works on hardware with a unified memory architecture (the Jetson boards, for example). On those boards you no longer need to bother with OpenCV's download() and upload() methods in your algorithms.
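If you want the code to verify at runtime that it really is running on such a device before taking the zero-copy path, the CUDA device properties are enough. A minimal sketch (the property fields are standard CUDA; the decision logic is just one reasonable choice):

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);

//On Tegra-class chips the GPU is integrated and shares physical RAM with the CPU
bool sharedMemoryArch = prop.integrated && prop.canMapHostMemory;

if (sharedMemoryArch)
{
    //Safe to build Mats and GpuMats on the same managed/mapped pointers
}
else
{
    //Fall back to the device copy method (upload()/download())
}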
Enjoy the speedups!