Saturday, November 12, 2016

MSVC Cuda

(1) CUDA is NVIDIA's heterogeneous parallel computing API for its GPUs; it integrates into Visual Studio 2015.
(2) You must turn on the .NET 3.5 feature in Windows 10 before the CUDA projects will compile.
(3) Search "msvc cuda" to download the NVIDIA CUDA Toolkit and its Visual Studio integration.
(4) Control Panel > Device Manager > Display adapters shows an NVIDIA NVS 5200M for me, and it is CUDA-capable.
(5) You must set CUDA C/C++ > Device > Code Generation to compute_20,sm_20 for the NVS 5200M; my laptop cannot do CC 3.5.
(6) https://en.wikipedia.org/wiki/CUDA#Supported_GPUs lists the CC (compute capability) of each supported GPU.
(7) You can search for the CUDA 8.0.props file and change its CodeGeneration tag to compute_35,sm_35 to apply that setting to all projects.
e.g.C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V140\BuildCustomizations
(8) <<<numBlocks, threadsPerBlock>>> is the host-side kernel-launch syntax: the first value is the grid size (number of blocks) and the second is the block size (threads per block); see the launch-configuration sketch after this list.
(9) A __global__ kernel function runs on the device. __ldg and __shfl_down need CC 3.5, so my laptop is too old for them; a shuffle-based version is sketched after the full listing below.
(10) Grid + block + warp: the GPU segments its hardware threads into up-to-3-dimensional grids of blocks of threads, and each block executes in warps of 32 threads.
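To make (8) and (10) concrete, here is a minimal launch-configuration sketch. It assumes the full_reduce_kernel listed further down (each thread sums 4 consecutive elements); the block size of 256 and the helper name launch_full_reduce are made-up example values, not anything required by CUDA.

#include "cuda_runtime.h"

__global__ void full_reduce_kernel(int *out, const int *in, size_t n); // defined in the listing below

void launch_full_reduce(int *dev_out, const int *dev_in, size_t n)
{
 const size_t elemsPerThread = 4;            // matches the kernel's 4-element loop
 const unsigned threadsPerBlock = 256;       // block size: a multiple of the 32-thread warp
 const size_t elemsPerBlock = threadsPerBlock * elemsPerThread;
 const unsigned numBlocks = (unsigned)((n + elemsPerBlock - 1) / elemsPerBlock); // grid size, rounded up
 // <<<grid size, block size>>> : how many blocks, and how many threads per block
 full_reduce_kernel<<<numBlocks, threadsPerBlock>>>(dev_out, dev_in, n);
}

Because the kernel ends with atomicAdd(out, shared_sum), launching several blocks this way still folds everything into one total, as long as *dev_out starts at zero.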

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

cudaError_t full_Reduce(int *out, const int *in, size_t n);

__global__ void full_reduce_kernel(int *out, const int *in,size_t n)
{
 int sum = 0; // per-thread partial sum; in hardware a warp holds 32 of these at once, like a width-32 vector, all initialized to 0

 // each thread derives its starting element from its block and thread index;
 // only the x dimension is used here (y = z = 1), and the * 4 means each
 // thread processes 4 consecutive elements
 size_t start = (threadIdx.x + blockIdx.x*blockDim.x) * 4;

 for (size_t i = start; i < start+4 && i < n; i++)
 {
  sum += in[i]; // again, across a warp this is 32 loads from 32 addresses at once
  //sum += __ldg(in + i); // __ldg needs CC 3.5; the NVS 5200M is only CC 2.1, so it cannot be used here
 } 
 // warp shuffle: the 32 lanes repeatedly fold the upper half of their registers
 // onto the lower half; after the last step only lane 0 holds a valid total
 // __shfl_down needs CC 3.5, not 2.0, so it stays commented out on this laptop
 /*sum += __shfl_down(sum, 16);
 sum += __shfl_down(sum, 8);
 sum += __shfl_down(sum, 4);
 sum += __shfl_down(sum, 2);
 sum += __shfl_down(sum, 1);*/
 __shared__ int shared_sum; // one copy per block, not one per thread
 shared_sum = 0; // every thread writes the same 0, so this is harmless
 __syncthreads(); // synchronize all threads in this block (not just a warp)
 // With the shuffles commented out, every thread still holds its own partial
 // sum, so every thread must contribute it; with __shfl_down enabled, only
 // lane 0 of each warp (threadIdx.x % 32 == 0) would need to do this.
 atomicAdd(&shared_sum, sum);
 __syncthreads();
 if (threadIdx.x == 0) // one thread per block publishes the block total
  atomicAdd(out, shared_sum);
}

int main()
{
 const int n = 5;
 const int in[n] = { 1,2,3,4,5 };
 int out[n] = { 0 };
 cudaError_t cudaStatus1 = full_Reduce(out, in, n);
 if (cudaStatus1 != cudaSuccess) {
  fprintf(stderr, "full reduce failed!");
  return 1;
 }

 printf("%d %d %d %d %d\n",out[0],out[1],out[2],out[3],out[4]);
 cudaStatus1 = cudaDeviceReset();
 return 0;
}

cudaError_t full_Reduce(int *out, const int *in, size_t n)
{
 int *dev_in = 0;
 int *dev_out = 0;
 size_t sz = n * sizeof(int); // bytes to allocate and copy
 cudaError_t cudaStatus;

 cudaStatus = cudaSetDevice(0);

 cudaStatus = cudaMalloc((void**)&dev_out, sz);
 cudaStatus = cudaMalloc((void**)&dev_in, sz);

 // the kernel accumulates into *dev_out with atomicAdd, so it must start at zero
 cudaStatus = cudaMemset(dev_out, 0, sz);
 cudaStatus = cudaMemcpy(dev_in, in, sz, cudaMemcpyHostToDevice);
 full_reduce_kernel<<<1,n>>>(dev_out, dev_in, n);  // 1 block of n threads; pass the device pointers, not the host arrays
 cudaStatus = cudaGetLastError();
 if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "full redu kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
  goto FreeDev;
 }

 cudaStatus = cudaDeviceSynchronize();
 cudaStatus = cudaMemcpy(out, dev_out, sz, cudaMemcpyDeviceToHost);

FreeDev:
 cudaFree(dev_out);
 cudaFree(dev_in);

 return cudaStatus;
}
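For completeness, here is roughly what the commented-out shuffle path would look like on a CC 3.5+ card (my NVS 5200M at CC 2.1 cannot run it). This is only a sketch against CUDA 8's __shfl_down and __ldg, it assumes the block size is a multiple of 32, and the kernel name full_reduce_kernel_sm35 is made up for illustration; build it with compute_35,sm_35 as in (7).

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void full_reduce_kernel_sm35(int *out, const int *in, size_t n)
{
 int sum = 0;
 size_t start = (threadIdx.x + blockIdx.x*blockDim.x) * 4;
 for (size_t i = start; i < start + 4 && i < n; i++)
  sum += __ldg(in + i); // load through the read-only data cache (CC 3.5+)

 // warp shuffle: each step folds the upper half of the 32 lanes onto the
 // lower half, so after 5 steps lane 0 holds the whole warp's sum
 sum += __shfl_down(sum, 16);
 sum += __shfl_down(sum, 8);
 sum += __shfl_down(sum, 4);
 sum += __shfl_down(sum, 2);
 sum += __shfl_down(sum, 1);

 __shared__ int shared_sum;
 shared_sum = 0;
 __syncthreads();
 if (threadIdx.x % 32 == 0)    // only lane 0 of each warp now holds a valid warp sum
  atomicAdd(&shared_sum, sum);
 __syncthreads();
 if (threadIdx.x == 0)         // one thread per block publishes the block total
  atomicAdd(out, shared_sum);
}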
