Makefile

INCLUDE = -I/usr/local/cuda-9.2/include
LIB = -lcudadevrt # -lcudart -lcublas
CUDA_FLAG = -rdc=true -gencode=arch=compute_35,code=\"sm_35,compute_35\"
#all:

TestGpuNPV.a: link.o cashflow_3.o qss_date4c_cu.o
	ar crv $@ $^

TestGpu: TestGPU.cu
	nvcc -o $@ $(INCLUDE) $^

cashflow_3: qss_date4c_cu.o cashflow_3.o
	nvcc -o $@ $(INCLUDE) $(CUDA_FLAG) $(LIB) $^

Testcashflow_3: qss_date4c_cu.o cashflow_3.o Testcase_CashFlow_cuda.o excel.o
	nvcc -o $@ $(INCLUDE) $(CUDA_FLAG) $(LIB) $^

link.o: cashflow_3.o qss_date4c_cu.o
	nvcc $(INCLUDE) $(CUDA_FLAG) $(LIB) -dlink $^ -o $@

Testcase_CashFlow_cuda.o: Testcase_CashFlow_cuda.cu
	nvcc -c -o $@ $(INCLUDE) $(CUDA_FLAG) $^

excel.o: excel.cc
	g++ -c -std=c++11 $^

cashflow_3.o: cashflow_3.cu
	nvcc -c -o $@ $(INCLUDE) $(CUDA_FLAG) $^

qss_date4c_cu.o: qss_date4c_cu.cu
	nvcc -c -o $@ $(INCLUDE) $(CUDA_FLAG) $^

clean:
	rm *.o

# lspci | grep -i nvidia
# lspci -v -s 09:00.0
# nvidia-smi
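This Makefile uses CUDA separate compilation: -rdc=true emits relocatable device code so __device__ code can be split across .cu files, the link.o rule runs the device-link step (nvcc -dlink) that resolves those cross-file device references, and ar then bundles the device-link object with the regular objects into the static library TestGpuNPV.a; -lcudadevrt supplies the device runtime that relocatable device code needs. A minimal sketch of the situation that forces this setup (file and function names here are hypothetical, not from the project):

// util.cu -- a __device__ function compiled in its own translation unit.
__device__ int add_one(int x) { return x + 1; }

// bump.cu -- a kernel calling the __device__ function defined in util.cu.
// Without -rdc=true this cross-file device call fails at link time; with it,
// nvcc -dlink produces the extra object that ties the two files together.
extern __device__ int add_one(int x);

__global__ void bump(int *out)
{
    *out = add_one(*out);
}

Built the same way as the rules above: nvcc -rdc=true -c util.cu bump.cu, then nvcc -dlink util.o bump.o -o link.o -lcudadevrt, and finally link or archive all three objects.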
GPUDEMO
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cstdio>

// Kernel from the standard CUDA project template: one thread per element.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
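A minimal host-side driver for the helper, in the style of the standard CUDA template that addWithCuda comes from (the array values are just illustrative):

int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting so that profiling and
    // tracing tools show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}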
posted on 2019-06-14 16:20