Better AXPBY implementation in CUDA.

nond-rep
Salvatore Filippone 8 months ago
parent f4c7604f61
commit a41b209144
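In summary (the file labels below are inferred from the kernel names): the S/D/C/Z AXPBY kernels, which compute z = beta*y + alpha*x, replace the one-thread-per-element guard if (id < n) with a grid-stride loop, and the host launchers stop sizing the grid as (n+BLOCK_SIZE-1)/BLOCK_SIZE and instead launch one full wave of resident blocks, multiProcessorCount * (maxThreadsPerMultiProcessor / BLOCK_SIZE), read from the device properties. The previous versions are kept behind #if 1 / #else / #endif. A minimal, self-contained sketch of the resulting pattern (hypothetical names daxpby_gridstride and launch_daxpby, and an assumed BLOCK_SIZE; not the library's actual code):

#include <cuda_runtime.h>

#define BLOCK_SIZE 256  /* assumed value; spgpu defines its own */

__global__ void daxpby_gridstride(double *z, int n, double beta,
                                  const double *y, double alpha, const double *x)
{
  /* Grid-stride loop: thread id handles elements id, id+stride, id+2*stride, ...
     so a fixed-size grid covers any n. */
  int stride = blockDim.x * gridDim.x;
  for (int id = threadIdx.x + blockDim.x * blockIdx.x; id < n; id += stride)
    z[id] = beta * y[id] + alpha * x[id];
}

void launch_daxpby(double *z, int n, double beta, const double *y,
                   double alpha, const double *x, cudaStream_t stream)
{
  /* Size the grid from the device's residency limits rather than from n:
     one block slot per BLOCK_SIZE threads on each multiprocessor. */
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  int blocks_per_mp = prop.maxThreadsPerMultiProcessor / BLOCK_SIZE;
  int num_blocks    = blocks_per_mp * prop.multiProcessorCount;
  daxpby_gridstride<<<num_blocks, BLOCK_SIZE, 0, stream>>>(z, n, beta, y, alpha, x);
}

With a grid-stride loop the grid size affects only performance, not correctness: any positive block count covers every element for any n.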

caxpby.cu

@@ -32,8 +32,9 @@ extern "C"
 __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
 {
   int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-  if (id < n)
+  unsigned int gridSize = blockDim.x * gridDim.x;
+  for ( ; id < n; id +=gridSize)
+  //if (id,n)
   {
     // Since z, x and y are accessed with the same offset by the same thread,
     // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@@ -45,7 +46,30 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
   }
 }
+#if 1
+void spgpuCaxpby(spgpuHandle_t handle,
+  __device cuFloatComplex *z,
+  int n,
+  cuFloatComplex beta,
+  __device cuFloatComplex *y,
+  cuFloatComplex alpha,
+  __device cuFloatComplex* x)
+{
+  int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+  int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+  dim3 block(BLOCK_SIZE);
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, 0);
+  num_mp = deviceProp.multiProcessorCount;
+  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+  num_blocks = num_blocks_mp*num_mp;
+  dim3 grid(num_blocks);
+  spgpuCaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
 void spgpuCaxpby_(spgpuHandle_t handle,
   __device cuFloatComplex *z,
   int n,
@@ -55,9 +79,15 @@ void spgpuCaxpby_(spgpuHandle_t handle,
   __device cuFloatComplex* x)
 {
   int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+  int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
   dim3 block(BLOCK_SIZE);
-  dim3 grid(msize);
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, 0);
+  num_mp = deviceProp.multiProcessorCount;
+  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+  num_blocks = num_blocks_mp*num_mp;
+  dim3 grid(num_blocks);
   spgpuCaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
@@ -86,7 +116,7 @@ void spgpuCaxpby(spgpuHandle_t handle,
   cudaCheckError("CUDA error on saxpby");
 }
+#endif
 void spgpuCmaxpby(spgpuHandle_t handle,
   __device cuFloatComplex *z,
   int n,
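A design note on the launcher: cudaGetDeviceProperties is comparatively expensive to repeat on every AXPBY call. A hedged alternative sketch, not part of this commit (the helper axpby_num_blocks is hypothetical): cache the block count once, and let the runtime's occupancy API account for the kernel's actual register usage instead of the bare maxThreadsPerMultiProcessor/BLOCK_SIZE ratio:

#include <cuda_runtime.h>

static int axpby_num_blocks(const void *kernel_func, int block_size)
{
  /* Computed once and cached; illustration only (not thread-safe). */
  static int cached = 0;
  if (cached == 0) {
    int dev = 0, num_mp = 0, blocks_per_mp = 0;
    cudaGetDevice(&dev);
    cudaDeviceGetAttribute(&num_mp, cudaDevAttrMultiProcessorCount, dev);
    /* Resident blocks per SM for this specific kernel at this block size,
       with 0 bytes of dynamic shared memory. */
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_mp, kernel_func,
                                                  block_size, 0);
    cached = blocks_per_mp * num_mp;
  }
  return cached;
}

Used as, e.g., dim3 grid(axpby_num_blocks((const void *)spgpuCaxpby_krn, BLOCK_SIZE)); in place of the manual arithmetic. Note also that with the new launch configuration the msize variable computed in each launcher is no longer used.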

daxpby.cu

@@ -16,6 +16,7 @@
 #include "cudadebug.h"
 #include "cudalang.h"
+#include <cuda_runtime.h>
 
 extern "C"
 {
@@ -31,8 +32,9 @@ extern "C"
 __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
 {
   int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-  if (id < n)
+  unsigned int gridSize = blockDim.x * gridDim.x;
+  for ( ; id < n; id +=gridSize)
+  //if (id,n)
   {
     // Since z, x and y are accessed with the same offset by the same thread,
     // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@@ -44,8 +46,9 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
   }
 }
-void spgpuDaxpby_(spgpuHandle_t handle,
+#if 1
+void spgpuDaxpby(spgpuHandle_t handle,
   __device double *z,
   int n,
   double beta,
@@ -54,9 +57,37 @@ void spgpuDaxpby_(spgpuHandle_t handle,
   __device double* x)
 {
   int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+  int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+  dim3 block(BLOCK_SIZE);
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, 0);
+  num_mp = deviceProp.multiProcessorCount;
+  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+  num_blocks = num_blocks_mp*num_mp;
+  dim3 grid(num_blocks);
+  spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
+void spgpuDaxpby_(spgpuHandle_t handle,
+  __device double *z,
+  int n,
+  double beta,
+  __device double *y,
+  double alpha,
+  __device double* x)
+{
+  int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+  int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
   dim3 block(BLOCK_SIZE);
-  dim3 grid(msize);
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, 0);
+  num_mp = deviceProp.multiProcessorCount;
+  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+  num_blocks = num_blocks_mp*num_mp;
+  dim3 grid(num_blocks);
   spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
@@ -84,7 +115,7 @@ void spgpuDaxpby(spgpuHandle_t handle,
   cudaCheckError("CUDA error on daxpby");
 }
+#endif
 void spgpuDmaxpby(spgpuHandle_t handle,
   __device double *z,
   int n,

saxpby.cu

@@ -30,8 +30,9 @@ extern "C"
 __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
 {
   int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-  if (id < n)
+  unsigned int gridSize = blockDim.x * gridDim.x;
+  for ( ; id < n; id +=gridSize)
+  //if (id,n)
   {
     // Since z, x and y are accessed with the same offset by the same thread,
     // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@@ -44,6 +45,29 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
 }
+#if 1
+void spgpuSaxpby(spgpuHandle_t handle,
+  __device float *z,
+  int n,
+  float beta,
+  __device float *y,
+  float alpha,
+  __device float* x)
+{
+  int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+  int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+  dim3 block(BLOCK_SIZE);
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, 0);
+  num_mp = deviceProp.multiProcessorCount;
+  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+  num_blocks = num_blocks_mp*num_mp;
+  dim3 grid(num_blocks);
+  spgpuSaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
 void spgpuSaxpby_(spgpuHandle_t handle,
   __device float *z,
   int n,
@@ -83,7 +107,7 @@ void spgpuSaxpby(spgpuHandle_t handle,
   cudaCheckError("CUDA error on saxpby");
 }
+#endif
 void spgpuSmaxpby(spgpuHandle_t handle,
   __device float *z,
   int n,

zaxpby.cu

@@ -33,8 +33,9 @@ extern "C"
 __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
 {
   int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-  if (id < n)
+  unsigned int gridSize = blockDim.x * gridDim.x;
+  for ( ; id < n; id +=gridSize)
+  //if (id,n)
   {
     // Since z, x and y are accessed with the same offset by the same thread,
     // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@@ -46,7 +47,29 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
   }
 }
+#if 1
+void spgpuZaxpby(spgpuHandle_t handle,
+  __device cuDoubleComplex *z,
+  int n,
+  cuDoubleComplex beta,
+  __device cuDoubleComplex *y,
+  cuDoubleComplex alpha,
+  __device cuDoubleComplex* x)
+{
+  int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+  int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+  dim3 block(BLOCK_SIZE);
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, 0);
+  num_mp = deviceProp.multiProcessorCount;
+  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+  num_blocks = num_blocks_mp*num_mp;
+  dim3 grid(num_blocks);
+  spgpuZaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
 void spgpuZaxpby_(spgpuHandle_t handle,
   __device cuDoubleComplex *z,
   int n,
@@ -86,7 +109,7 @@ void spgpuZaxpby(spgpuHandle_t handle,
   cudaCheckError("CUDA error on daxpby");
 }
+#endif
 void spgpuZmaxpby(spgpuHandle_t handle,
   __device cuDoubleComplex *z,
   int n,