Better AXPBY implementation in CUDA.

8 months ago · a41b209144
parent f4c7604f61
commit a41b209144
4 changed files with 124 additions and 16 deletions
--- a/cuda/spgpu/kernels/caxpby.cu
+++ b/cuda/spgpu/kernels/caxpby.cu
@ -32,8 +32,9 @@ extern "C"
 __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-	
-	if (id < n)
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	for ( ; id < n; id +=gridSize)
+		//if (id,n) 
 	{
 		// Since z, x and y are accessed with the same offset by the same thread,
 		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@ -45,7 +46,30 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, c
 	}
 }

+#if 1
+void spgpuCaxpby(spgpuHandle_t handle,
+	__device cuFloatComplex *z,
+	int n,
+	cuFloatComplex beta,
+	__device cuFloatComplex *y,
+	cuFloatComplex alpha,
+	__device cuFloatComplex* x)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp         = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp  = max_threads_mp/BLOCK_SIZE;
+	num_blocks     = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+
+	spgpuCaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}

+#else
 void spgpuCaxpby_(spgpuHandle_t handle,
 	__device cuFloatComplex *z,
 	int n,
@ -55,9 +79,15 @@ void spgpuCaxpby_(spgpuHandle_t handle,
 	__device cuFloatComplex* x)
 {
 	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
-
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
 	dim3 block(BLOCK_SIZE);
-	dim3 grid(msize);
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp         = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp  = max_threads_mp/BLOCK_SIZE;
+	num_blocks     = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);

 	spgpuCaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
@ -86,7 +116,7 @@ void spgpuCaxpby(spgpuHandle_t handle,

 	cudaCheckError("CUDA error on saxpby");
 }
-
+#endif
 void spgpuCmaxpby(spgpuHandle_t handle,
 		  __device cuFloatComplex *z,
 		  int n,
--- a/cuda/spgpu/kernels/daxpby.cu
+++ b/cuda/spgpu/kernels/daxpby.cu
@ -16,6 +16,7 @@

 #include "cudadebug.h"
 #include "cudalang.h"
+#include <cuda_runtime.h>

 extern "C"
 {
@ -31,8 +32,9 @@ extern "C"
 __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-	
-	if (id < n)
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	for ( ; id < n; id +=gridSize)
+		//if (id,n) 
 	{
 		// Since z, x and y are accessed with the same offset by the same thread,
 		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@ -44,8 +46,9 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double
 	}
 }

+#if 1

-void spgpuDaxpby_(spgpuHandle_t handle,
+void spgpuDaxpby(spgpuHandle_t handle,
 	__device double *z,
 	int n,
 	double beta,
@ -54,9 +57,37 @@ void spgpuDaxpby_(spgpuHandle_t handle,
 	__device double* x)
 {
 	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp         = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp  = max_threads_mp/BLOCK_SIZE;
+	num_blocks     = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);

+	spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
+void spgpuDaxpby_(spgpuHandle_t handle,
+	__device double *z,
+	int n,
+	double beta,
+	__device double *y,
+	double alpha,
+	__device double* x)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
 	dim3 block(BLOCK_SIZE);
-	dim3 grid(msize);
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp         = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp  = max_threads_mp/BLOCK_SIZE;
+	num_blocks     = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);

 	spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
@ -84,7 +115,7 @@ void spgpuDaxpby(spgpuHandle_t handle,

 	cudaCheckError("CUDA error on daxpby");
 }
-
+#endif
 void spgpuDmaxpby(spgpuHandle_t handle,
 		  __device double *z,
 		  int n,
--- a/cuda/spgpu/kernels/saxpby.cu
+++ b/cuda/spgpu/kernels/saxpby.cu
@ -30,8 +30,9 @@ extern "C"
 __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-	
-	if (id < n)
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	for ( ; id < n; id +=gridSize)
+		//if (id,n) 
 	{
 		// Since z, x and y are accessed with the same offset by the same thread,
 		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@ -44,6 +45,29 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alp
 }


+#if 1
+void spgpuSaxpby(spgpuHandle_t handle,
+	__device float *z,
+	int n,
+	float beta,
+	__device float *y,
+	float alpha,
+	__device float* x)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp         = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp  = max_threads_mp/BLOCK_SIZE;
+	num_blocks     = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+
+	spgpuSaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
 void spgpuSaxpby_(spgpuHandle_t handle,
 	__device float *z,
 	int n,
@ -83,7 +107,7 @@ void spgpuSaxpby(spgpuHandle_t handle,

 	cudaCheckError("CUDA error on saxpby");
 }
-
+#endif
 void spgpuSmaxpby(spgpuHandle_t handle,
 		  __device float *z,
 		  int n,
--- a/cuda/spgpu/kernels/zaxpby.cu
+++ b/cuda/spgpu/kernels/zaxpby.cu
@ -33,8 +33,9 @@ extern "C"
 __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-	
-	if (id < n)
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	for ( ; id < n; id +=gridSize)
+		//if (id,n) 
 	{
 		// Since z, x and y are accessed with the same offset by the same thread,
 		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@ -46,7 +47,29 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta,
 	}
 }

+#if 1
+void spgpuZaxpby(spgpuHandle_t handle,
+	__device cuDoubleComplex *z,
+	int n,
+	cuDoubleComplex beta,
+	__device cuDoubleComplex *y,
+	cuDoubleComplex alpha,
+	__device cuDoubleComplex* x)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp         = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp  = max_threads_mp/BLOCK_SIZE;
+	num_blocks     = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);

+	spgpuZaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
 void spgpuZaxpby_(spgpuHandle_t handle,
 	__device cuDoubleComplex *z,
 	int n,
@ -86,7 +109,7 @@ void spgpuZaxpby(spgpuHandle_t handle,

 	cudaCheckError("CUDA error on daxpby");
 }
-
+#endif
 void spgpuZmaxpby(spgpuHandle_t handle,
 		  __device cuDoubleComplex *z,
 		  int n,