Switch FOR and IF in AXPBY

10 months ago · 1ba8dfc7b7
parent f9677bc892
commit 1ba8dfc7b7
4 changed files with 60 additions and 40 deletions
--- a/cuda/spgpu/kernels/caxpby.cu
+++ b/cuda/spgpu/kernels/caxpby.cu
@@ -33,16 +33,21 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, c
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
 	unsigned int gridSize = blockDim.x * gridDim.x;
-	for ( ; id < n; id +=gridSize)
+	if (cuFloatComplex_isZero(beta)) {
-		//if (id,n) 
+	  for ( ; id < n; id +=gridSize)
-	{
+	    //if (id,n) 
-		// Since z, x and y are accessed with the same offset by the same thread,
+	    {
-		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+	      // Since z, x and y are accessed with the same offset by the same thread,
-
+	      // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
-		if (cuFloatComplex_isZero(beta))
+	      
-			z[id] = cuCmulf(alpha,x[id]);
+	      z[id] = cuCmulf(alpha,x[id]);
-		else
+	    }
-			z[id] = cuCfmaf(beta, y[id], cuCmulf(alpha, x[id]));
+	} else {
 	  for ( ; id < n; id +=gridSize)
 	    //if (id,n) 
 	    {
 	      z[id] = cuCfmaf(beta, y[id], cuCmulf(alpha, x[id]));
 	    }
 	}
 }
--- a/cuda/spgpu/kernels/daxpby.cu
+++ b/cuda/spgpu/kernels/daxpby.cu
@@ -33,16 +33,21 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
 	unsigned int gridSize = blockDim.x * gridDim.x;
-	for ( ; id < n; id +=gridSize)
+	if (beta == 0.0) {
-		//if (id,n) 
+	  for ( ; id < n; id +=gridSize)
-	{
+	    {
-		// Since z, x and y are accessed with the same offset by the same thread,
+	      // Since z, x and y are accessed with the same offset by the same thread,
-		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+	      // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
-
+	      
-		if (beta == 0.0)
+	      z[id] = PREC_DMUL(alpha,x[id]);
-			z[id] = PREC_DMUL(alpha,x[id]);
+	    }
-		else
+	} else {
-			z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
+	  for ( ; id < n; id +=gridSize)
 	    {
 	      // Since z, x and y are accessed with the same offset by the same thread,
 	      // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
 	      z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
 	    }
 	}
 }
--- a/cuda/spgpu/kernels/saxpby.cu
+++ b/cuda/spgpu/kernels/saxpby.cu
@@ -31,16 +31,21 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alp
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
 	unsigned int gridSize = blockDim.x * gridDim.x;
-	for ( ; id < n; id +=gridSize)
+	if (beta == 0.0f) {
-		//if (id,n) 
+	  for ( ; id < n; id +=gridSize)
-	{
+	    {
-		// Since z, x and y are accessed with the same offset by the same thread,
+	      // Since z, x and y are accessed with the same offset by the same thread,
-		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+	      // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
-
+	      
-		if (beta == 0.0f)
+	      z[id] = PREC_FMUL(alpha,x[id]);
-			z[id] = PREC_FMUL(alpha,x[id]);
+	    }
-		else
+	} else {
-			z[id] = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id]));
+	  for ( ; id < n; id +=gridSize)
 	    {
 	      // Since z, x and y are accessed with the same offset by the same thread,
 	      // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
 	      z[id] = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id]));
 	    }
 	}
 }
--- a/cuda/spgpu/kernels/zaxpby.cu
+++ b/cuda/spgpu/kernels/zaxpby.cu
@@ -34,16 +34,21 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta,
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
 	unsigned int gridSize = blockDim.x * gridDim.x;
-	for ( ; id < n; id +=gridSize)
+	if (cuDoubleComplex_isZero(beta)) {
-		//if (id,n) 
+	  for ( ; id < n; id +=gridSize)
-	{
+	    //if (id,n) 
-		// Since z, x and y are accessed with the same offset by the same thread,
+	    {
-		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+	      // Since z, x and y are accessed with the same offset by the same thread,
-
+	      // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
-		if (cuDoubleComplex_isZero(beta))
+	      
-			z[id] = cuCmul(alpha,x[id]);
+	      z[id] = cuCmul(alpha,x[id]);
-		else
+	    }
-			z[id] = cuCfma(alpha, x[id], cuCmul(beta,y[id]));
+	} else {
 	  for ( ; id < n; id +=gridSize)
 	    //if (id,n) 
 	    {
 	      z[id] = cuCfma(beta, y[id], cuCmul(alpha, x[id]));
 	    }
 	}
 }