diff --git a/cuda/spgpu/kernels/caxpby.cu b/cuda/spgpu/kernels/caxpby.cu
index 33deecbc..3e97f75f 100644
--- a/cuda/spgpu/kernels/caxpby.cu
+++ b/cuda/spgpu/kernels/caxpby.cu
@@ -22,6 +22,9 @@ extern "C"
 {
 #include "core.h"
 #include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+  //#include "cuda_util.h"
 }
 
 
@@ -29,6 +32,8 @@ extern "C"
 
 #define BLOCK_SIZE 512
 
+#if 1
+
 __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
 {
   int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@@ -51,7 +56,6 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, c
   }
 }
 
-#if 1
 void spgpuCaxpby(spgpuHandle_t handle,
   __device cuFloatComplex *z,
   int n,
@@ -63,10 +67,8 @@ void spgpuCaxpby(spgpuHandle_t handle,
   int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
   int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
   dim3 block(BLOCK_SIZE);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-  num_mp = deviceProp.multiProcessorCount;
-  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_mp = getGPUMultiProcessors();
+  max_threads_mp = getGPUMaxThreadsPerMP();
   num_blocks_mp = max_threads_mp/BLOCK_SIZE;
   num_blocks = num_blocks_mp*num_mp;
   dim3 grid(num_blocks);
@@ -75,6 +77,23 @@ void spgpuCaxpby(spgpuHandle_t handle,
   spgpuCaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
 #else
+
+__global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
+{
+  int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
+
+  if (id < n)
+  {
+    // Since z, x and y are accessed with the same offset by the same thread,
+    // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+
+    if (cuFloatComplex_isZero(beta))
+      z[id] = cuCmulf(alpha,x[id]);
+    else
+      z[id] = cuCfmaf(beta, y[id], cuCmulf(alpha, x[id]));
+  }
+}
+
 void spgpuCaxpby_(spgpuHandle_t handle,
   __device cuFloatComplex *z,
   int n,
diff --git a/cuda/spgpu/kernels/dabgdxyz.cu b/cuda/spgpu/kernels/dabgdxyz.cu
index 525371d3..f2b18e02 100644
--- a/cuda/spgpu/kernels/dabgdxyz.cu
+++ b/cuda/spgpu/kernels/dabgdxyz.cu
@@ -22,6 +22,8 @@ extern "C"
 {
 #include "core.h"
 #include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
 }
 
 
@@ -65,10 +67,8 @@ void spgpuDabgdxyz(spgpuHandle_t handle,
   int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
   int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
   dim3 block(BLOCK_SIZE);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-  num_mp = deviceProp.multiProcessorCount;
-  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_mp = getGPUMultiProcessors();
+  max_threads_mp = getGPUMaxThreadsPerMP();
   num_blocks_mp = max_threads_mp/BLOCK_SIZE;
   num_blocks = num_blocks_mp*num_mp;
   dim3 grid(num_blocks);
diff --git a/cuda/spgpu/kernels/daxpby.cu b/cuda/spgpu/kernels/daxpby.cu
index ce7c0dd4..fa87d996 100644
--- a/cuda/spgpu/kernels/daxpby.cu
+++ b/cuda/spgpu/kernels/daxpby.cu
@@ -22,6 +22,9 @@ extern "C"
 {
 #include "core.h"
 #include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+  //#include "cuda_util.h"
 }
 
 
@@ -29,6 +32,8 @@ extern "C"
 
 #define BLOCK_SIZE 512
 
+
+#if 1
 __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
 {
   int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@@ -36,23 +41,17 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double
 
   if (beta == 0.0) {
     for ( ; id < n; id +=gridSize)
     {
-      // Since z, x and y are accessed with the same offset by the same thread,
-      // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
       z[id] = PREC_DMUL(alpha,x[id]);
     }
   } else {
     for ( ; id < n; id +=gridSize)
     {
-      // Since z, x and y are accessed with the same offset by the same thread,
-      // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
       z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
     }
   }
 }
 
-#if 1
-
 void spgpuDaxpby(spgpuHandle_t handle,
   __device double *z,
   int n,
@@ -64,10 +63,8 @@ void spgpuDaxpby(spgpuHandle_t handle,
   int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
   int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
   dim3 block(BLOCK_SIZE);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-  num_mp = deviceProp.multiProcessorCount;
-  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_mp = getGPUMultiProcessors();
+  max_threads_mp = getGPUMaxThreadsPerMP();
   num_blocks_mp = max_threads_mp/BLOCK_SIZE;
   num_blocks = num_blocks_mp*num_mp;
   dim3 grid(num_blocks);
@@ -75,6 +72,23 @@ void spgpuDaxpby(spgpuHandle_t handle,
   spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
 #else
+
+__global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
+{
+  int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
+
+  if (id < n)
+  {
+    // Since z, x and y are accessed with the same offset by the same thread,
+    // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+
+    if (beta == 0.0)
+      z[id] = PREC_DMUL(alpha,x[id]);
+    else
+      z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
+  }
+}
+
 void spgpuDaxpby_(spgpuHandle_t handle,
   __device double *z,
   int n,
diff --git a/cuda/spgpu/kernels/saxpby.cu b/cuda/spgpu/kernels/saxpby.cu
index 36c3cdbe..2f06e39c 100644
--- a/cuda/spgpu/kernels/saxpby.cu
+++ b/cuda/spgpu/kernels/saxpby.cu
@@ -20,6 +20,9 @@ extern "C"
 {
 #include "core.h"
 #include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+  //#include "cuda_util.h"
 }
 
 
@@ -27,6 +30,8 @@ extern "C"
 
 #define BLOCK_SIZE 512
 
+
+#if 1
 __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
 {
   int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@@ -49,8 +54,6 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alp
   }
 }
 
-
-#if 1
 void spgpuSaxpby(spgpuHandle_t handle,
   __device float *z,
   int n,
@@ -62,17 +65,35 @@ void spgpuSaxpby(spgpuHandle_t handle,
   int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
   int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
   dim3 block(BLOCK_SIZE);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-  num_mp = deviceProp.multiProcessorCount;
-  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_mp = getGPUMultiProcessors();
+  max_threads_mp = getGPUMaxThreadsPerMP();
   num_blocks_mp = max_threads_mp/BLOCK_SIZE;
   num_blocks = num_blocks_mp*num_mp;
   dim3 grid(num_blocks);
 
   spgpuSaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
+
 #else
+
+__global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
+{
+  int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
+
+  if (id < n)
+  {
+    // Since z, x and y are accessed with the same offset by the same thread,
+    // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+
+    if (beta == 0.0f)
+      z[id] = PREC_FMUL(alpha,x[id]);
+    else
+      z[id] = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id]));
+  }
+}
+
+
+
 void spgpuSaxpby_(spgpuHandle_t handle,
   __device float *z,
   int n,
diff --git a/cuda/spgpu/kernels/zaxpby.cu b/cuda/spgpu/kernels/zaxpby.cu
index 8aec3e17..8efc40d2 100644
--- a/cuda/spgpu/kernels/zaxpby.cu
+++ b/cuda/spgpu/kernels/zaxpby.cu
@@ -23,6 +23,9 @@ extern "C"
 {
 #include "core.h"
 #include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+  //#include "cuda_util.h"
 }
 
 
@@ -30,6 +33,7 @@ extern "C"
 
 #define BLOCK_SIZE 512
 
+#if 1
 __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
 {
   int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@@ -52,7 +56,6 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta,
   }
 }
 
-#if 1
 void spgpuZaxpby(spgpuHandle_t handle,
   __device cuDoubleComplex *z,
   int n,
@@ -64,10 +67,8 @@ void spgpuZaxpby(spgpuHandle_t handle,
   int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
   int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
   dim3 block(BLOCK_SIZE);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-  num_mp = deviceProp.multiProcessorCount;
-  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_mp = getGPUMultiProcessors();
+  max_threads_mp = getGPUMaxThreadsPerMP();
   num_blocks_mp = max_threads_mp/BLOCK_SIZE;
   num_blocks = num_blocks_mp*num_mp;
   dim3 grid(num_blocks);
@@ -75,6 +76,23 @@ void spgpuZaxpby(spgpuHandle_t handle,
   spgpuZaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
 #else
+__global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
+{
+  int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
+
+  if (id < n)
+  {
+    // Since z, x and y are accessed with the same offset by the same thread,
+    // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+
+    if (cuDoubleComplex_isZero(beta))
+      z[id] = cuCmul(alpha,x[id]);
+    else
+      z[id] = cuCfma(alpha, x[id], cuCmul(beta,y[id]));
+  }
+}
+
+
 void spgpuZaxpby_(spgpuHandle_t handle,
   __device cuDoubleComplex *z,
   int n,