diff --git a/cuda/spgpu/kernels/caxpby.cu b/cuda/spgpu/kernels/caxpby.cu
index d3d326ef..16eb87ed 100644
--- a/cuda/spgpu/kernels/caxpby.cu
+++ b/cuda/spgpu/kernels/caxpby.cu
@@ -32,8 +32,9 @@ extern "C"
 __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-	
-	if (id < n)
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	for ( ; id < n; id +=gridSize)
+	//if (id,n)
 	{
 		// Since z, x and y are accessed with the same offset by the same thread,
 		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@@ -45,7 +46,30 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, c
 	}
 }
 
 
+#if 1
+void spgpuCaxpby(spgpuHandle_t handle,
+	__device cuFloatComplex *z,
+	int n,
+	cuFloatComplex beta,
+	__device cuFloatComplex *y,
+	cuFloatComplex alpha,
+	__device cuFloatComplex* x)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+	num_blocks = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+
+	spgpuCaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
 void spgpuCaxpby_(spgpuHandle_t handle,
 	__device cuFloatComplex *z,
 	int n,
@@ -55,9 +79,15 @@ void spgpuCaxpby_(spgpuHandle_t handle,
 	__device cuFloatComplex* x)
 {
 	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
-	
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
 	dim3 block(BLOCK_SIZE);
-	dim3 grid(msize);
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+	num_blocks = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
 
 	spgpuCaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
@@ -86,7 +116,7 @@ void spgpuCaxpby(spgpuHandle_t handle,
 	cudaCheckError("CUDA error on saxpby");
 }
 
-
+#endif
 void spgpuCmaxpby(spgpuHandle_t handle,
 	__device cuFloatComplex *z,
 	int n,
diff --git a/cuda/spgpu/kernels/daxpby.cu b/cuda/spgpu/kernels/daxpby.cu
index 83724ce2..a0a163a2 100644
--- a/cuda/spgpu/kernels/daxpby.cu
+++ b/cuda/spgpu/kernels/daxpby.cu
@@ -16,6 +16,7 @@
 
 #include "cudadebug.h"
 #include "cudalang.h"
+#include <stdio.h>
 
 extern "C"
 {
@@ -31,8 +32,9 @@ extern "C"
 __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-	
-	if (id < n)
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	for ( ; id < n; id +=gridSize)
+	//if (id,n)
 	{
 		// Since z, x and y are accessed with the same offset by the same thread,
 		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@@ -44,8 +46,9 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double
 	}
 }
 
 
+#if 1
-void spgpuDaxpby_(spgpuHandle_t handle,
+void spgpuDaxpby(spgpuHandle_t handle,
 	__device double *z,
 	int n,
 	double beta,
@@ -54,9 +57,37 @@ void spgpuDaxpby_(spgpuHandle_t handle,
 	__device double* x)
 {
 	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
 
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+	num_blocks = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+	spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
+void spgpuDaxpby_(spgpuHandle_t handle,
+	__device double *z,
+	int n,
+	double beta,
+	__device double *y,
+	double alpha,
+	__device double* x)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
 	dim3 block(BLOCK_SIZE);
-	dim3 grid(msize);
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+	num_blocks = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
 
 	spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
@@ -84,7 +115,7 @@ void spgpuDaxpby(spgpuHandle_t handle,
 	cudaCheckError("CUDA error on daxpby");
 }
 
-
+#endif
 void spgpuDmaxpby(spgpuHandle_t handle,
 	__device double *z,
 	int n,
diff --git a/cuda/spgpu/kernels/saxpby.cu b/cuda/spgpu/kernels/saxpby.cu
index 2c46f19e..42e2a7a7 100644
--- a/cuda/spgpu/kernels/saxpby.cu
+++ b/cuda/spgpu/kernels/saxpby.cu
@@ -30,8 +30,9 @@ extern "C"
 __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-	
-	if (id < n)
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	for ( ; id < n; id +=gridSize)
+	//if (id,n)
 	{
 		// Since z, x and y are accessed with the same offset by the same thread,
 		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@@ -44,6 +45,29 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alp
 }
 
 
+#if 1
+void spgpuSaxpby(spgpuHandle_t handle,
+	__device float *z,
+	int n,
+	float beta,
+	__device float *y,
+	float alpha,
+	__device float* x)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+	num_blocks = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+
+	spgpuSaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
 void spgpuSaxpby_(spgpuHandle_t handle,
 	__device float *z,
 	int n,
@@ -83,7 +107,7 @@ void spgpuSaxpby(spgpuHandle_t handle,
 	cudaCheckError("CUDA error on saxpby");
 }
 
-
+#endif
 void spgpuSmaxpby(spgpuHandle_t handle,
 	__device float *z,
 	int n,
diff --git a/cuda/spgpu/kernels/zaxpby.cu b/cuda/spgpu/kernels/zaxpby.cu
index 7f9d5797..da438fc2 100644
--- a/cuda/spgpu/kernels/zaxpby.cu
+++ b/cuda/spgpu/kernels/zaxpby.cu
@@ -33,8 +33,9 @@ extern "C"
 __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-	
-	if (id < n)
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	for ( ; id < n; id +=gridSize)
+	//if (id,n)
 	{
 		// Since z, x and y are accessed with the same offset by the same thread,
 		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@@ -46,7 +47,29 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta,
 	}
 }
 
 
+#if 1
+void spgpuZaxpby(spgpuHandle_t handle,
+	__device cuDoubleComplex *z,
+	int n,
+	cuDoubleComplex beta,
+	__device cuDoubleComplex *y,
+	cuDoubleComplex alpha,
+	__device cuDoubleComplex* x)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+	num_blocks = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+	spgpuZaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
 void spgpuZaxpby_(spgpuHandle_t handle,
 	__device cuDoubleComplex *z,
 	int n,
@@ -86,7 +109,7 @@ void spgpuZaxpby(spgpuHandle_t handle,
 	cudaCheckError("CUDA error on daxpby");
 }
 
-
+#endif
 void spgpuZmaxpby(spgpuHandle_t handle,
 	__device cuDoubleComplex *z,
 	int n,