diff --git a/cuda/spgpu/kernels/caxpby.cu b/cuda/spgpu/kernels/caxpby.cu
index d3d326ef..16eb87ed 100644
--- a/cuda/spgpu/kernels/caxpby.cu
+++ b/cuda/spgpu/kernels/caxpby.cu
@@ -32,8 +32,9 @@ extern "C"
 __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-	
-	if (id < n)
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	for ( ; id < n; id +=gridSize)
+	//if (id,n)
 	{
 		// Since z, x and y are accessed with the same offset by the same thread,
 		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@@ -45,7 +46,30 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, c
 	}
 }
 
 
+#if 1
+void spgpuCaxpby(spgpuHandle_t handle,
+	__device cuFloatComplex *z,
+	int n,
+	cuFloatComplex beta,
+	__device cuFloatComplex *y,
+	cuFloatComplex alpha,
+	__device cuFloatComplex* x)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+	num_blocks = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+
+	spgpuCaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
 void spgpuCaxpby_(spgpuHandle_t handle,
 	__device cuFloatComplex *z,
 	int n,
@@ -55,9 +79,15 @@ void spgpuCaxpby_(spgpuHandle_t handle,
 	__device cuFloatComplex* x)
 {
 	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
-	
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
 	dim3 block(BLOCK_SIZE);
-	dim3 grid(msize);
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+	num_blocks = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
 
 	spgpuCaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
@@ -86,7 +116,7 @@ void spgpuCaxpby(spgpuHandle_t handle,
 	cudaCheckError("CUDA error on saxpby");
 }
 
-
+#endif
 void spgpuCmaxpby(spgpuHandle_t handle,
 	__device cuFloatComplex *z,
 	int n,
diff --git a/cuda/spgpu/kernels/daxpby.cu b/cuda/spgpu/kernels/daxpby.cu
index 83724ce2..a0a163a2 100644
--- a/cuda/spgpu/kernels/daxpby.cu
+++ b/cuda/spgpu/kernels/daxpby.cu
@@ -16,6 +16,7 @@
 
 #include "cudadebug.h"
 #include "cudalang.h"
+#include <stdio.h>
 
 extern "C"
 {
@@ -31,8 +32,9 @@ extern "C"
 __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-	
-	if (id < n)
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	for ( ; id < n; id +=gridSize)
+	//if (id,n)
 	{
 		// Since z, x and y are accessed with the same offset by the same thread,
 		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@@ -44,8 +46,9 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double
 	}
 }
 
 
+#if 1
-void spgpuDaxpby_(spgpuHandle_t handle,
+void spgpuDaxpby(spgpuHandle_t handle,
 	__device double *z,
 	int n,
 	double beta,
@@ -54,9 +57,37 @@ void spgpuDaxpby_(spgpuHandle_t handle,
 	__device double* x)
 {
 	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
 
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+	num_blocks = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+	spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
+void spgpuDaxpby_(spgpuHandle_t handle,
+	__device double *z,
+	int n,
+	double beta,
+	__device double *y,
+	double alpha,
+	__device double* x)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
 	dim3 block(BLOCK_SIZE);
-	dim3 grid(msize);
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+	num_blocks = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
 
 	spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
@@ -84,7 +115,7 @@ void spgpuDaxpby(spgpuHandle_t handle,
 	cudaCheckError("CUDA error on daxpby");
 }
 
-
+#endif
 void spgpuDmaxpby(spgpuHandle_t handle,
 	__device double *z,
 	int n,
diff --git a/cuda/spgpu/kernels/saxpby.cu b/cuda/spgpu/kernels/saxpby.cu
index 2c46f19e..42e2a7a7 100644
--- a/cuda/spgpu/kernels/saxpby.cu
+++ b/cuda/spgpu/kernels/saxpby.cu
@@ -30,8 +30,9 @@ extern "C"
 __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-	
-	if (id < n)
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	for ( ; id < n; id +=gridSize)
+	//if (id,n)
 	{
 		// Since z, x and y are accessed with the same offset by the same thread,
 		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@@ -44,6 +45,29 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alp
 }
 
 
+#if 1
+void spgpuSaxpby(spgpuHandle_t handle,
+	__device float *z,
+	int n,
+	float beta,
+	__device float *y,
+	float alpha,
+	__device float* x)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+	num_blocks = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+
+	spgpuSaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
 void spgpuSaxpby_(spgpuHandle_t handle,
 	__device float *z,
 	int n,
@@ -83,7 +107,7 @@ void spgpuSaxpby(spgpuHandle_t handle,
 	cudaCheckError("CUDA error on saxpby");
 }
 
-
+#endif
 void spgpuSmaxpby(spgpuHandle_t handle,
 	__device float *z,
 	int n,
diff --git a/cuda/spgpu/kernels/zaxpby.cu b/cuda/spgpu/kernels/zaxpby.cu
index 7f9d5797..da438fc2 100644
--- a/cuda/spgpu/kernels/zaxpby.cu
+++ b/cuda/spgpu/kernels/zaxpby.cu
@@ -33,8 +33,9 @@ extern "C"
 __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
 {
 	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
-	
-	if (id < n)
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	for ( ; id < n; id +=gridSize)
+	//if (id,n)
 	{
 		// Since z, x and y are accessed with the same offset by the same thread,
 		// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@@ -46,7 +47,29 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta,
 	}
 }
 
 
+#if 1
+void spgpuZaxpby(spgpuHandle_t handle,
+	__device cuDoubleComplex *z,
+	int n,
+	cuDoubleComplex beta,
+	__device cuDoubleComplex *y,
+	cuDoubleComplex alpha,
+	__device cuDoubleComplex* x)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	num_mp = deviceProp.multiProcessorCount;
+	max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+	num_blocks_mp = max_threads_mp/BLOCK_SIZE;
+	num_blocks = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+	spgpuZaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
+}
+#else
 void spgpuZaxpby_(spgpuHandle_t handle,
 	__device cuDoubleComplex *z,
 	int n,
@@ -86,7 +109,7 @@ void spgpuZaxpby(spgpuHandle_t handle,
 	cudaCheckError("CUDA error on daxpby");
 }
 
-
+#endif
 void spgpuZmaxpby(spgpuHandle_t handle,
 	__device cuDoubleComplex *z,
 	int n,