diff --git a/cuda/spgpu/kernels/caxpby.cu b/cuda/spgpu/kernels/caxpby.cu
index 33deecbc..3e97f75f 100644
--- a/cuda/spgpu/kernels/caxpby.cu
+++ b/cuda/spgpu/kernels/caxpby.cu
@@ -22,6 +22,9 @@ extern "C"
 {
 #include "core.h"
 #include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+  //#include "cuda_util.h"
 }
 
 
@@ -29,6 +32,8 @@ extern "C"
 
 #define BLOCK_SIZE 512
 
+#if 1
+
 __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
 {
   int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@@ -51,7 +56,6 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, c
   }
 }
 
-#if 1
 void spgpuCaxpby(spgpuHandle_t handle,
   __device cuFloatComplex *z,
   int n,
@@ -63,10 +67,8 @@ void spgpuCaxpby(spgpuHandle_t handle,
   int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
   int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
   dim3 block(BLOCK_SIZE);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-  num_mp = deviceProp.multiProcessorCount;
-  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_mp = getGPUMultiProcessors();
+  max_threads_mp = getGPUMaxThreadsPerMP();
   num_blocks_mp = max_threads_mp/BLOCK_SIZE;
   num_blocks = num_blocks_mp*num_mp;
   dim3 grid(num_blocks);
@@ -75,6 +77,23 @@ void spgpuCaxpby(spgpuHandle_t handle,
   spgpuCaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
 #else
+
+__global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
+{
+  int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
+
+  if (id < n)
+  {
+    // Since z, x and y are accessed with the same offset by the same thread,
+    // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+
+    if (cuFloatComplex_isZero(beta))
+      z[id] = cuCmulf(alpha,x[id]);
+    else
+      z[id] = cuCfmaf(beta, y[id], cuCmulf(alpha, x[id]));
+  }
+}
+
 void spgpuCaxpby_(spgpuHandle_t handle,
   __device cuFloatComplex *z,
   int n,
diff --git a/cuda/spgpu/kernels/dabgdxyz.cu b/cuda/spgpu/kernels/dabgdxyz.cu
index 525371d3..f2b18e02 100644
--- a/cuda/spgpu/kernels/dabgdxyz.cu
+++ b/cuda/spgpu/kernels/dabgdxyz.cu
@@ -22,6 +22,8 @@ extern "C"
 {
 #include "core.h"
 #include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
 }
 
 
@@ -65,10 +67,8 @@ void spgpuDabgdxyz(spgpuHandle_t handle,
   int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
   int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
   dim3 block(BLOCK_SIZE);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-  num_mp = deviceProp.multiProcessorCount;
-  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_mp = getGPUMultiProcessors();
+  max_threads_mp = getGPUMaxThreadsPerMP();
   num_blocks_mp = max_threads_mp/BLOCK_SIZE;
   num_blocks = num_blocks_mp*num_mp;
   dim3 grid(num_blocks);
diff --git a/cuda/spgpu/kernels/daxpby.cu b/cuda/spgpu/kernels/daxpby.cu
index ce7c0dd4..fa87d996 100644
--- a/cuda/spgpu/kernels/daxpby.cu
+++ b/cuda/spgpu/kernels/daxpby.cu
@@ -22,6 +22,9 @@ extern "C"
 {
 #include "core.h"
 #include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+  //#include "cuda_util.h"
 }
 
 
@@ -29,6 +32,8 @@ extern "C"
 
 #define BLOCK_SIZE 512
 
+
+#if 1
 __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
 {
   int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@@ -36,23 +41,17 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double
 
   if (beta == 0.0) {
     for ( ; id < n; id +=gridSize)
     {
-      // Since z, x and y are accessed with the same offset by the same thread,
-      // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
       z[id] = PREC_DMUL(alpha,x[id]);
     }
   } else {
     for ( ; id < n; id +=gridSize)
     {
-      // Since z, x and y are accessed with the same offset by the same thread,
-      // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
       z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
     }
   }
 }
 
-#if 1
-
 void spgpuDaxpby(spgpuHandle_t handle,
   __device double *z,
   int n,
@@ -64,10 +63,8 @@ void spgpuDaxpby(spgpuHandle_t handle,
   int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
   int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
   dim3 block(BLOCK_SIZE);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-  num_mp = deviceProp.multiProcessorCount;
-  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_mp = getGPUMultiProcessors();
+  max_threads_mp = getGPUMaxThreadsPerMP();
   num_blocks_mp = max_threads_mp/BLOCK_SIZE;
   num_blocks = num_blocks_mp*num_mp;
   dim3 grid(num_blocks);
@@ -75,6 +72,23 @@ void spgpuDaxpby(spgpuHandle_t handle,
   spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
 #else
+
+__global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
+{
+  int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
+
+  if (id < n)
+  {
+    // Since z, x and y are accessed with the same offset by the same thread,
+    // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+
+    if (beta == 0.0)
+      z[id] = PREC_DMUL(alpha,x[id]);
+    else
+      z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
+  }
+}
+
 void spgpuDaxpby_(spgpuHandle_t handle,
   __device double *z,
   int n,
diff --git a/cuda/spgpu/kernels/saxpby.cu b/cuda/spgpu/kernels/saxpby.cu
index 36c3cdbe..2f06e39c 100644
--- a/cuda/spgpu/kernels/saxpby.cu
+++ b/cuda/spgpu/kernels/saxpby.cu
@@ -20,6 +20,9 @@ extern "C"
 {
 #include "core.h"
 #include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+  //#include "cuda_util.h"
 }
 
 
@@ -27,6 +30,8 @@ extern "C"
 
 #define BLOCK_SIZE 512
 
+
+#if 1
 __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
 {
   int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@@ -49,8 +54,6 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alp
   }
 }
 
-
-#if 1
 void spgpuSaxpby(spgpuHandle_t handle,
   __device float *z,
   int n,
@@ -62,17 +65,35 @@ void spgpuSaxpby(spgpuHandle_t handle,
   int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
   int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
   dim3 block(BLOCK_SIZE);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-  num_mp = deviceProp.multiProcessorCount;
-  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_mp = getGPUMultiProcessors();
+  max_threads_mp = getGPUMaxThreadsPerMP();
   num_blocks_mp = max_threads_mp/BLOCK_SIZE;
   num_blocks = num_blocks_mp*num_mp;
   dim3 grid(num_blocks);
 
   spgpuSaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
+
 #else
+
+__global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
+{
+  int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
+
+  if (id < n)
+  {
+    // Since z, x and y are accessed with the same offset by the same thread,
+    // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+
+    if (beta == 0.0f)
+      z[id] = PREC_FMUL(alpha,x[id]);
+    else
+      z[id] = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id]));
+  }
+}
+
+
+
 void spgpuSaxpby_(spgpuHandle_t handle,
   __device float *z,
   int n,
diff --git a/cuda/spgpu/kernels/zaxpby.cu b/cuda/spgpu/kernels/zaxpby.cu
index 8aec3e17..8efc40d2 100644
--- a/cuda/spgpu/kernels/zaxpby.cu
+++ b/cuda/spgpu/kernels/zaxpby.cu
@@ -23,6 +23,9 @@ extern "C"
 {
 #include "core.h"
 #include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+  //#include "cuda_util.h"
 }
 
 
@@ -30,6 +33,7 @@ extern "C"
 
 #define BLOCK_SIZE 512
 
+#if 1
 __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
 {
   int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@@ -52,7 +56,6 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta,
   }
 }
 
-#if 1
 void spgpuZaxpby(spgpuHandle_t handle,
   __device cuDoubleComplex *z,
   int n,
@@ -64,10 +67,8 @@ void spgpuZaxpby(spgpuHandle_t handle,
   int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
   int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
   dim3 block(BLOCK_SIZE);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-  num_mp = deviceProp.multiProcessorCount;
-  max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
+  num_mp = getGPUMultiProcessors();
+  max_threads_mp = getGPUMaxThreadsPerMP();
   num_blocks_mp = max_threads_mp/BLOCK_SIZE;
   num_blocks = num_blocks_mp*num_mp;
   dim3 grid(num_blocks);
@@ -75,6 +76,23 @@ void spgpuZaxpby(spgpuHandle_t handle,
   spgpuZaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
 }
 #else
+__global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
+{
+  int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
+
+  if (id < n)
+  {
+    // Since z, x and y are accessed with the same offset by the same thread,
+    // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+
+    if (cuDoubleComplex_isZero(beta))
+      z[id] = cuCmul(alpha,x[id]);
+    else
+      z[id] = cuCfma(alpha, x[id], cuCmul(beta,y[id]));
+  }
+}
+
+
 void spgpuZaxpby_(spgpuHandle_t handle,
   __device cuDoubleComplex *z,
   int n,