diff --git a/cuda/spgpu/kernels/caxpby.cu b/cuda/spgpu/kernels/caxpby.cu
index 16eb87ed..33deecbc 100644
--- a/cuda/spgpu/kernels/caxpby.cu
+++ b/cuda/spgpu/kernels/caxpby.cu
@@ -33,16 +33,21 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, c
 {
     int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
     unsigned int gridSize = blockDim.x * gridDim.x;
-    for ( ; id < n; id +=gridSize)
-    //if (id,n)
-    {
-        // Since z, x and y are accessed with the same offset by the same thread,
-        // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
-
-        if (cuFloatComplex_isZero(beta))
-            z[id] = cuCmulf(alpha,x[id]);
-        else
-            z[id] = cuCfmaf(beta, y[id], cuCmulf(alpha, x[id]));
+    if (cuFloatComplex_isZero(beta)) {
+        for ( ; id < n; id +=gridSize)
+        //if (id,n)
+        {
+            // Since z, x and y are accessed with the same offset by the same thread,
+            // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+
+            z[id] = cuCmulf(alpha,x[id]);
+        }
+    } else {
+        for ( ; id < n; id +=gridSize)
+        //if (id,n)
+        {
+            z[id] = cuCfmaf(beta, y[id], cuCmulf(alpha, x[id]));
+        }
     }
 }
 
diff --git a/cuda/spgpu/kernels/daxpby.cu b/cuda/spgpu/kernels/daxpby.cu
index a0a163a2..ce7c0dd4 100644
--- a/cuda/spgpu/kernels/daxpby.cu
+++ b/cuda/spgpu/kernels/daxpby.cu
@@ -33,16 +33,21 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double
 {
     int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
     unsigned int gridSize = blockDim.x * gridDim.x;
-    for ( ; id < n; id +=gridSize)
-    //if (id,n)
-    {
-        // Since z, x and y are accessed with the same offset by the same thread,
-        // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
-
-        if (beta == 0.0)
-            z[id] = PREC_DMUL(alpha,x[id]);
-        else
-            z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
+    if (beta == 0.0) {
+        for ( ; id < n; id +=gridSize)
+        {
+            // Since z, x and y are accessed with the same offset by the same thread,
+            // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+
+            z[id] = PREC_DMUL(alpha,x[id]);
+        }
+    } else {
+        for ( ; id < n; id +=gridSize)
+        {
+            // Since z, x and y are accessed with the same offset by the same thread,
+            // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+            z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
+        }
     }
 }
 
diff --git a/cuda/spgpu/kernels/saxpby.cu b/cuda/spgpu/kernels/saxpby.cu
index 42e2a7a7..36c3cdbe 100644
--- a/cuda/spgpu/kernels/saxpby.cu
+++ b/cuda/spgpu/kernels/saxpby.cu
@@ -31,16 +31,21 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alp
 {
     int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
     unsigned int gridSize = blockDim.x * gridDim.x;
-    for ( ; id < n; id +=gridSize)
-    //if (id,n)
-    {
-        // Since z, x and y are accessed with the same offset by the same thread,
-        // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
-
-        if (beta == 0.0f)
-            z[id] = PREC_FMUL(alpha,x[id]);
-        else
-            z[id] = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id]));
+    if (beta == 0.0f) {
+        for ( ; id < n; id +=gridSize)
+        {
+            // Since z, x and y are accessed with the same offset by the same thread,
+            // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+
+            z[id] = PREC_FMUL(alpha,x[id]);
+        }
+    } else {
+        for ( ; id < n; id +=gridSize)
+        {
+            // Since z, x and y are accessed with the same offset by the same thread,
+            // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+            z[id] = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id]));
+        }
     }
 }
 
diff --git a/cuda/spgpu/kernels/zaxpby.cu b/cuda/spgpu/kernels/zaxpby.cu
index da438fc2..8aec3e17 100644
--- a/cuda/spgpu/kernels/zaxpby.cu
+++ b/cuda/spgpu/kernels/zaxpby.cu
@@ -34,16 +34,21 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta,
 {
     int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
     unsigned int gridSize = blockDim.x * gridDim.x;
-    for ( ; id < n; id +=gridSize)
-    //if (id,n)
-    {
-        // Since z, x and y are accessed with the same offset by the same thread,
-        // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
-
-        if (cuDoubleComplex_isZero(beta))
-            z[id] = cuCmul(alpha,x[id]);
-        else
-            z[id] = cuCfma(alpha, x[id], cuCmul(beta,y[id]));
+    if (cuDoubleComplex_isZero(beta)) {
+        for ( ; id < n; id +=gridSize)
+        //if (id,n)
+        {
+            // Since z, x and y are accessed with the same offset by the same thread,
+            // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
+
+            z[id] = cuCmul(alpha,x[id]);
+        }
+    } else {
+        for ( ; id < n; id +=gridSize)
+        //if (id,n)
+        {
+            z[id] = cuCfma(beta, y[id], cuCmul(alpha, x[id]));
+        }
     }
 }
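The change is the same in all four kernels: the test for beta == 0 is hoisted out of the grid-stride loop, so each thread evaluates it once instead of once per element, and y is never read in the beta == 0 case. A minimal standalone sketch of that pattern for the real single-precision case follows; the kernel name and the plain float arithmetic (in place of the PREC_FMUL/PREC_FADD macros) are illustrative assumptions, not spgpu code.

#include <cuda_runtime.h>

// Illustrative sketch only, not spgpu code: hoist the beta == 0 branch out of
// the grid-stride loop so it is decided once per thread rather than per element.
__global__ void axpby_sketch(float *z, int n, float beta, const float *y,
                             float alpha, const float *x)
{
    int id = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned int gridSize = blockDim.x * gridDim.x;
    if (beta == 0.0f) {
        for ( ; id < n; id += gridSize)
            z[id] = alpha * x[id];               // y is never read when beta == 0
    } else {
        for ( ; id < n; id += gridSize)
            z[id] = alpha * x[id] + beta * y[id];
    }
    // As in the original kernels, each thread reads x[id] and y[id] before writing
    // z[id] at the same offset, so z may alias x or y (in-place computing).
}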