Switch FOR and IF in AXPBY

nond-rep
Salvatore Filippone 10 months ago
parent f9677bc892
commit 1ba8dfc7b7

@ -33,16 +33,21 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, c
{ {
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
unsigned int gridSize = blockDim.x * gridDim.x; unsigned int gridSize = blockDim.x * gridDim.x;
for ( ; id < n; id +=gridSize) if (cuFloatComplex_isZero(beta)) {
//if (id,n) for ( ; id < n; id +=gridSize)
{ //if (id,n)
// Since z, x and y are accessed with the same offset by the same thread, {
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing). // Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (cuFloatComplex_isZero(beta))
z[id] = cuCmulf(alpha,x[id]); z[id] = cuCmulf(alpha,x[id]);
else }
z[id] = cuCfmaf(beta, y[id], cuCmulf(alpha, x[id])); } else {
for ( ; id < n; id +=gridSize)
//if (id,n)
{
z[id] = cuCfmaf(beta, y[id], cuCmulf(alpha, x[id]));
}
} }
} }

@ -33,16 +33,21 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double
{ {
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
unsigned int gridSize = blockDim.x * gridDim.x; unsigned int gridSize = blockDim.x * gridDim.x;
for ( ; id < n; id +=gridSize) if (beta == 0.0) {
//if (id,n) for ( ; id < n; id +=gridSize)
{ {
// Since z, x and y are accessed with the same offset by the same thread, // Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing). // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (beta == 0.0) z[id] = PREC_DMUL(alpha,x[id]);
z[id] = PREC_DMUL(alpha,x[id]); }
else } else {
z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id])); for ( ; id < n; id +=gridSize)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
}
} }
} }

@ -31,16 +31,21 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alp
{ {
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
unsigned int gridSize = blockDim.x * gridDim.x; unsigned int gridSize = blockDim.x * gridDim.x;
for ( ; id < n; id +=gridSize) if (beta == 0.0f) {
//if (id,n) for ( ; id < n; id +=gridSize)
{ {
// Since z, x and y are accessed with the same offset by the same thread, // Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing). // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (beta == 0.0f) z[id] = PREC_FMUL(alpha,x[id]);
z[id] = PREC_FMUL(alpha,x[id]); }
else } else {
z[id] = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id])); for ( ; id < n; id +=gridSize)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
z[id] = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id]));
}
} }
} }

@ -34,16 +34,21 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta,
{ {
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
unsigned int gridSize = blockDim.x * gridDim.x; unsigned int gridSize = blockDim.x * gridDim.x;
for ( ; id < n; id +=gridSize) if (cuDoubleComplex_isZero(beta)) {
//if (id,n) for ( ; id < n; id +=gridSize)
{ //if (id,n)
// Since z, x and y are accessed with the same offset by the same thread, {
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing). // Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (cuDoubleComplex_isZero(beta))
z[id] = cuCmul(alpha,x[id]); z[id] = cuCmul(alpha,x[id]);
else }
z[id] = cuCfma(alpha, x[id], cuCmul(beta,y[id])); } else {
for ( ; id < n; id +=gridSize)
//if (id,n)
{
z[id] = cuCfma(beta, y[id], cuCmul(alpha, x[id]));
}
} }
} }

Loading…
Cancel
Save