Switch FOR and IF in AXPBY

nond-rep
Salvatore Filippone 11 months ago
parent f9677bc892
commit 1ba8dfc7b7

@ -33,18 +33,23 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, c
{ {
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
unsigned int gridSize = blockDim.x * gridDim.x; unsigned int gridSize = blockDim.x * gridDim.x;
if (cuFloatComplex_isZero(beta)) {
for ( ; id < n; id +=gridSize) for ( ; id < n; id +=gridSize)
//if (id,n) //if (id,n)
{ {
// Since z, x and y are accessed with the same offset by the same thread, // Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing). // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (cuFloatComplex_isZero(beta))
z[id] = cuCmulf(alpha,x[id]); z[id] = cuCmulf(alpha,x[id]);
else }
} else {
for ( ; id < n; id +=gridSize)
//if (id,n)
{
z[id] = cuCfmaf(beta, y[id], cuCmulf(alpha, x[id])); z[id] = cuCfmaf(beta, y[id], cuCmulf(alpha, x[id]));
} }
} }
}
#if 1 #if 1
void spgpuCaxpby(spgpuHandle_t handle, void spgpuCaxpby(spgpuHandle_t handle,

@ -33,18 +33,23 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double
{ {
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
unsigned int gridSize = blockDim.x * gridDim.x; unsigned int gridSize = blockDim.x * gridDim.x;
if (beta == 0.0) {
for ( ; id < n; id +=gridSize) for ( ; id < n; id +=gridSize)
//if (id,n)
{ {
// Since z, x and y are accessed with the same offset by the same thread, // Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing). // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (beta == 0.0)
z[id] = PREC_DMUL(alpha,x[id]); z[id] = PREC_DMUL(alpha,x[id]);
else }
} else {
for ( ; id < n; id +=gridSize)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id])); z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
} }
} }
}
#if 1 #if 1

@ -31,18 +31,23 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alp
{ {
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
unsigned int gridSize = blockDim.x * gridDim.x; unsigned int gridSize = blockDim.x * gridDim.x;
if (beta == 0.0f) {
for ( ; id < n; id +=gridSize) for ( ; id < n; id +=gridSize)
//if (id,n)
{ {
// Since z, x and y are accessed with the same offset by the same thread, // Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing). // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (beta == 0.0f)
z[id] = PREC_FMUL(alpha,x[id]); z[id] = PREC_FMUL(alpha,x[id]);
else }
} else {
for ( ; id < n; id +=gridSize)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
z[id] = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id])); z[id] = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id]));
} }
} }
}
#if 1 #if 1

@ -34,16 +34,21 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta,
{ {
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
unsigned int gridSize = blockDim.x * gridDim.x; unsigned int gridSize = blockDim.x * gridDim.x;
if (cuDoubleComplex_isZero(beta)) {
for ( ; id < n; id +=gridSize) for ( ; id < n; id +=gridSize)
//if (id,n) //if (id,n)
{ {
// Since z, x and y are accessed with the same offset by the same thread, // Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing). // and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (cuDoubleComplex_isZero(beta))
z[id] = cuCmul(alpha,x[id]); z[id] = cuCmul(alpha,x[id]);
else }
z[id] = cuCfma(alpha, x[id], cuCmul(beta,y[id])); } else {
for ( ; id < n; id +=gridSize)
//if (id,n)
{
z[id] = cuCfma(beta, y[id], cuCmul(alpha, x[id]));
}
} }
} }

Loading…
Cancel
Save