Better AXPBY implementation in CUDA.

nond-rep
Salvatore Filippone 11 months ago
parent f4c7604f61
commit a41b209144

@ -32,8 +32,9 @@ extern "C"
__global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
if (id < n)
unsigned int gridSize = blockDim.x * gridDim.x;
for ( ; id < n; id +=gridSize)
//if (id,n)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@ -45,7 +46,30 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, c
}
}
#if 1
void spgpuCaxpby(spgpuHandle_t handle,
__device cuFloatComplex *z,
int n,
cuFloatComplex beta,
__device cuFloatComplex *y,
cuFloatComplex alpha,
__device cuFloatComplex* x)
{
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks);
spgpuCaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
}
#else
void spgpuCaxpby_(spgpuHandle_t handle,
__device cuFloatComplex *z,
int n,
@ -55,9 +79,15 @@ void spgpuCaxpby_(spgpuHandle_t handle,
__device cuFloatComplex* x)
{
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE);
dim3 grid(msize);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks);
spgpuCaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
}
@ -86,7 +116,7 @@ void spgpuCaxpby(spgpuHandle_t handle,
cudaCheckError("CUDA error on saxpby");
}
#endif
void spgpuCmaxpby(spgpuHandle_t handle,
__device cuFloatComplex *z,
int n,

@ -16,6 +16,7 @@
#include "cudadebug.h"
#include "cudalang.h"
#include <cuda_runtime.h>
extern "C"
{
@ -31,8 +32,9 @@ extern "C"
__global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
if (id < n)
unsigned int gridSize = blockDim.x * gridDim.x;
for ( ; id < n; id +=gridSize)
//if (id,n)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@ -44,8 +46,9 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double
}
}
#if 1
void spgpuDaxpby_(spgpuHandle_t handle,
void spgpuDaxpby(spgpuHandle_t handle,
__device double *z,
int n,
double beta,
@ -54,9 +57,37 @@ void spgpuDaxpby_(spgpuHandle_t handle,
__device double* x)
{
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks);
spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
}
#else
void spgpuDaxpby_(spgpuHandle_t handle,
__device double *z,
int n,
double beta,
__device double *y,
double alpha,
__device double* x)
{
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE);
dim3 grid(msize);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks);
spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
}
@ -84,7 +115,7 @@ void spgpuDaxpby(spgpuHandle_t handle,
cudaCheckError("CUDA error on daxpby");
}
#endif
void spgpuDmaxpby(spgpuHandle_t handle,
__device double *z,
int n,

@ -30,8 +30,9 @@ extern "C"
__global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
if (id < n)
unsigned int gridSize = blockDim.x * gridDim.x;
for ( ; id < n; id +=gridSize)
//if (id,n)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@ -44,6 +45,29 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alp
}
#if 1
void spgpuSaxpby(spgpuHandle_t handle,
__device float *z,
int n,
float beta,
__device float *y,
float alpha,
__device float* x)
{
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks);
spgpuSaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
}
#else
void spgpuSaxpby_(spgpuHandle_t handle,
__device float *z,
int n,
@ -83,7 +107,7 @@ void spgpuSaxpby(spgpuHandle_t handle,
cudaCheckError("CUDA error on saxpby");
}
#endif
void spgpuSmaxpby(spgpuHandle_t handle,
__device float *z,
int n,

@ -33,8 +33,9 @@ extern "C"
__global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
if (id < n)
unsigned int gridSize = blockDim.x * gridDim.x;
for ( ; id < n; id +=gridSize)
//if (id,n)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
@ -46,7 +47,29 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta,
}
}
#if 1
void spgpuZaxpby(spgpuHandle_t handle,
__device cuDoubleComplex *z,
int n,
cuDoubleComplex beta,
__device cuDoubleComplex *y,
cuDoubleComplex alpha,
__device cuDoubleComplex* x)
{
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks);
spgpuZaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
}
#else
void spgpuZaxpby_(spgpuHandle_t handle,
__device cuDoubleComplex *z,
int n,
@ -86,7 +109,7 @@ void spgpuZaxpby(spgpuHandle_t handle,
cudaCheckError("CUDA error on daxpby");
}
#endif
void spgpuZmaxpby(spgpuHandle_t handle,
__device cuDoubleComplex *z,
int n,

Loading…
Cancel
Save