From 4681767ef869ac902b30361cbd0b36eb9d5c43fb Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Sat, 17 Feb 2024 18:20:12 +0100 Subject: [PATCH] New implementation for ABGDXYZ in CUDA --- cuda/dvectordev.c | 24 +++++++++++ cuda/psb_d_cuda_vect_mod.F90 | 3 +- cuda/psb_d_vectordev_mod.F90 | 13 ++++++ cuda/spgpu/kernels/Makefile | 2 +- cuda/spgpu/kernels/dabgdxyz.cu | 79 ++++++++++++++++++++++++++++++++++ 5 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 cuda/spgpu/kernels/dabgdxyz.cu diff --git a/cuda/dvectordev.c b/cuda/dvectordev.c index 39aa5b2a..785753dd 100644 --- a/cuda/dvectordev.c +++ b/cuda/dvectordev.c @@ -241,6 +241,30 @@ int axpbyMultiVecDeviceDouble(int n,double alpha, void* devMultiVecX, return(i); } + +int abgdxyzMultiVecDeviceDouble(int n,double alpha,double beta, double gamma, double delta, + void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ) +{ int j=0, i=0; + int pitch = 0; + struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX; + struct MultiVectDevice *devVecY = (struct MultiVectDevice *) devMultiVecY; + struct MultiVectDevice *devVecZ = (struct MultiVectDevice *) devMultiVecZ; + spgpuHandle_t handle=psb_cudaGetHandle(); + pitch = devVecY->pitch_; + if ((n > devVecY->size_) || (n>devVecX->size_ )) + return SPGPU_UNSUPPORTED; + +#if 1 + spgpuDabgdxyz(handle,n, alpha,beta,gamma,delta, + (double*)devVecX->v_,(double*) devVecY->v_,(double*) devVecZ->v_); +#else + for(j=0;jcount_;j++) + spgpuDaxpby(handle,(double*)devVecY->v_+pitch*j, n, beta, + (double*)devVecY->v_+pitch*j, alpha,(double*) devVecX->v_+pitch*j); +#endif + return(i); +} + int axyMultiVecDeviceDouble(int n, double alpha, void *deviceVecA, void *deviceVecB) { int i = 0; struct MultiVectDevice *devVecA = (struct MultiVectDevice *) deviceVecA; diff --git a/cuda/psb_d_cuda_vect_mod.F90 b/cuda/psb_d_cuda_vect_mod.F90 index 36fac14e..8256eaa0 100644 --- a/cuda/psb_d_cuda_vect_mod.F90 +++ b/cuda/psb_d_cuda_vect_mod.F90 @@ -947,7 +947,8 @@ contains if ((nx + +extern "C" +{ +#include "core.h" +#include "vector.h" +} + + +#include "debug.h" + +#define BLOCK_SIZE 512 + +__global__ void spgpuDabgdxyz_krn(int n, double alpha, double beta, double gamma, double delta, + double* x, double *y, double *z) +{ + int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; + unsigned int gridSize = blockDim.x * gridDim.x; + double t; + for ( ; id < n; id +=gridSize) + //if (id,n) + { + + if (beta == 0.0) + t = PREC_DMUL(alpha,x[id]); + else + t = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id])); + if (delta == 0.0) + z[id] = gamma * t; + else + z[id] = PREC_DADD(PREC_DMUL(gamma, t), PREC_DMUL(delta,z[id])); + y[id] = t; + } +} + + +void spgpuDabgdxyz(spgpuHandle_t handle, + int n, + double alpha, + double beta, + double gamma, + double delta, + __device double* x, + __device double* y, + __device double *z) +{ + int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; + int num_mp, max_threads_mp, num_blocks_mp, num_blocks; + dim3 block(BLOCK_SIZE); + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + num_mp = deviceProp.multiProcessorCount; + max_threads_mp = deviceProp.maxThreadsPerMultiProcessor; + num_blocks_mp = max_threads_mp/BLOCK_SIZE; + num_blocks = num_blocks_mp*num_mp; + dim3 grid(num_blocks); + + spgpuDabgdxyz_krn<<currentStream>>>(n, alpha, beta, gamma, delta, + x, y, z); +} +