From a11f328e62785f3e1684fb667b4a512ce7f4e77e Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 5 Mar 2024 12:42:21 +0100 Subject: [PATCH] Added CUDA version of XYZW --- cuda/cvectordev.c | 22 +++++++++++++ cuda/cvectordev.h | 5 +++ cuda/dvectordev.c | 19 +++++++++++ cuda/dvectordev.h | 3 ++ cuda/psb_c_cuda_vect_mod.F90 | 64 ++++++++++++++++++++++++++++++++++-- cuda/psb_c_vectordev_mod.F90 | 12 +++++++ cuda/psb_d_cuda_vect_mod.F90 | 64 ++++++++++++++++++++++++++++++++++-- cuda/psb_d_vectordev_mod.F90 | 12 +++++++ cuda/psb_s_cuda_vect_mod.F90 | 64 ++++++++++++++++++++++++++++++++++-- cuda/psb_s_vectordev_mod.F90 | 12 +++++++ cuda/psb_z_cuda_vect_mod.F90 | 64 ++++++++++++++++++++++++++++++++++-- cuda/psb_z_vectordev_mod.F90 | 12 +++++++ cuda/spgpu/kernels/Makefile | 3 +- cuda/spgpu/vector.h | 46 ++++++++++++++++++++++++++ cuda/svectordev.c | 21 ++++++++++++ cuda/svectordev.h | 3 ++ cuda/zvectordev.c | 24 +++++++++++++- cuda/zvectordev.h | 5 +++ 18 files changed, 445 insertions(+), 10 deletions(-) diff --git a/cuda/cvectordev.c b/cuda/cvectordev.c index 9db5202e..cdfda481 100644 --- a/cuda/cvectordev.c +++ b/cuda/cvectordev.c @@ -273,6 +273,28 @@ int abgdxyzMultiVecDeviceFloatComplex(int n,cuFloatComplex alpha,cuFloatComplex return(i); } +int xyzwMultiVecDeviceFloatComplex(int n,cuFloatComplex a,cuFloatComplex b, + cuFloatComplex c, cuFloatComplex d, + cuFloatComplex e, cuFloatComplex f, + void* devMultiVecX, void* devMultiVecY, + void* devMultiVecZ, void* devMultiVecW) +{ int j=0, i=0; + int pitch = 0; + struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX; + struct MultiVectDevice *devVecY = (struct MultiVectDevice *) devMultiVecY; + struct MultiVectDevice *devVecZ = (struct MultiVectDevice *) devMultiVecZ; + struct MultiVectDevice *devVecW = (struct MultiVectDevice *) devMultiVecW; + spgpuHandle_t handle=psb_cudaGetHandle(); + pitch = devVecY->pitch_; + if ((n > devVecY->size_) || (n>devVecX->size_ )) + return SPGPU_UNSUPPORTED; + + spgpuCxyzw(handle,n, a,b,c,d,e,f, + (cuFloatComplex *)devVecX->v_,(cuFloatComplex *) devVecY->v_, + (cuFloatComplex *) devVecZ->v_,(cuFloatComplex *) devVecW->v_); + return(i); +} + int axyMultiVecDeviceFloatComplex(int n, cuFloatComplex alpha, void *deviceVecA, void *deviceVecB) { int i = 0; diff --git a/cuda/cvectordev.h b/cuda/cvectordev.h index fc18e328..62693e27 100644 --- a/cuda/cvectordev.h +++ b/cuda/cvectordev.h @@ -72,6 +72,11 @@ int axpbyMultiVecDeviceFloatComplex(int n, cuFloatComplex alpha, void* devVecX, int abgdxyzMultiVecDeviceFloatComplex(int n,cuFloatComplex alpha,cuFloatComplex beta, cuFloatComplex gamma, cuFloatComplex delta, void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ); +int xyzwMultiVecDeviceFloatComplex(int n,cuFloatComplex a,cuFloatComplex b, + cuFloatComplex c, cuFloatComplex d, + cuFloatComplex e, cuFloatComplex f, + void* devMultiVecX, void* devMultiVecY, + void* devMultiVecZ, void* devMultiVecW); int axyMultiVecDeviceFloatComplex(int n, cuFloatComplex alpha, void *deviceVecA, void *deviceVecB); int axybzMultiVecDeviceFloatComplex(int n, cuFloatComplex alpha, void *deviceVecA, void *deviceVecB, cuFloatComplex beta, void *deviceVecZ); diff --git a/cuda/dvectordev.c b/cuda/dvectordev.c index b4ca95f4..723f48d8 100644 --- a/cuda/dvectordev.c +++ b/cuda/dvectordev.c @@ -258,6 +258,25 @@ int abgdxyzMultiVecDeviceDouble(int n,double alpha,double beta, double gamma, do return(i); } +int xyzwMultiVecDeviceDouble(int n,double a, double b, double c, double d, double e, double f, + void* devMultiVecX, void* devMultiVecY, + void* devMultiVecZ, void* devMultiVecW) +{ int j=0, i=0; + int pitch = 0; + struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX; + struct MultiVectDevice *devVecY = (struct MultiVectDevice *) devMultiVecY; + struct MultiVectDevice *devVecZ = (struct MultiVectDevice *) devMultiVecZ; + struct MultiVectDevice *devVecW = (struct MultiVectDevice *) devMultiVecW; + spgpuHandle_t handle=psb_cudaGetHandle(); + pitch = devVecY->pitch_; + if ((n > devVecY->size_) || (n>devVecX->size_ )) + return SPGPU_UNSUPPORTED; + + spgpuDxyzw(handle,n, a,b,c,d,e,f, + (double*)devVecX->v_,(double*) devVecY->v_,(double*) devVecZ->v_,(double*) devVecW->v_); + return(i); +} + int axyMultiVecDeviceDouble(int n, double alpha, void *deviceVecA, void *deviceVecB) { int i = 0; struct MultiVectDevice *devVecA = (struct MultiVectDevice *) deviceVecA; diff --git a/cuda/dvectordev.h b/cuda/dvectordev.h index 81a2e8f6..c2bfa1b5 100644 --- a/cuda/dvectordev.h +++ b/cuda/dvectordev.h @@ -69,6 +69,9 @@ int dotMultiVecDeviceDouble(double* y_res, int n, void* devVecA, void* devVecB); int axpbyMultiVecDeviceDouble(int n, double alpha, void* devVecX, double beta, void* devVecY); int abgdxyzMultiVecDeviceDouble(int n,double alpha,double beta, double gamma, double delta, void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ); +int xyzwMultiVecDeviceDouble(int n,double a, double b, double c, double d, double e, double f, + void* devMultiVecX, void* devMultiVecY, + void* devMultiVecZ, void* devMultiVecW); int axyMultiVecDeviceDouble(int n, double alpha, void *deviceVecA, void *deviceVecB); int axybzMultiVecDeviceDouble(int n, double alpha, void *deviceVecA, void *deviceVecB, double beta, void *deviceVecZ); diff --git a/cuda/psb_c_cuda_vect_mod.F90 b/cuda/psb_c_cuda_vect_mod.F90 index 7eee128f..727249df 100644 --- a/cuda/psb_c_cuda_vect_mod.F90 +++ b/cuda/psb_c_cuda_vect_mod.F90 @@ -914,7 +914,6 @@ contains end subroutine c_cuda_axpby_v - subroutine c_cuda_abgdxyz(m,alpha, beta, gamma,delta,x, y, z, info) use psi_serial_mod implicit none @@ -975,9 +974,70 @@ contains call z%axpby(m,gamma,y,delta,info) end if - end subroutine c_cuda_abgdxyz + subroutine c_cuda_xyzw(m,a,b,c,d,e,f,x, y, z,w, info) + use psi_serial_mod + implicit none + integer(psb_ipk_), intent(in) :: m + class(psb_c_base_vect_type), intent(inout) :: x + class(psb_c_base_vect_type), intent(inout) :: y + class(psb_c_base_vect_type), intent(inout) :: z + class(psb_c_vect_cuda), intent(inout) :: w + complex(psb_spk_), intent (in) :: a,b,c,d,e,f + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: nx, ny, nz, nw + logical :: gpu_done + + info = psb_success_ + + gpu_done = .false. + if ((a==czero).or.(b==czero).or. & + & (c==czero).or.(d==czero).or.& + & (e==czero).or.(f==czero)) then + write(0,*) 'XYZW assumes a,b,c,d,e,f are all nonzero' + else + select type(xx => x) + class is (psb_c_vect_cuda) + select type(yy => y) + class is (psb_c_vect_cuda) + select type(zz => z) + class is (psb_c_vect_cuda) + ! Do something different here + if (xx%is_host()) call xx%sync() + if (yy%is_host()) call yy%sync() + if (zz%is_host()) call zz%sync() + if (w%is_host()) call w%sync() + nx = getMultiVecDeviceSize(xx%deviceVect) + ny = getMultiVecDeviceSize(yy%deviceVect) + nz = getMultiVecDeviceSize(zz%deviceVect) + nw = getMultiVecDeviceSize(w%deviceVect) + if ((nx x) + class is (psb_d_vect_cuda) + select type(yy => y) + class is (psb_d_vect_cuda) + select type(zz => z) + class is (psb_d_vect_cuda) + ! Do something different here + if (xx%is_host()) call xx%sync() + if (yy%is_host()) call yy%sync() + if (zz%is_host()) call zz%sync() + if (w%is_host()) call w%sync() + nx = getMultiVecDeviceSize(xx%deviceVect) + ny = getMultiVecDeviceSize(yy%deviceVect) + nz = getMultiVecDeviceSize(zz%deviceVect) + nw = getMultiVecDeviceSize(w%deviceVect) + if ((nx x) + class is (psb_s_vect_cuda) + select type(yy => y) + class is (psb_s_vect_cuda) + select type(zz => z) + class is (psb_s_vect_cuda) + ! Do something different here + if (xx%is_host()) call xx%sync() + if (yy%is_host()) call yy%sync() + if (zz%is_host()) call zz%sync() + if (w%is_host()) call w%sync() + nx = getMultiVecDeviceSize(xx%deviceVect) + ny = getMultiVecDeviceSize(yy%deviceVect) + nz = getMultiVecDeviceSize(zz%deviceVect) + nw = getMultiVecDeviceSize(w%deviceVect) + if ((nx x) + class is (psb_z_vect_cuda) + select type(yy => y) + class is (psb_z_vect_cuda) + select type(zz => z) + class is (psb_z_vect_cuda) + ! Do something different here + if (xx%is_host()) call xx%sync() + if (yy%is_host()) call yy%sync() + if (zz%is_host()) call zz%sync() + if (w%is_host()) call w%sync() + nx = getMultiVecDeviceSize(xx%deviceVect) + ny = getMultiVecDeviceSize(yy%deviceVect) + nz = getMultiVecDeviceSize(zz%deviceVect) + nw = getMultiVecDeviceSize(w%deviceVect) + if ((nxpitch_; + if ((n > devVecY->size_) || (n>devVecX->size_ )) + return SPGPU_UNSUPPORTED; + + spgpuSxyzw(handle,n, a,b,c,d,e,f, + (float*)devVecX->v_,(float*) devVecY->v_, + (float*) devVecZ->v_,(float*) devVecW->v_); + return(i); +} + int axyMultiVecDeviceFloat(int n, float alpha, void *deviceVecA, void *deviceVecB) { int i = 0; struct MultiVectDevice *devVecA = (struct MultiVectDevice *) deviceVecA; diff --git a/cuda/svectordev.h b/cuda/svectordev.h index 730f929a..363c0108 100644 --- a/cuda/svectordev.h +++ b/cuda/svectordev.h @@ -69,6 +69,9 @@ int dotMultiVecDeviceFloat(float* y_res, int n, void* devVecA, void* devVecB); int axpbyMultiVecDeviceFloat(int n, float alpha, void* devVecX, float beta, void* devVecY); int abgdxyzMultiVecDeviceFloat(int n,float alpha,float beta, float gamma, float delta, void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ); +int xyzwMultiVecDeviceFloat(int n,float a,float b, float c, float d, float e, float f, + void* devMultiVecX, void* devMultiVecY, + void* devMultiVecZ, void* devMultiVecW); int axyMultiVecDeviceFloat(int n, float alpha, void *deviceVecA, void *deviceVecB); int axybzMultiVecDeviceFloat(int n, float alpha, void *deviceVecA, void *deviceVecB, float beta, void *deviceVecZ); diff --git a/cuda/zvectordev.c b/cuda/zvectordev.c index d1f23f2a..e9f0cec7 100644 --- a/cuda/zvectordev.c +++ b/cuda/zvectordev.c @@ -251,7 +251,29 @@ int abgdxyzMultiVecDeviceDoubleComplex(int n,cuDoubleComplex alpha, (cuDoubleComplex *)devVecX->v_,(cuDoubleComplex *) devVecY->v_,(cuDoubleComplex *) devVecZ->v_); return(i); } - + +int xyzwMultiVecDeviceDoubleComplex(int n,cuDoubleComplex a, cuDoubleComplex b, + cuDoubleComplex c, cuDoubleComplex d, + cuDoubleComplex e, cuDoubleComplex f, + void* devMultiVecX, void* devMultiVecY, + void* devMultiVecZ, void* devMultiVecW) +{ int j=0, i=0; + int pitch = 0; + struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX; + struct MultiVectDevice *devVecY = (struct MultiVectDevice *) devMultiVecY; + struct MultiVectDevice *devVecZ = (struct MultiVectDevice *) devMultiVecZ; + struct MultiVectDevice *devVecW = (struct MultiVectDevice *) devMultiVecW; + spgpuHandle_t handle=psb_cudaGetHandle(); + pitch = devVecY->pitch_; + if ((n > devVecY->size_) || (n>devVecX->size_ )) + return SPGPU_UNSUPPORTED; + + spgpuZxyzw(handle,n, a,b,c,d,e,f, + (cuDoubleComplex *)devVecX->v_,(cuDoubleComplex *) devVecY->v_, + (cuDoubleComplex *) devVecZ->v_,(cuDoubleComplex *) devVecW->v_); + return(i); +} + int axpbyMultiVecDeviceDoubleComplex(int n,cuDoubleComplex alpha, void* devMultiVecX, cuDoubleComplex beta, void* devMultiVecY) { int j=0, i=0; diff --git a/cuda/zvectordev.h b/cuda/zvectordev.h index 4c32f11c..ae623bdb 100644 --- a/cuda/zvectordev.h +++ b/cuda/zvectordev.h @@ -80,6 +80,11 @@ int axpbyMultiVecDeviceDoubleComplex(int n, cuDoubleComplex alpha, void* devVecX int abgdxyzMultiVecDeviceDoubleComplex(int n,cuDoubleComplex alpha, cuDoubleComplex beta, cuDoubleComplex gamma, cuDoubleComplex delta, void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ); +int xyzwMultiVecDeviceDoubleComplex(int n,cuDoubleComplex a, cuDoubleComplex b, + cuDoubleComplex c, cuDoubleComplex d, + cuDoubleComplex e, cuDoubleComplex f, + void* devMultiVecX, void* devMultiVecY, + void* devMultiVecZ, void* devMultiVecW); int axyMultiVecDeviceDoubleComplex(int n, cuDoubleComplex alpha, void *deviceVecA, void *deviceVecB); int axybzMultiVecDeviceDoubleComplex(int n, cuDoubleComplex alpha, void *deviceVecA,