From 9daa04c3dcb0594cde09156e12619481401769e8 Mon Sep 17 00:00:00 2001 From: gabrielequatrana Date: Thu, 18 Apr 2024 22:35:27 +0200 Subject: [PATCH] Updated HLG SpMM (s,d,c,z) --- base/modules/serial/psb_c_base_mat_mod.F90 | 1 + base/modules/serial/psb_s_base_mat_mod.F90 | 1 + base/modules/serial/psb_z_base_mat_mod.F90 | 1 + cuda/hlldev.c | 164 +++++--- cuda/hlldev_mod.F90 | 36 ++ cuda/impl/psb_c_cuda_hlg_csmm.F90 | 8 +- cuda/impl/psb_c_cuda_hlg_vect_mv.F90 | 89 +++++ cuda/impl/psb_d_cuda_hlg_csmm.F90 | 2 +- cuda/impl/psb_d_cuda_hlg_vect_mv.F90 | 2 +- cuda/impl/psb_s_cuda_hlg_csmm.F90 | 8 +- cuda/impl/psb_s_cuda_hlg_vect_mv.F90 | 89 +++++ cuda/impl/psb_z_cuda_hlg_csmm.F90 | 8 +- cuda/impl/psb_z_cuda_hlg_vect_mv.F90 | 89 +++++ cuda/psb_c_cuda_hlg_mat_mod.F90 | 10 + cuda/psb_s_cuda_hlg_mat_mod.F90 | 10 + cuda/psb_z_cuda_hlg_mat_mod.F90 | 10 + cuda/spgpu/hell.h | 181 ++++++++- cuda/spgpu/kernels/Makefile | 3 +- cuda/spgpu/kernels/hell_cspmm.cu | 35 ++ cuda/spgpu/kernels/hell_dspmm.cu | 35 ++ cuda/spgpu/kernels/hell_dspmv.cu | 371 ------------------ cuda/spgpu/kernels/hell_spmm_base.cuh | 275 +++++++++++++ .../spgpu/kernels/hell_spmm_base_template.cuh | 197 ++++++++++ cuda/spgpu/kernels/hell_sspmm.cu | 34 ++ cuda/spgpu/kernels/hell_zspmm.cu | 35 ++ 25 files changed, 1237 insertions(+), 457 deletions(-) create mode 100644 cuda/spgpu/kernels/hell_cspmm.cu create mode 100644 cuda/spgpu/kernels/hell_dspmm.cu create mode 100644 cuda/spgpu/kernels/hell_spmm_base.cuh create mode 100644 cuda/spgpu/kernels/hell_spmm_base_template.cuh create mode 100644 cuda/spgpu/kernels/hell_sspmm.cu create mode 100644 cuda/spgpu/kernels/hell_zspmm.cu diff --git a/base/modules/serial/psb_c_base_mat_mod.F90 b/base/modules/serial/psb_c_base_mat_mod.F90 index 33982e3a..52fd8111 100644 --- a/base/modules/serial/psb_c_base_mat_mod.F90 +++ b/base/modules/serial/psb_c_base_mat_mod.F90 @@ -35,6 +35,7 @@ module psb_c_base_mat_mod use psb_base_mat_mod use psb_c_base_vect_mod + use psb_c_base_multivect_mod !> \namespace psb_base_mod \class psb_c_base_sparse_mat diff --git a/base/modules/serial/psb_s_base_mat_mod.F90 b/base/modules/serial/psb_s_base_mat_mod.F90 index 92bda7d8..29b1c2dc 100644 --- a/base/modules/serial/psb_s_base_mat_mod.F90 +++ b/base/modules/serial/psb_s_base_mat_mod.F90 @@ -35,6 +35,7 @@ module psb_s_base_mat_mod use psb_base_mat_mod use psb_s_base_vect_mod + use psb_s_base_multivect_mod !> \namespace psb_base_mod \class psb_s_base_sparse_mat diff --git a/base/modules/serial/psb_z_base_mat_mod.F90 b/base/modules/serial/psb_z_base_mat_mod.F90 index 3e8196f4..0cee142e 100644 --- a/base/modules/serial/psb_z_base_mat_mod.F90 +++ b/base/modules/serial/psb_z_base_mat_mod.F90 @@ -35,6 +35,7 @@ module psb_z_base_mat_mod use psb_base_mat_mod use psb_z_base_vect_mod + use psb_z_base_multivect_mod !> \namespace psb_base_mod \class psb_z_base_sparse_mat diff --git a/cuda/hlldev.c b/cuda/hlldev.c index eb78c16c..a5862a29 100644 --- a/cuda/hlldev.c +++ b/cuda/hlldev.c @@ -156,7 +156,9 @@ int FallocHllDevice(void** deviceMat,unsigned int hksize, unsigned int rows, un return(i); } - +// +// Single Precision Float +// int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX, float beta, void* deviceY) { @@ -170,108 +172,170 @@ int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX, /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/ /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/ #endif - /*dspmdmm_gpu ((double *)z->v_, y->count_, y->pitch_, (double *)y->v_, alpha, (double *)devMat->cM, - devMat->rP, devMat->rS, devMat->rows, devMat->pitch, (double *)x->v_, beta, - devMat->baseIndex);*/ - - spgpuShellspmv (handle, (float *)y->v_, (float *)y->v_, alpha, (float *)devMat->cM, - devMat->rP,devMat->hackSize,devMat->hackOffs, devMat->rS, NULL, + spgpuShellspmv (handle, (float *)y->v_, (float *)y->v_, alpha, + (float *)devMat->cM, devMat->rP, + devMat->hackSize, devMat->hackOffs, devMat->rS, NULL, devMat->avgNzr, devMat->rows, (float *)x->v_, beta, devMat->baseIndex); return SPGPU_SUCCESS; } -void -dspmdmmhll_gpu (double *z, int count, int zPitch, double alpha, double* cM, int* rP, - int* rS, int hackSize, int* hackOffs, int avgNnzPerRow, int rows, - double *x, int xPitch, double beta, int firstIndex) +int spmmHllDeviceFloat(void *deviceMat, float alpha, void* deviceX, + float beta, void* deviceY) { - int i=0; + HllDevice *devMat = (HllDevice *) deviceMat; + struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX; + struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; spgpuHandle_t handle=psb_cudaGetHandle(); -#if defined(NEW_MM) - spgpuDhellspmm(handle, count, (double*) z, zPitch, (double*)z, zPitch, - alpha, (double*) cM, rP,hackSize, hackOffs, rS, NULL, - rows, (double*)x, xPitch, beta, firstIndex); -#else - - for (i=0; icount_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/ + /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/ + /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/ #endif + spgpuShellspmm(handle, y->count_, (float *)y->v_, y->pitch_, + (float*)y->v_, y->pitch_, alpha, (float*)devMat->cM, + devMat->rP, devMat->hackSize, devMat->hackOffs, + devMat->rS, NULL, devMat->rows, (float*)x->v_, + x->pitch_, beta, devMat->baseIndex); + + return SPGPU_SUCCESS; } -//new +// +// Double Precision +// int spmvHllDeviceDouble(void *deviceMat, double alpha, void* deviceX, double beta, void* deviceY) { HllDevice *devMat = (HllDevice *) deviceMat; struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX; struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; + spgpuHandle_t handle=psb_cudaGetHandle(); #ifdef VERBOSE /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/ /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/ /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/ #endif - dspmdmmhll_gpu ((double *)y->v_, y->count_, y->pitch_, - alpha, (double *)devMat->cM, - devMat->rP, devMat->rS, devMat->hackSize, devMat->hackOffs, - devMat->avgNzr, devMat->rows, - (double *)x->v_, x->pitch_, beta, devMat->baseIndex); + spgpuDhellspmv (handle, (double *)y->v_, (double *)y->v_, alpha, + (double *)devMat->cM, devMat->rP, + devMat->hackSize, devMat->hackOffs, devMat->rS, NULL, + devMat->avgNzr, devMat->rows, (double *)x->v_, beta, devMat->baseIndex); return SPGPU_SUCCESS; } -int spmvHllDeviceFloatComplex(void *deviceMat, float complex alpha, void* deviceX, - float complex beta, void* deviceY) +int spmmHllDeviceDouble(void *deviceMat, double alpha, void* deviceX, + double beta, void* deviceY) { HllDevice *devMat = (HllDevice *) deviceMat; struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX; struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; spgpuHandle_t handle=psb_cudaGetHandle(); - cuFloatComplex a = make_cuFloatComplex(crealf(alpha),cimagf(alpha)); - cuFloatComplex b = make_cuFloatComplex(crealf(beta),cimagf(beta)); #ifdef VERBOSE /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/ /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/ /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/ #endif - /*dspmdmm_gpu ((double *)z->v_, y->count_, y->pitch_, (double *)y->v_, alpha, (double *)devMat->cM, - devMat->rP, devMat->rS, devMat->rows, devMat->pitch, (double *)x->v_, beta, - devMat->baseIndex);*/ + spgpuDhellspmm(handle, y->count_, (double *)y->v_, y->pitch_, + (double *)y->v_, y->pitch_, alpha, (double *)devMat->cM, + devMat->rP, devMat->hackSize, devMat->hackOffs, + devMat->rS, NULL, devMat->rows, (double *)x->v_, + x->pitch_, beta, devMat->baseIndex); - spgpuChellspmv (handle, (cuFloatComplex *)y->v_, (cuFloatComplex *)y->v_, a, (cuFloatComplex *)devMat->cM, - devMat->rP,devMat->hackSize,devMat->hackOffs, devMat->rS, NULL, - devMat->avgNzr, devMat->rows, (cuFloatComplex *)x->v_, b, devMat->baseIndex); - return SPGPU_SUCCESS; } -int spmvHllDeviceDoubleComplex(void *deviceMat, double complex alpha, void* deviceX, - double complex beta, void* deviceY) +// +// Single Precision Float Complex +// +int spmvHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX, + cuFloatComplex beta, void* deviceY) { HllDevice *devMat = (HllDevice *) deviceMat; struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX; struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; spgpuHandle_t handle=psb_cudaGetHandle(); - cuDoubleComplex a = make_cuDoubleComplex(creal(alpha),cimag(alpha)); - cuDoubleComplex b = make_cuDoubleComplex(creal(beta),cimag(beta)); #ifdef VERBOSE /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/ /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/ /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/ #endif + spgpuShellspmv (handle, (cuFloatComplex *)y->v_, (cuFloatComplex *)y->v_, alpha, + (cuFloatComplex *)devMat->cM, devMat->rP, + devMat->hackSize, devMat->hackOffs, devMat->rS, NULL, + devMat->avgNzr, devMat->rows, (cuFloatComplex *)x->v_, beta, devMat->baseIndex); + + return SPGPU_SUCCESS; +} + +int spmmHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX, + cuFloatComplex beta, void* deviceY) +{ + HllDevice *devMat = (HllDevice *) deviceMat; + struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX; + struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; + spgpuHandle_t handle=psb_cudaGetHandle(); - spgpuZhellspmv (handle, (cuDoubleComplex *)y->v_, (cuDoubleComplex *)y->v_, a, (cuDoubleComplex *)devMat->cM, - devMat->rP,devMat->hackSize,devMat->hackOffs, devMat->rS, NULL, - devMat->avgNzr,devMat->rows, (cuDoubleComplex *)x->v_, b, devMat->baseIndex); +#ifdef VERBOSE + /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/ + /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/ + /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/ +#endif + spgpuShellspmm(handle, y->count_, (cuFloatComplex *)y->v_, y->pitch_, + (cuFloatComplex*)y->v_, y->pitch_, alpha, (cuFloatComplex*)devMat->cM, + devMat->rP, devMat->hackSize, devMat->hackOffs, + devMat->rS, NULL, devMat->rows, (cuFloatComplex*)x->v_, + x->pitch_, beta, devMat->baseIndex); + + return SPGPU_SUCCESS; +} + +// +// Double Precision Complex +// +int spmvHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX, + cuDoubleComplex beta, void* deviceY) +{ + HllDevice *devMat = (HllDevice *) deviceMat; + struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX; + struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; + spgpuHandle_t handle=psb_cudaGetHandle(); + +#ifdef VERBOSE + /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/ + /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/ + /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/ +#endif + spgpuShellspmv (handle, (cuDoubleComplex *)y->v_, (cuDoubleComplex *)y->v_, alpha, + (cuDoubleComplex *)devMat->cM, devMat->rP, + devMat->hackSize, devMat->hackOffs, devMat->rS, NULL, + devMat->avgNzr, devMat->rows, (cuDoubleComplex *)x->v_, beta, devMat->baseIndex); + + return SPGPU_SUCCESS; +} + +int spmmHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX, + cuDoubleComplex beta, void* deviceY) +{ + HllDevice *devMat = (HllDevice *) deviceMat; + struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX; + struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; + spgpuHandle_t handle=psb_cudaGetHandle(); + +#ifdef VERBOSE + /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/ + /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/ + /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/ +#endif + spgpuShellspmm(handle, y->count_, (cuDoubleComplex *)y->v_, y->pitch_, + (cuDoubleComplex*)y->v_, y->pitch_, alpha, (cuDoubleComplex*)devMat->cM, + devMat->rP, devMat->hackSize, devMat->hackOffs, + devMat->rS, NULL, devMat->rows, (cuDoubleComplex*)x->v_, + x->pitch_, beta, devMat->baseIndex); return SPGPU_SUCCESS; } diff --git a/cuda/hlldev_mod.F90 b/cuda/hlldev_mod.F90 index 90b8e13c..edac8956 100644 --- a/cuda/hlldev_mod.F90 +++ b/cuda/hlldev_mod.F90 @@ -265,4 +265,40 @@ module hlldev_mod end interface +interface spmmHllDevice + + function spmmHllDeviceFloat(deviceMat,alpha,x,beta,y) & + & result(res) bind(c,name='spmmHllDeviceFloat') + use iso_c_binding + integer(c_int) :: res + type(c_ptr), value :: deviceMat, x, y + real(c_float),value :: alpha, beta + end function spmmHllDeviceFloat + + function spmmHllDeviceDouble(deviceMat,alpha,x,beta,y) & + & result(res) bind(c,name='spmmHllDeviceDouble') + use iso_c_binding + integer(c_int) :: res + type(c_ptr), value :: deviceMat, x, y + real(c_double),value :: alpha, beta + end function spmmHllDeviceDouble + + function spmmHllDeviceFloatComplex(deviceMat,alpha,x,beta,y) & + & result(res) bind(c,name='spmmHllDeviceFloatComplex') + use iso_c_binding + integer(c_int) :: res + type(c_ptr), value :: deviceMat, x, y + complex(c_float_complex),value :: alpha, beta + end function spmmHllDeviceFloatComplex + + function spmmHllDeviceDoubleComplex(deviceMat,alpha,x,beta,y) & + & result(res) bind(c,name='spmmHllDeviceDoubleComplex') + use iso_c_binding + integer(c_int) :: res + type(c_ptr), value :: deviceMat, x, y + complex(c_double_complex),value :: alpha, beta + end function spmmHllDeviceDoubleComplex + +end interface + end module hlldev_mod diff --git a/cuda/impl/psb_c_cuda_hlg_csmm.F90 b/cuda/impl/psb_c_cuda_hlg_csmm.F90 index 88aa53a8..f3d3dd29 100644 --- a/cuda/impl/psb_c_cuda_hlg_csmm.F90 +++ b/cuda/impl/psb_c_cuda_hlg_csmm.F90 @@ -98,16 +98,16 @@ subroutine psb_c_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans) if (info == 0) & & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_float) if (info == 0) & - & info = writeMultiVecDevice(gpX,x,nxy) + & info = writeMultiVecDevice(gpX,x,size(x,1)) if (info == 0) & & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_float) if (info == 0) & - & info = writeMultiVecDevice(gpY,y,nxy) + & info = writeMultiVecDevice(gpY,y,size(y,1)) if (info == 0) & - & info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY) + & info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY) if (info == 0) & - & info = readMultiVecDevice(gpY,y,nxy) + & info = readMultiVecDevice(gpY,y,size(y,1)) if (info /= 0) goto 9999 call freeMultiVecDevice(gpX) call freeMultiVecDevice(gpY) diff --git a/cuda/impl/psb_c_cuda_hlg_vect_mv.F90 b/cuda/impl/psb_c_cuda_hlg_vect_mv.F90 index 3789ef17..6ddbe0fd 100644 --- a/cuda/impl/psb_c_cuda_hlg_vect_mv.F90 +++ b/cuda/impl/psb_c_cuda_hlg_vect_mv.F90 @@ -117,3 +117,92 @@ subroutine psb_c_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans) return end subroutine psb_c_cuda_hlg_vect_mv + +subroutine psb_c_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) + + use psb_base_mod + use hlldev_mod + use psb_vectordev_mod + use psb_c_cuda_hlg_mat_mod, psb_protect_name => psb_c_cuda_hlg_multivect_mv + use psb_c_cuda_vect_mod + implicit none + class(psb_c_cuda_hlg_sparse_mat), intent(in) :: a + complex(psb_spk_), intent(in) :: alpha, beta + class(psb_c_base_vect_type), intent(inout) :: x + class(psb_c_base_vect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + complex(psb_spk_), allocatable :: rx(:), ry(:) + logical :: tra + character :: trans_ + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='c_cuda_hlg_multivect_mv' + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + if (tra) then + if (.not.x%is_host()) call x%sync() + if (beta /= czero) then + if (.not.y%is_host()) call y%sync() + end if + if (a%is_dev()) call a%sync() + call a%psb_c_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans) + call y%set_host() + else + if (a%is_host()) call a%sync() + select type (xx => x) + type is (psb_c_vect_cuda) + select type(yy => y) + type is (psb_c_vect_cuda) + if (xx%is_host()) call xx%sync() + if (beta /= dzero) then + if (yy%is_host()) call yy%sync() + end if + info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,& + & beta,yy%deviceVect) + if (info /= 0) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/)) + info = psb_err_from_subroutine_ai_ + goto 9999 + end if + call yy%set_dev() + class default + rx = xx%get_vect() + ry = y%get_vect() + if (a%is_dev()) call a%sync() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + class default + rx = x%get_vect() + ry = y%get_vect() + if (a%is_dev()) call a%sync() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + + end if + if (info /= 0) goto 9999 + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_c_cuda_hlg_multivect_mv \ No newline at end of file diff --git a/cuda/impl/psb_d_cuda_hlg_csmm.F90 b/cuda/impl/psb_d_cuda_hlg_csmm.F90 index 78a1820a..862cfe41 100644 --- a/cuda/impl/psb_d_cuda_hlg_csmm.F90 +++ b/cuda/impl/psb_d_cuda_hlg_csmm.F90 @@ -105,7 +105,7 @@ subroutine psb_d_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans) & info = writeMultiVecDevice(gpY,y,size(y,1)) if (info == 0) & - & info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY) + & info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY) if (info == 0) & & info = readMultiVecDevice(gpY,y,size(y,1)) if (info /= 0) goto 9999 diff --git a/cuda/impl/psb_d_cuda_hlg_vect_mv.F90 b/cuda/impl/psb_d_cuda_hlg_vect_mv.F90 index 6e32fc71..2c87f8f7 100644 --- a/cuda/impl/psb_d_cuda_hlg_vect_mv.F90 +++ b/cuda/impl/psb_d_cuda_hlg_vect_mv.F90 @@ -172,7 +172,7 @@ subroutine psb_d_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) if (beta /= dzero) then if (yy%is_host()) call yy%sync() end if - info = spmvhllDevice(a%deviceMat,alpha,xx%deviceVect,& + info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,& & beta,yy%deviceVect) if (info /= 0) then call psb_errpush(psb_err_from_subroutine_ai_,name,& diff --git a/cuda/impl/psb_s_cuda_hlg_csmm.F90 b/cuda/impl/psb_s_cuda_hlg_csmm.F90 index 0dc28c7f..60443a8b 100644 --- a/cuda/impl/psb_s_cuda_hlg_csmm.F90 +++ b/cuda/impl/psb_s_cuda_hlg_csmm.F90 @@ -98,16 +98,16 @@ subroutine psb_s_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans) if (info == 0) & & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_float) if (info == 0) & - & info = writeMultiVecDevice(gpX,x,nxy) + & info = writeMultiVecDevice(gpX,x,size(x,1)) if (info == 0) & & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_float) if (info == 0) & - & info = writeMultiVecDevice(gpY,y,nxy) + & info = writeMultiVecDevice(gpY,y,size(y,1)) if (info == 0) & - & info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY) + & info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY) if (info == 0) & - & info = readMultiVecDevice(gpY,y,nxy) + & info = readMultiVecDevice(gpY,y,size(y,1)) if (info /= 0) goto 9999 call freeMultiVecDevice(gpX) call freeMultiVecDevice(gpY) diff --git a/cuda/impl/psb_s_cuda_hlg_vect_mv.F90 b/cuda/impl/psb_s_cuda_hlg_vect_mv.F90 index 94696949..3c4974c2 100644 --- a/cuda/impl/psb_s_cuda_hlg_vect_mv.F90 +++ b/cuda/impl/psb_s_cuda_hlg_vect_mv.F90 @@ -117,3 +117,92 @@ subroutine psb_s_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans) return end subroutine psb_s_cuda_hlg_vect_mv + +subroutine psb_s_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) + + use psb_base_mod + use hlldev_mod + use psb_vectordev_mod + use psb_s_cuda_hlg_mat_mod, psb_protect_name => psb_s_cuda_hlg_multivect_mv + use psb_s_cuda_vect_mod + implicit none + class(psb_s_cuda_hlg_sparse_mat), intent(in) :: a + real(psb_spk_), intent(in) :: alpha, beta + class(psb_s_base_vect_type), intent(inout) :: x + class(psb_s_base_vect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + real(psb_spk_), allocatable :: rx(:), ry(:) + logical :: tra + character :: trans_ + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='s_cuda_hlg_multivect_mv' + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + if (tra) then + if (.not.x%is_host()) call x%sync() + if (beta /= szero) then + if (.not.y%is_host()) call y%sync() + end if + if (a%is_dev()) call a%sync() + call a%psb_s_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans) + call y%set_host() + else + if (a%is_host()) call a%sync() + select type (xx => x) + type is (psb_s_vect_cuda) + select type(yy => y) + type is (psb_s_vect_cuda) + if (xx%is_host()) call xx%sync() + if (beta /= dzero) then + if (yy%is_host()) call yy%sync() + end if + info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,& + & beta,yy%deviceVect) + if (info /= 0) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/)) + info = psb_err_from_subroutine_ai_ + goto 9999 + end if + call yy%set_dev() + class default + rx = xx%get_vect() + ry = y%get_vect() + if (a%is_dev()) call a%sync() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + class default + rx = x%get_vect() + ry = y%get_vect() + if (a%is_dev()) call a%sync() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + + end if + if (info /= 0) goto 9999 + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_s_cuda_hlg_multivect_mv diff --git a/cuda/impl/psb_z_cuda_hlg_csmm.F90 b/cuda/impl/psb_z_cuda_hlg_csmm.F90 index 8eb30ef9..b8ec358d 100644 --- a/cuda/impl/psb_z_cuda_hlg_csmm.F90 +++ b/cuda/impl/psb_z_cuda_hlg_csmm.F90 @@ -98,16 +98,16 @@ subroutine psb_z_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans) if (info == 0) & & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_double) if (info == 0) & - & info = writeMultiVecDevice(gpX,x,nxy) + & info = writeMultiVecDevice(gpX,x,size(x,1)) if (info == 0) & & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_double) if (info == 0) & - & info = writeMultiVecDevice(gpY,y,nxy) + & info = writeMultiVecDevice(gpY,y,size(y,1)) if (info == 0) & - & info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY) + & info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY) if (info == 0) & - & info = readMultiVecDevice(gpY,y,nxy) + & info = readMultiVecDevice(gpY,y,size(y,1)) if (info /= 0) goto 9999 call freeMultiVecDevice(gpX) call freeMultiVecDevice(gpY) diff --git a/cuda/impl/psb_z_cuda_hlg_vect_mv.F90 b/cuda/impl/psb_z_cuda_hlg_vect_mv.F90 index e2e93b85..d09326bd 100644 --- a/cuda/impl/psb_z_cuda_hlg_vect_mv.F90 +++ b/cuda/impl/psb_z_cuda_hlg_vect_mv.F90 @@ -117,3 +117,92 @@ subroutine psb_z_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans) return end subroutine psb_z_cuda_hlg_vect_mv + +subroutine psb_z_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) + + use psb_base_mod + use hlldev_mod + use psb_vectordev_mod + use psb_z_cuda_hlg_mat_mod, psb_protect_name => psb_z_cuda_hlg_multivect_mv + use psb_z_cuda_vect_mod + implicit none + class(psb_z_cuda_hlg_sparse_mat), intent(in) :: a + complex(psb_dpk_), intent(in) :: alpha, beta + class(psb_z_base_vect_type), intent(inout) :: x + class(psb_z_base_vect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + complex(psb_dpk_), allocatable :: rx(:), ry(:) + logical :: tra + character :: trans_ + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='z_cuda_hlg_multivect_mv' + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + if (tra) then + if (.not.x%is_host()) call x%sync() + if (beta /= zzero) then + if (.not.y%is_host()) call y%sync() + end if + if (a%is_dev()) call a%sync() + call a%psb_z_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans) + call y%set_host() + else + if (a%is_host()) call a%sync() + select type (xx => x) + type is (psb_z_vect_cuda) + select type(yy => y) + type is (psb_z_vect_cuda) + if (xx%is_host()) call xx%sync() + if (beta /= dzero) then + if (yy%is_host()) call yy%sync() + end if + info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,& + & beta,yy%deviceVect) + if (info /= 0) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/)) + info = psb_err_from_subroutine_ai_ + goto 9999 + end if + call yy%set_dev() + class default + rx = xx%get_vect() + ry = y%get_vect() + if (a%is_dev()) call a%sync() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + class default + rx = x%get_vect() + ry = y%get_vect() + if (a%is_dev()) call a%sync() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + + end if + if (info /= 0) goto 9999 + call psb_erractionrestore(err_act) + return + + 9999 call psb_error_handler(err_act) + + return + + end subroutine psb_z_cuda_hlg_multivect_mv diff --git a/cuda/psb_c_cuda_hlg_mat_mod.F90 b/cuda/psb_c_cuda_hlg_mat_mod.F90 index e98f2474..4ef5b019 100644 --- a/cuda/psb_c_cuda_hlg_mat_mod.F90 +++ b/cuda/psb_c_cuda_hlg_mat_mod.F90 @@ -55,6 +55,7 @@ module psb_c_cuda_hlg_mat_mod contains procedure, nopass :: get_fmt => c_cuda_hlg_get_fmt procedure, pass(a) :: sizeof => c_cuda_hlg_sizeof + procedure, pass(a) :: multivect_mv => psb_c_cuda_hlg_multivect_mv procedure, pass(a) :: vect_mv => psb_c_cuda_hlg_vect_mv procedure, pass(a) :: csmm => psb_c_cuda_hlg_csmm procedure, pass(a) :: csmv => psb_c_cuda_hlg_csmv @@ -97,6 +98,15 @@ module psb_c_cuda_hlg_mat_mod integer(psb_ipk_), intent(out) :: info character, optional, intent(in) :: trans end subroutine psb_c_cuda_hlg_vect_mv + subroutine psb_c_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) + import :: psb_c_cuda_hlg_sparse_mat, psb_spk_, psb_c_base_multivect_type, psb_ipk_ + class(psb_c_cuda_hlg_sparse_mat), intent(in) :: a + complex(psb_spk_), intent(in) :: alpha, beta + class(psb_c_base_multivect_type), intent(inout) :: x + class(psb_c_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_c_cuda_hlg_multivect_mv end interface interface diff --git a/cuda/psb_s_cuda_hlg_mat_mod.F90 b/cuda/psb_s_cuda_hlg_mat_mod.F90 index 4f7b4b6f..a7099ea4 100644 --- a/cuda/psb_s_cuda_hlg_mat_mod.F90 +++ b/cuda/psb_s_cuda_hlg_mat_mod.F90 @@ -56,6 +56,7 @@ module psb_s_cuda_hlg_mat_mod procedure, nopass :: get_fmt => s_cuda_hlg_get_fmt procedure, pass(a) :: sizeof => s_cuda_hlg_sizeof procedure, pass(a) :: vect_mv => psb_s_cuda_hlg_vect_mv + procedure, pass(a) :: multivect_mv => psb_s_cuda_hlg_multivect_mv procedure, pass(a) :: csmm => psb_s_cuda_hlg_csmm procedure, pass(a) :: csmv => psb_s_cuda_hlg_csmv procedure, pass(a) :: in_vect_sv => psb_s_cuda_hlg_inner_vect_sv @@ -97,6 +98,15 @@ module psb_s_cuda_hlg_mat_mod integer(psb_ipk_), intent(out) :: info character, optional, intent(in) :: trans end subroutine psb_s_cuda_hlg_vect_mv + subroutine psb_s_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) + import :: psb_s_cuda_hlg_sparse_mat, psb_spk_, psb_s_base_multivect_type, psb_ipk_ + class(psb_s_cuda_hlg_sparse_mat), intent(in) :: a + real(psb_spk_), intent(in) :: alpha, beta + class(psb_s_base_multivect_type), intent(inout) :: x + class(psb_s_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_s_cuda_hlg_multivect_mv end interface interface diff --git a/cuda/psb_z_cuda_hlg_mat_mod.F90 b/cuda/psb_z_cuda_hlg_mat_mod.F90 index 2acf43f1..61f30374 100644 --- a/cuda/psb_z_cuda_hlg_mat_mod.F90 +++ b/cuda/psb_z_cuda_hlg_mat_mod.F90 @@ -56,6 +56,7 @@ module psb_z_cuda_hlg_mat_mod procedure, nopass :: get_fmt => z_cuda_hlg_get_fmt procedure, pass(a) :: sizeof => z_cuda_hlg_sizeof procedure, pass(a) :: vect_mv => psb_z_cuda_hlg_vect_mv + procedure, pass(a) :: multivect_mv => psb_z_cuda_hlg_multivect_mv procedure, pass(a) :: csmm => psb_z_cuda_hlg_csmm procedure, pass(a) :: csmv => psb_z_cuda_hlg_csmv procedure, pass(a) :: in_vect_sv => psb_z_cuda_hlg_inner_vect_sv @@ -97,6 +98,15 @@ module psb_z_cuda_hlg_mat_mod integer(psb_ipk_), intent(out) :: info character, optional, intent(in) :: trans end subroutine psb_z_cuda_hlg_vect_mv + subroutine psb_z_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) + import :: psb_z_cuda_hlg_sparse_mat, psb_dpk_, psb_z_base_multivect_type, psb_ipk_ + class(psb_z_cuda_hlg_sparse_mat), intent(in) :: a + complex(psb_dpk_), intent(in) :: alpha, beta + class(psb_z_base_multivect_type), intent(inout) :: x + class(psb_z_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_z_cuda_hlg_multivect_mv end interface interface diff --git a/cuda/spgpu/hell.h b/cuda/spgpu/hell.h index e62f9ae0..2c339b80 100644 --- a/cuda/spgpu/hell.h +++ b/cuda/spgpu/hell.h @@ -68,7 +68,46 @@ void spgpuShellspmv (spgpuHandle_t handle, float beta, int baseIndex); - +/** +* \fn void spgpuShellspmm (spgpuHandle_t handle,int count,__device float *z,int zpitch,const __device float *y,int ypitch,float alpha, const __device float* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device float *x,int xpitch,float beta,int baseIndex) + * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU. + * \param handle The spgpu handle used to call this routine + * \param count The cols count + * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y). + * \param zpitch The pitch of the output vector + * \param y The y input vector + * \param ypitch The pitch of the y input vector + * \param alpha The alpha scalar + * \param cM The HELL non zero values allocation pointer + * \param rP The HELL column indices allocation pointer + * \param hackSize The constant size of every hack (must be a multiple of 32). + * \param hackOffsets the array of base index offset for every hack of HELL non zero values allocation and HELL indices allocation. + * \param rS the array containing the row sized (in non zero elements) + * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format). + * \param rows the rows count + * \param x the x vector + * \param xpitch The pitch of the x input vector + * \param beta the beta scalar + * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran). + */ +void spgpuShellspmm(spgpuHandle_t handle, + int count, + __device float *z, + int zpitch, + const __device float *y, + int ypitch, + float alpha, + const __device float* cM, + const __device int* rP, + int hackSize, + const __device int* hackOffsets, + const __device int* rS, + const __device int* rIdx, + int rows, + const __device float *x, + int xpitch, + float beta, + int baseIndex); /** * \fn void spgpuDhellspmv (spgpuHandle_t handle,__device double *z,const __device double *y, double alpha, const __device double* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int avgNnzPerRow, int rows, const __device double *x, double beta,int baseIndex) @@ -104,26 +143,47 @@ void spgpuDhellspmv (spgpuHandle_t handle, const __device double *x, double beta, int baseIndex); -#if defined(NEW_MM) + +/** +* \fn void spgpuShellspmm (spgpuHandle_t handle,int count,__device double *z,int zpitch,const __device double *y,int ypitch,double alpha, const __device double* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device double *x,int xpitch,double beta,int baseIndex) + * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU. + * \param handle The spgpu handle used to call this routine + * \param count The cols count + * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y). + * \param zpitch The pitch of the output vector + * \param y The y input vector + * \param ypitch The pitch of the y input vector + * \param alpha The alpha scalar + * \param cM The HELL non zero values allocation pointer + * \param rP The HELL column indices allocation pointer + * \param hackSize The constant size of every hack (must be a multiple of 32). + * \param hackOffsets the array of base index offset for every hack of HELL non zero values allocation and HELL indices allocation. + * \param rS the array containing the row sized (in non zero elements) + * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format). + * \param rows the rows count + * \param x the x vector + * \param xpitch The pitch of the x input vector + * \param beta the beta scalar + * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran). + */ void spgpuDhellspmm(spgpuHandle_t handle, - int count, - __device double *z, - int zpitch, - const __device double *y, - int ypitch, - double alpha, - const __device double* cM, - const __device int* rP, - int hackSize, - const __device int* hackOffsets, - const __device int* rS, - const __device int* rIdx, - int rows, - const __device double *x, - int xpitch, - double beta, - int baseIndex); -#endif + int count, + __device double *z, + int zpitch, + const __device double *y, + int ypitch, + double alpha, + const __device double* cM, + const __device int* rP, + int hackSize, + const __device int* hackOffsets, + const __device int* rS, + const __device int* rIdx, + int rows, + const __device double *x, + int xpitch, + double beta, + int baseIndex); /** * \fn void spgpuChellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex) @@ -160,7 +220,46 @@ void spgpuChellspmv (spgpuHandle_t handle, cuFloatComplex beta, int baseIndex); - +/** +* \fn void spgpuShellspmm (spgpuHandle_t handle,int count,__device cuFloatComplex *z,int zpitch,const __device cuFloatComplex *y,int ypitch,cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device cuFloatComplex *x,int xpitch,cuFloatComplex beta,int baseIndex) + * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU. + * \param handle The spgpu handle used to call this routine + * \param count The cols count + * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y). + * \param zpitch The pitch of the output vector + * \param y The y input vector + * \param ypitch The pitch of the y input vector + * \param alpha The alpha scalar + * \param cM The HELL non zero values allocation pointer + * \param rP The HELL column indices allocation pointer + * \param hackSize The constant size of every hack (must be a multiple of 32). + * \param hackOffsets the array of base index offset for every hack of HELL non zero values allocation and HELL indices allocation. + * \param rS the array containing the row sized (in non zero elements) + * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format). + * \param rows the rows count + * \param x the x vector + * \param xpitch The pitch of the x input vector + * \param beta the beta scalar + * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran). + */ +void spgpuChellspmm(spgpuHandle_t handle, + int count, + __device cuFloatComplex *z, + int zpitch, + const __device cuFloatComplex *y, + int ypitch, + cuFloatComplex alpha, + const __device cuFloatComplex* cM, + const __device int* rP, + int hackSize, + const __device int* hackOffsets, + const __device int* rS, + const __device int* rIdx, + int rows, + const __device cuFloatComplex *x, + int xpitch, + cuFloatComplex beta, + int baseIndex); /** * \fn void spgpuZhellspmv (spgpuHandle_t handle,__device cuDoubleComplex *z,const __device cuDoubleComplex *y, cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuDoubleComplex *x, cuDoubleComplex beta, int baseIndex) @@ -197,6 +296,46 @@ void spgpuZhellspmv (spgpuHandle_t handle, cuDoubleComplex beta, int baseIndex); +/** +* \fn void spgpuShellspmm (spgpuHandle_t handle,int count,__device cuDoubleComplex *z,int zpitch,const __device cuDoubleComplex *y,int ypitch,cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device cuDoubleComplex *x,int xpitch,cuDoubleComplex beta,int baseIndex) + * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU. + * \param handle The spgpu handle used to call this routine + * \param count The cols count + * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y). + * \param zpitch The pitch of the output vector + * \param y The y input vector + * \param ypitch The pitch of the y input vector + * \param alpha The alpha scalar + * \param cM The HELL non zero values allocation pointer + * \param rP The HELL column indices allocation pointer + * \param hackSize The constant size of every hack (must be a multiple of 32). + * \param hackOffsets the array of base index offset for every hack of HELL non zero values allocation and HELL indices allocation. + * \param rS the array containing the row sized (in non zero elements) + * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format). + * \param rows the rows count + * \param x the x vector + * \param xpitch The pitch of the x input vector + * \param beta the beta scalar + * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran). + */ +void spgpuZhellspmm(spgpuHandle_t handle, + int count, + __device cuDoubleComplex *z, + int zpitch, + const __device cuDoubleComplex *y, + int ypitch, + cuDoubleComplex alpha, + const __device cuDoubleComplex* cM, + const __device int* rP, + int hackSize, + const __device int* hackOffsets, + const __device int* rS, + const __device int* rIdx, + int rows, + const __device cuDoubleComplex *x, + int xpitch, + cuDoubleComplex beta, + int baseIndex); /** @}*/ diff --git a/cuda/spgpu/kernels/Makefile b/cuda/spgpu/kernels/Makefile index c4ae38a0..8a71263f 100644 --- a/cuda/spgpu/kernels/Makefile +++ b/cuda/spgpu/kernels/Makefile @@ -17,7 +17,8 @@ OBJS=cabs.o camax.o casum.o caxpby.o caxy.o cdot.o cgath.o \ dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o \ ell_dcsput.o ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \ hdia_cspmv.o hdia_dspmv.o hdia_sspmv.o hdia_zspmv.o hell_cspmv.o hell_dspmv.o \ - hell_sspmv.o hell_zspmv.o igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \ + hell_sspmv.o hell_zspmv.o hell_cspmm.o hell_dspmm.o hell_sspmm.o hell_zspmm.o \ + igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \ saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o sabgdxyz.o\ zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o zabgdxyz.o \ sxyzw.o cxyzw.o dxyzw.o zxyzw.o diff --git a/cuda/spgpu/kernels/hell_cspmm.cu b/cuda/spgpu/kernels/hell_cspmm.cu new file mode 100644 index 00000000..dea4c60b --- /dev/null +++ b/cuda/spgpu/kernels/hell_cspmm.cu @@ -0,0 +1,35 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2014 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" +#include "cuComplex.h" + +extern "C" +{ +#include "core.h" +#include "hell.h" + int getGPUSharedMemPerBlock(); + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + +#include "debug.h" + +#define VALUE_TYPE cuFloatComplex +#define TYPE_SYMBOL C +#define TEX_FETCH_TYPE cuFloatComplex +#include "hell_spmm_base.cuh" diff --git a/cuda/spgpu/kernels/hell_dspmm.cu b/cuda/spgpu/kernels/hell_dspmm.cu new file mode 100644 index 00000000..e6b263b6 --- /dev/null +++ b/cuda/spgpu/kernels/hell_dspmm.cu @@ -0,0 +1,35 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2014 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" +#include + +extern "C" +{ +#include "core.h" +#include "hell.h" + int getGPUSharedMemPerBlock(); + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + +#include "debug.h" + +#define VALUE_TYPE double +#define TYPE_SYMBOL D +#define TEX_FETCH_TYPE int2 +#include "hell_spmm_base.cuh" diff --git a/cuda/spgpu/kernels/hell_dspmv.cu b/cuda/spgpu/kernels/hell_dspmv.cu index 509131d4..4e120869 100644 --- a/cuda/spgpu/kernels/hell_dspmv.cu +++ b/cuda/spgpu/kernels/hell_dspmv.cu @@ -32,374 +32,3 @@ extern "C" #define TYPE_SYMBOL D #define TEX_FETCH_TYPE int2 #include "hell_spmv_base.cuh" - - - -#if 0 - -#define MMBSZ 8 - -#undef GEN_SPGPU_HELL_NAME -#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm) -#undef GEN_SPGPU_HELL_NAME_VANILLA -#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla) - - -__global__ void -CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) - (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch, - VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, - int hackSize, const int* hackOffsets, const int* rS, int rows, - const VALUE_TYPE *x, int xPitch, - VALUE_TYPE beta, int baseIndex) -{ - VALUE_TYPE *pz,*px,*py; - VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)(); - VALUE_TYPE yVal; - __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK]; - - int *rrP; - VALUE_TYPE *rcM; - - unsigned int i = threadIdx.x + blockIdx.x * blockDim.x; - unsigned int gridSize = gridDim.x * blockDim.x; - - while (i < rows) { - int j; - int hackId = i / hackSize; - int hackLaneId = i % hackSize; - - int hackOffset; - unsigned int laneId = threadIdx.x % 32; - if (laneId == 0) - hackOffset = hackOffsets[hackId]; - //__syncthreads(); - hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId; - - rrP = (int *) rP + hackOffset; - rcM = (VALUE_TYPE *) cM + hackOffset; - - int rowSize = rS[i]; - for (int k=0; k maxShmemSz) { - fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz); - return; - } - cnt = count; - px = (VALUE_TYPE *) x; - py = (VALUE_TYPE *) y; - pz = (VALUE_TYPE *) z; - while (cnt > 2*MMBSZ) { - CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) - <<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch, - alpha, cM, rP, hackSize, hackOffsets, - rS, rows, px, xPitch, beta, baseIndex); - px += xPitch*MMBSZ; - py += yPitch*MMBSZ; - pz += zPitch*MMBSZ; - cnt -= MMBSZ; - } - if (cnt > MMBSZ) { - c1 = cnt/2; - CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) - <<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch, - alpha, cM, rP, hackSize, hackOffsets, - rS, rows, px, xPitch, beta, baseIndex); - cnt -= c1; - } - if (cnt > MMBSZ) { - fprintf(stderr,"Invalid residual count %d\n",cnt); - } else if (cnt > 0){ - CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) - <<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch, - alpha, cM, rP, hackSize, hackOffsets, - rS, rows, px, xPitch, beta, baseIndex); - } - cudaCheckError("CUDA error on hell_spmm"); -} - -#elif defined(NEW_MM) - -#define MMBSZ 8 - -#undef GEN_SPGPU_HELL_NAME -#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm) -#undef GEN_SPGPU_HELL_NAME_VANILLA -#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla) - - -__global__ void -CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) - (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch, - VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, - int hackSize, const int* hackOffsets, const int* rS, int rows, - const VALUE_TYPE *x, int xPitch, - VALUE_TYPE beta, int baseIndex) -{ - VALUE_TYPE *pz,*px,*py; - VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)(); - VALUE_TYPE yVal; - __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK]; - - int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK); - - if (i < rows) { - int j; - int hackId = i / hackSize; - int hackLaneId = i % hackSize; - - int hackOffset; - unsigned int laneId = threadIdx.x % 32; - if (laneId == 0) - hackOffset = hackOffsets[hackId]; - //__syncthreads(); - hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId; - - rP += hackOffset; - cM += hackOffset; - - int rowSize = rS[i]; - for (int k=0; kcurrentStream >>> (count, z, zPitch,y, yPitch, - alpha, cM, rP, hackSize, hackOffsets, rS, rows, - x, xPitch, beta, baseIndex); -} - -void -GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) - (spgpuHandle_t handle, - int count, - VALUE_TYPE* z, - int zPitch, - const VALUE_TYPE *y, - int yPitch, - VALUE_TYPE alpha, - const VALUE_TYPE* cM, - const int* rP, - int hackSize, - const __device int* hackOffsets, - const __device int* rS, - const __device int* rIdx, - int rows, - const VALUE_TYPE *x, - int xPitch, - VALUE_TYPE beta, - int baseIndex) -{ - VALUE_TYPE *px,*py, *pz; - int cnt; - int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX); - - // maxNForACall should be a multiple of hackSize - maxNForACall = (maxNForACall/hackSize)*hackSize; - int maxShmemSz; - maxShmemSz=getGPUSharedMemPerBlock(); - //fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz); - while (rows > maxNForACall) {//managing large vectors - cnt = count; - px = (VALUE_TYPE *) x; - py = (VALUE_TYPE *) y; - pz = (VALUE_TYPE *) z; - while (cnt > MMBSZ) { - //fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz); - CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, - py, yPitch, - alpha, cM, rP, - hackSize, hackOffsets, - rS, rIdx, - maxNForACall, - px, xPitch, beta, baseIndex); - px += xPitch*MMBSZ; - py += yPitch*MMBSZ; - pz += zPitch*MMBSZ; - cnt -= MMBSZ; - } - if (cnt >0) { - CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch, - py, yPitch, - alpha, cM, rP, - hackSize, hackOffsets, - rS, rIdx, - maxNForACall, - px, xPitch, beta, baseIndex); - } - - y = y + maxNForACall; - z = z + maxNForACall; - hackOffsets = hackOffsets + maxNForACall/hackSize; - rS = rS + maxNForACall; - - rows -= maxNForACall; - } - cnt = count; - px = (VALUE_TYPE *) x; - py = (VALUE_TYPE *) y; - pz = (VALUE_TYPE *) z; - while (cnt > MMBSZ) { - //fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz); - CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch, - alpha, cM, rP, hackSize, hackOffsets, - rS, rIdx, rows, - px, xPitch, beta, baseIndex); - px += xPitch*MMBSZ; - py += yPitch*MMBSZ; - pz += zPitch*MMBSZ; - cnt -= MMBSZ; - } - if (cnt >0) { - CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch, - py, yPitch, - alpha, cM, rP, - hackSize, hackOffsets, - rS, rIdx, - rows, - px, xPitch, beta, baseIndex); - } - - - cudaCheckError("CUDA error on hell_spmm"); -} - - -#endif - diff --git a/cuda/spgpu/kernels/hell_spmm_base.cuh b/cuda/spgpu/kernels/hell_spmm_base.cuh new file mode 100644 index 00000000..1aaf6b85 --- /dev/null +++ b/cuda/spgpu/kernels/hell_spmm_base.cuh @@ -0,0 +1,275 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2015 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define PRE_CONCAT(A, B) A ## B +#define CONCAT(A, B) PRE_CONCAT(A, B) + +#undef GEN_SPGPU_HELL_NAME +#undef X_TEX +#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX) + +__device__ __host__ static float zero_float() { return 0.0f; } +__device__ __host__ static cuFloatComplex zero_cuFloatComplex() { return make_cuFloatComplex(0.0, 0.0); } +__device__ __host__ static bool float_isNotZero(float x) { return x != 0.0f; } + +__device__ static float float_fma(float a, float b, float c) { return PREC_FADD(PREC_FMUL (a, b), c); } +__device__ static float float_add(float a, float b) { return PREC_FADD (a, b); } +__device__ static float float_mul(float a, float b) { return PREC_FMUL (a, b); } + +__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c) { return cuCfmaf(a, b, c); } +__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b) { return cuCaddf(a, b); } +__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b) { return cuCmulf(a, b); } + +__device__ static float readValue_float(float fetch) { return fetch; } +__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch) { return fetch; } + +// host or c.c >= 1.3 +#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__) +__device__ __host__ static double zero_double() { return 0.0; } +__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex() { return make_cuDoubleComplex(0.0, 0.0); } +__device__ __host__ static bool double_isNotZero(double x) { return x != 0.0; } + +__device__ static double double_fma(double a, double b, double c) { return PREC_DADD(PREC_DMUL (a, b), c); } +__device__ static double double_add(double a, double b) { return PREC_DADD (a, b); } +__device__ static double double_mul(double a, double b) { return PREC_DMUL (a, b); } + +__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c) { return cuCfma(a, b, c); } +__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a, b); } +__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a, b); } + +__device__ static double readValue_double(int2 fetch) { return __hiloint2double (fetch.y, fetch.x); } +__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch) +{ + cuDoubleComplex c; + c.x = __hiloint2double (fetch.y, fetch.x); + c.y = __hiloint2double (fetch.w, fetch.z); + return c; +} +#endif +#if 0 +// Texture cache management +texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX; + +#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x) +#define unbind_tex_x(x) cudaUnbindTexture(X_TEX) + +__device__ static VALUE_TYPE +fetchTex (int pointer) +{ + TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer); + return CONCAT(readValue_,VALUE_TYPE) (fetch); +} +#endif +#if __CUDA_ARCH__ < 300 +extern __shared__ int dynShrMem[]; +#endif + +#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla) +#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla) +#include "hell_spmm_base_template.cuh" +#undef GEN_SPGPU_HELL_NAME +#if 0 +#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch) +#define GEN_SPGPU_HELL_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch) +#undef USE_PREFETCHING +#define USE_PREFETCHING +#include "hell_spmm_base_template.cuh" +#define ENABLE_CACHE +#undef GEN_SPGPU_HELL_NAME +#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch) +#define GEN_SPGPU_HELL_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch) +#include "hell_spmm_base_template.cuh" +#undef GEN_SPGPU_HELL_NAME +#undef USE_PREFETCHING +#endif +#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache) +#define GEN_SPGPU_HELL_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache) +#include "hell_spmm_base_template.cuh" +#undef GEN_SPGPU_HELL_NAME +#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm) + +#if 0 + +void +GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) +(spgpuHandle_t handle, + int count, + VALUE_TYPE* z, + int zPitch, + const VALUE_TYPE *y, + int yPitch, + VALUE_TYPE alpha, + const VALUE_TYPE* cM, + const int* rP, + int hackSize, + const __device int* hackOffsets, + const __device int* rS, + const __device int* rIdx, + int rows, + const VALUE_TYPE *x, + int xPitch, + VALUE_TYPE beta, + int baseIndex) +{ + VALUE_TYPE *px,*py, *pz; + int cnt, c1; + + dim3 block (THREAD_BLOCK, 1); + // dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK); + // Should we generalize the code to 1/2/4/8 threads per row? + // And maybe adjust THREAD_BLOCK size? + int shrMemSize,maxShmemSz; + int numMp=getGPUMultiProcessors(); + int maxThMp=getGPUMaxThreadsPerMP(); + int nmblksMp=maxThMp/THREAD_BLOCK; + int nmblk=nmblksMp*numMp; + dim3 grid (nmblk); + + maxShmemSz=getGPUSharedMemPerBlock(); + shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE); + if (shrMemSize > maxShmemSz) { + fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz); + return; + } + cnt = count; + px = (VALUE_TYPE *) x; + py = (VALUE_TYPE *) y; + pz = (VALUE_TYPE *) z; + while (cnt > 2*MMBSZ) { + CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) + <<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch, + alpha, cM, rP, hackSize, hackOffsets, + rS, rows, px, xPitch, beta, baseIndex); + px += xPitch*MMBSZ; + py += yPitch*MMBSZ; + pz += zPitch*MMBSZ; + cnt -= MMBSZ; + } + if (cnt > MMBSZ) { + c1 = cnt/2; + CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) + <<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch, + alpha, cM, rP, hackSize, hackOffsets, + rS, rows, px, xPitch, beta, baseIndex); + cnt -= c1; + } + if (cnt > MMBSZ) { + fprintf(stderr,"Invalid residual count %d\n",cnt); + } else if (cnt > 0){ + CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) + <<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch, + alpha, cM, rP, hackSize, hackOffsets, + rS, rows, px, xPitch, beta, baseIndex); + } + cudaCheckError("CUDA error on hell_spmm"); +} + +#endif + +void +GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) +(spgpuHandle_t handle, + int count, + VALUE_TYPE* z, + int zPitch, + const VALUE_TYPE *y, + int yPitch, + VALUE_TYPE alpha, + const VALUE_TYPE* cM, + const int* rP, + int hackSize, + const __device int* hackOffsets, + const __device int* rS, + const __device int* rIdx, + int rows, + const VALUE_TYPE *x, + int xPitch, + VALUE_TYPE beta, + int baseIndex) +{ + VALUE_TYPE *px,*py, *pz; + int cnt; + int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX); + + // maxNForACall should be a multiple of hackSize + maxNForACall = (maxNForACall/hackSize)*hackSize; + int maxShmemSz; + maxShmemSz=getGPUSharedMemPerBlock(); + //fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz); + while (rows > maxNForACall) {//managing large vectors + cnt = count; + px = (VALUE_TYPE *) x; + py = (VALUE_TYPE *) y; + pz = (VALUE_TYPE *) z; + while (cnt > MMBSZ) { + //fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz); + CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, + py, yPitch, + alpha, cM, rP, + hackSize, hackOffsets, + rS, rIdx, + maxNForACall, + px, xPitch, beta, baseIndex); + px += xPitch*MMBSZ; + py += yPitch*MMBSZ; + pz += zPitch*MMBSZ; + cnt -= MMBSZ; + } + if (cnt >0) { + CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch, + py, yPitch, + alpha, cM, rP, + hackSize, hackOffsets, + rS, rIdx, + maxNForACall, + px, xPitch, beta, baseIndex); + } + + y = y + maxNForACall; + z = z + maxNForACall; + hackOffsets = hackOffsets + maxNForACall/hackSize; + rS = rS + maxNForACall; + + rows -= maxNForACall; + } + cnt = count; + px = (VALUE_TYPE *) x; + py = (VALUE_TYPE *) y; + pz = (VALUE_TYPE *) z; + while (cnt > MMBSZ) { + //fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz); + CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch, + alpha, cM, rP, hackSize, hackOffsets, + rS, rIdx, rows, + px, xPitch, beta, baseIndex); + px += xPitch*MMBSZ; + py += yPitch*MMBSZ; + pz += zPitch*MMBSZ; + cnt -= MMBSZ; + } + if (cnt >0) { + CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch, + py, yPitch, + alpha, cM, rP, + hackSize, hackOffsets, + rS, rIdx, + rows, + px, xPitch, beta, baseIndex); + } + + + cudaCheckError("CUDA error on hell_spmm"); +} diff --git a/cuda/spgpu/kernels/hell_spmm_base_template.cuh b/cuda/spgpu/kernels/hell_spmm_base_template.cuh new file mode 100644 index 00000000..1d085875 --- /dev/null +++ b/cuda/spgpu/kernels/hell_spmm_base_template.cuh @@ -0,0 +1,197 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2015 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#define IDX2 +#define THREAD_BLOCK 128 +#define MMBSZ 8 + +#if 0 + +__global__ void +CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) + (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch, + VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, + int hackSize, const int* hackOffsets, const int* rS, int rows, + const VALUE_TYPE *x, int xPitch, + VALUE_TYPE beta, int baseIndex) +{ + VALUE_TYPE *pz,*px,*py; + VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)(); + VALUE_TYPE yVal; + __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK]; + + int *rrP; + VALUE_TYPE *rcM; + + unsigned int i = threadIdx.x + blockIdx.x * blockDim.x; + unsigned int gridSize = gridDim.x * blockDim.x; + + while (i < rows) { + int j; + int hackId = i / hackSize; + int hackLaneId = i % hackSize; + + int hackOffset; + unsigned int laneId = threadIdx.x % 32; + if (laneId == 0) + hackOffset = hackOffsets[hackId]; + //__syncthreads(); + hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId; + + rrP = (int *) rP + hackOffset; + rcM = (VALUE_TYPE *) cM + hackOffset; + + int rowSize = rS[i]; + for (int k=0; kcurrentStream >>> (count, z, zPitch,y, yPitch, + alpha, cM, rP, hackSize, hackOffsets, rS, rows, + x, xPitch, beta, baseIndex); +} diff --git a/cuda/spgpu/kernels/hell_sspmm.cu b/cuda/spgpu/kernels/hell_sspmm.cu new file mode 100644 index 00000000..7f50c035 --- /dev/null +++ b/cuda/spgpu/kernels/hell_sspmm.cu @@ -0,0 +1,34 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2014 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" + +extern "C" +{ +#include "core.h" +#include "hell.h" + int getGPUSharedMemPerBlock(); + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + +#include "debug.h" + +#define VALUE_TYPE float +#define TYPE_SYMBOL S +#define TEX_FETCH_TYPE float +#include "hell_spmm_base.cuh" diff --git a/cuda/spgpu/kernels/hell_zspmm.cu b/cuda/spgpu/kernels/hell_zspmm.cu new file mode 100644 index 00000000..13a12046 --- /dev/null +++ b/cuda/spgpu/kernels/hell_zspmm.cu @@ -0,0 +1,35 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2014 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" +#include "cuComplex.h" + +extern "C" +{ +#include "core.h" +#include "hell.h" + int getGPUSharedMemPerBlock(); + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + +#include "debug.h" + +#define VALUE_TYPE cuDoubleComplex +#define TYPE_SYMBOL Z +#define TEX_FETCH_TYPE int4 +#include "hell_spmm_base.cuh"