From beb418e00ba0857ea6bea2b7647f70610e7dc4fb Mon Sep 17 00:00:00 2001 From: gabrielequatrana Date: Wed, 10 Apr 2024 14:57:15 +0200 Subject: [PATCH] Fixed SpMM for ELG (AXPBY GPU not working) --- configure.ac | 0 cuda/cuda_util.c | 2 - cuda/dvectordev.c | 10 +-- cuda/impl/psb_d_cuda_csrg_csmm.F90 | 2 +- cuda/impl/psb_d_cuda_csrg_vect_mv.F90 | 6 +- cuda/impl/psb_d_cuda_elg_csmm.F90 | 6 +- cuda/impl/psb_d_cuda_elg_vect_mv.F90 | 91 ++++++++++++++++++++++++++ cuda/impl/psb_d_cuda_hlg_vect_mv.F90 | 89 +++++++++++++++++++++++++ cuda/psb_d_cuda_elg_mat_mod.F90 | 10 +++ cuda/psb_d_cuda_hlg_mat_mod.F90 | 10 +++ ext/impl/Makefile | 0 test/block_krylov/kernel/dpdegenmm.F90 | 63 +++++++++--------- test/cudakern/Makefile | 0 test/cudakern/d_file_spmv.F90 | 4 +- 14 files changed, 244 insertions(+), 49 deletions(-) mode change 100755 => 100644 configure.ac mode change 100755 => 100644 ext/impl/Makefile mode change 100755 => 100644 test/cudakern/Makefile diff --git a/configure.ac b/configure.ac old mode 100755 new mode 100644 diff --git a/cuda/cuda_util.c b/cuda/cuda_util.c index 28a159c1..e0ab9bdc 100644 --- a/cuda/cuda_util.c +++ b/cuda/cuda_util.c @@ -181,7 +181,6 @@ int writeRemoteBuffer(void* hostSrc, void* buffer, int count) } } -// TODO int writeRemoteBufferR2(void* hostSrc, int hpitch, void* buffer, int count, int pitch, int size) { cudaError_t err = cudaMemcpy2D(buffer, pitch, hostSrc, hpitch, size, count, cudaMemcpyHostToDevice); @@ -220,7 +219,6 @@ int readRemoteBuffer(void* hostDest, void* buffer, int count) } } -// TODO sistemare pitch e size (si possono gestire senza realloc su fortran) int readRemoteBufferR2(void* hostDest, int hpitch, void* buffer, int count, int pitch, int size) { cudaError_t err = cudaMemcpy2D(hostDest, hpitch, buffer, pitch, size, count, cudaMemcpyDeviceToHost); diff --git a/cuda/dvectordev.c b/cuda/dvectordev.c index 384569e5..bba7d136 100644 --- a/cuda/dvectordev.c +++ b/cuda/dvectordev.c @@ -59,8 +59,6 @@ int writeMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld) i = writeRemoteBufferR2((void*) hostVec, ld*sizeof(double), (void *)devVec->v_, (devVec->count_), sizeof(double)*(devVec->pitch_), (devVec->size_)*sizeof(double)); -// i = writeMultiVecDeviceDouble(deviceVec, (void *) hostVec); - //fprintf(stderr,"From routine : %s : %p %p\n","writeMultiVecDeviceDoubleR2",devVec->v_,devVec->v_+devVec->pitch_); if (i != 0) { fprintf(stderr,"From routine : %s : %d \n","writeMultiVecDeviceDoubleR2",i); } @@ -85,8 +83,6 @@ int readMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld) i = readRemoteBufferR2((void *) hostVec, ld*sizeof(double), (void *)devVec->v_, devVec->count_, sizeof(double)*(devVec->pitch_), (devVec->size_)*sizeof(double)); -// i = readMultiVecDeviceDouble(deviceVec, hostVec); -// fprintf(stderr,"From routine : %s : %p \n","readMultiVecDeviceDoubleR2",devVec->v_); if (i != 0) { fprintf(stderr,"From routine : %s : %d \n","readMultiVecDeviceDoubleR2",i); } @@ -247,9 +243,9 @@ int axpbyMultiVecDeviceDouble(int n,double alpha, void* devMultiVecX, return SPGPU_UNSUPPORTED; for(j=0;jcount_;j++) - //fprintf(stderr,"CUDA ENTERED %d %d %d %d \n",j, n, pitch, devVecY->size_); - spgpuDaxpby(handle,(double*)devVecY->v_+pitch*j, n, beta, - (double*)devVecY->v_+pitch*j, alpha,(double*) devVecX->v_+pitch*j); + fprintf(stderr,"CUDA ENTERED %d %p %d %d %d \n",j, (((double *)(devVecX->v_))+(pitch)*j), n, pitch, devVecY->size_); + spgpuDaxpby(handle,(((double *)(devVecY->v_))+(pitch)*j), n, beta, + (((double *)(devVecY->v_))+(pitch)*j), alpha,(((double *)(devVecX->v_))+(pitch)*j)); return(i); } diff --git a/cuda/impl/psb_d_cuda_csrg_csmm.F90 b/cuda/impl/psb_d_cuda_csrg_csmm.F90 index 4515eb31..85809e7d 100644 --- a/cuda/impl/psb_d_cuda_csrg_csmm.F90 +++ b/cuda/impl/psb_d_cuda_csrg_csmm.F90 @@ -107,7 +107,7 @@ subroutine psb_d_cuda_csrg_csmm(alpha,a,x,beta,y,info,trans) & info = writeMultiVecDevice(gpY,y,size(y,1)) if (info == 0) & - & info = spmvCSRGDevice(a%deviceMat,alpha,gpX,beta,gpY) + & info = spmmCSRGDevice(a%deviceMat,alpha,gpX,beta,gpY) if (info == 0) & & info = readMultiVecDevice(gpY,y,size(y,1)) if (info /= 0) goto 9999 diff --git a/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 b/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 index 03526be8..07ae7a17 100644 --- a/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 +++ b/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 @@ -184,13 +184,15 @@ subroutine psb_d_cuda_csrg_multivect_mv(alpha,a,x,beta,y,info,trans) class default rx = xx%get_vect() ry = y%get_vect() - call a%psb_d_csr_sparse_mat%spmm(alpha,rx,beta,ry,info) + if (a%is_dev()) call a%sync() + call a%spmm(alpha,rx,beta,ry,info) call y%bld(ry) end select class default rx = x%get_vect() ry = y%get_vect() - call a%psb_d_csr_sparse_mat%spmm(alpha,rx,beta,ry,info) + if (a%is_dev()) call a%sync() + call a%spmm(alpha,rx,beta,ry,info) call y%bld(ry) end select end if diff --git a/cuda/impl/psb_d_cuda_elg_csmm.F90 b/cuda/impl/psb_d_cuda_elg_csmm.F90 index f77d72d8..34c1d12f 100644 --- a/cuda/impl/psb_d_cuda_elg_csmm.F90 +++ b/cuda/impl/psb_d_cuda_elg_csmm.F90 @@ -98,16 +98,16 @@ subroutine psb_d_cuda_elg_csmm(alpha,a,x,beta,y,info,trans) if (info == 0) & & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_double) if (info == 0) & - & info = writeMultiVecDevice(gpX,x,nxy) + & info = writeMultiVecDevice(gpX,x,size(x,1)) if (info == 0) & & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_double) if (info == 0) & - & info = writeMultiVecDevice(gpY,y,nxy) + & info = writeMultiVecDevice(gpY,y,size(y,1)) if (info == 0) & & info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY) if (info == 0) & - & info = readMultiVecDevice(gpY,y,nxy) + & info = readMultiVecDevice(gpY,y,size(y,1)) if (info /= 0) goto 9999 call freeMultiVecDevice(gpX) call freeMultiVecDevice(gpY) diff --git a/cuda/impl/psb_d_cuda_elg_vect_mv.F90 b/cuda/impl/psb_d_cuda_elg_vect_mv.F90 index f0b83c2b..bead49c5 100644 --- a/cuda/impl/psb_d_cuda_elg_vect_mv.F90 +++ b/cuda/impl/psb_d_cuda_elg_vect_mv.F90 @@ -119,3 +119,94 @@ subroutine psb_d_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans) return end subroutine psb_d_cuda_elg_vect_mv + +subroutine psb_d_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans) + + use psb_base_mod + use elldev_mod + use psb_vectordev_mod + use psb_d_cuda_elg_mat_mod, psb_protect_name => psb_d_cuda_elg_multivect_mv + use psb_d_cuda_multivect_mod + implicit none + class(psb_d_cuda_elg_sparse_mat), intent(in) :: a + real(psb_dpk_), intent(in) :: alpha, beta + class(psb_d_base_multivect_type), intent(inout) :: x + class(psb_d_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + real(psb_dpk_), allocatable :: rx(:,:), ry(:,:) + logical :: tra + character :: trans_ + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='d_cuda_elg_multivect_mv' + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + + + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + if (tra) then + if (a%is_dev()) call a%sync() + if (.not.x%is_host()) call x%sync() + if (beta /= dzero) then + if (.not.y%is_host()) call y%sync() + end if + call a%psb_d_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans) + call y%set_host() + else + if (a%is_host()) call a%sync() + select type (xx => x) + type is (psb_d_multivect_cuda) + select type(yy => y) + type is (psb_d_multivect_cuda) + if (a%is_host()) call a%sync() + if (xx%is_host()) call xx%sync() + if (beta /= dzero) then + if (yy%is_host()) call yy%sync() + end if + info = spmvEllDevice(a%deviceMat,alpha,xx%deviceVect,& + & beta,yy%deviceVect) + if (info /= 0) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/)) + info = psb_err_from_subroutine_ai_ + goto 9999 + end if + call yy%set_dev() + class default + if (a%is_dev()) call a%sync() + rx = xx%get_vect() + ry = y%get_vect() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + class default + if (a%is_dev()) call a%sync() + rx = x%get_vect() + ry = y%get_vect() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + + end if + if (info /= 0) goto 9999 + call psb_erractionrestore(err_act) + return + + 9999 call psb_error_handler(err_act) + + return + + end subroutine psb_d_cuda_elg_multivect_mv diff --git a/cuda/impl/psb_d_cuda_hlg_vect_mv.F90 b/cuda/impl/psb_d_cuda_hlg_vect_mv.F90 index cccba74b..a3f18cca 100644 --- a/cuda/impl/psb_d_cuda_hlg_vect_mv.F90 +++ b/cuda/impl/psb_d_cuda_hlg_vect_mv.F90 @@ -117,3 +117,92 @@ subroutine psb_d_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans) return end subroutine psb_d_cuda_hlg_vect_mv + +subroutine psb_d_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) + + use psb_base_mod + use hlldev_mod + use psb_vectordev_mod + use psb_d_cuda_hlg_mat_mod, psb_protect_name => psb_d_cuda_hlg_multivect_mv + use psb_d_cuda_multivect_mod + implicit none + class(psb_d_cuda_hlg_sparse_mat), intent(in) :: a + real(psb_dpk_), intent(in) :: alpha, beta + class(psb_d_base_multivect_type), intent(inout) :: x + class(psb_d_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + real(psb_dpk_), allocatable :: rx(:,:), ry(:,:) + logical :: tra + character :: trans_ + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='d_cuda_hlg_multivect_mv' + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + if (tra) then + if (.not.x%is_host()) call x%sync() + if (beta /= dzero) then + if (.not.y%is_host()) call y%sync() + end if + if (a%is_dev()) call a%sync() + call a%psb_d_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans) + call y%set_host() + else + if (a%is_host()) call a%sync() + select type (xx => x) + type is (psb_d_multivectcuda) + select type(yy => y) + type is (psb_d_multivect_cuda) + if (xx%is_host()) call xx%sync() + if (beta /= dzero) then + if (yy%is_host()) call yy%sync() + end if + info = spmvhllDevice(a%deviceMat,alpha,xx%deviceVect,& + & beta,yy%deviceVect) + if (info /= 0) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/)) + info = psb_err_from_subroutine_ai_ + goto 9999 + end if + call yy%set_dev() + class default + rx = xx%get_vect() + ry = y%get_vect() + if (a%is_dev()) call a%sync() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + class default + rx = x%get_vect() + ry = y%get_vect() + if (a%is_dev()) call a%sync() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + + end if + if (info /= 0) goto 9999 + call psb_erractionrestore(err_act) + return + + 9999 call psb_error_handler(err_act) + + return + + end subroutine psb_d_cuda_hlg_multivect_mv diff --git a/cuda/psb_d_cuda_elg_mat_mod.F90 b/cuda/psb_d_cuda_elg_mat_mod.F90 index 1af80f2a..d0c2b967 100644 --- a/cuda/psb_d_cuda_elg_mat_mod.F90 +++ b/cuda/psb_d_cuda_elg_mat_mod.F90 @@ -56,6 +56,7 @@ module psb_d_cuda_elg_mat_mod procedure, nopass :: get_fmt => d_cuda_elg_get_fmt procedure, pass(a) :: sizeof => d_cuda_elg_sizeof procedure, pass(a) :: vect_mv => psb_d_cuda_elg_vect_mv + procedure, pass(a) :: multivect_mv => psb_d_cuda_elg_multivect_mv procedure, pass(a) :: csmm => psb_d_cuda_elg_csmm procedure, pass(a) :: csmv => psb_d_cuda_elg_csmv procedure, pass(a) :: in_vect_sv => psb_d_cuda_elg_inner_vect_sv @@ -101,6 +102,15 @@ module psb_d_cuda_elg_mat_mod integer(psb_ipk_), intent(out) :: info character, optional, intent(in) :: trans end subroutine psb_d_cuda_elg_vect_mv + subroutine psb_d_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans) + import :: psb_d_cuda_elg_sparse_mat, psb_dpk_, psb_d_base_multivect_type, psb_ipk_ + class(psb_d_cuda_elg_sparse_mat), intent(in) :: a + real(psb_dpk_), intent(in) :: alpha, beta + class(psb_d_base_multivect_type), intent(inout) :: x + class(psb_d_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_d_cuda_elg_multivect_mv end interface interface diff --git a/cuda/psb_d_cuda_hlg_mat_mod.F90 b/cuda/psb_d_cuda_hlg_mat_mod.F90 index 6627f824..c6313d1f 100644 --- a/cuda/psb_d_cuda_hlg_mat_mod.F90 +++ b/cuda/psb_d_cuda_hlg_mat_mod.F90 @@ -56,6 +56,7 @@ module psb_d_cuda_hlg_mat_mod procedure, nopass :: get_fmt => d_cuda_hlg_get_fmt procedure, pass(a) :: sizeof => d_cuda_hlg_sizeof procedure, pass(a) :: vect_mv => psb_d_cuda_hlg_vect_mv + procedure, pass(a) :: multivect_mv => psb_d_cuda_hlg_multivect_mv procedure, pass(a) :: csmm => psb_d_cuda_hlg_csmm procedure, pass(a) :: csmv => psb_d_cuda_hlg_csmv procedure, pass(a) :: in_vect_sv => psb_d_cuda_hlg_inner_vect_sv @@ -97,6 +98,15 @@ module psb_d_cuda_hlg_mat_mod integer(psb_ipk_), intent(out) :: info character, optional, intent(in) :: trans end subroutine psb_d_cuda_hlg_vect_mv + subroutine psb_d_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) + import :: psb_d_cuda_hlg_sparse_mat, psb_dpk_, psb_d_base_multivect_type, psb_ipk_ + class(psb_d_cuda_hlg_sparse_mat), intent(in) :: a + real(psb_dpk_), intent(in) :: alpha, beta + class(psb_d_base_multivect_type), intent(inout) :: x + class(psb_d_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_d_cuda_hlg_multivect_mv end interface interface diff --git a/ext/impl/Makefile b/ext/impl/Makefile old mode 100755 new mode 100644 diff --git a/test/block_krylov/kernel/dpdegenmm.F90 b/test/block_krylov/kernel/dpdegenmm.F90 index 2fdc3aa1..448fb155 100644 --- a/test/block_krylov/kernel/dpdegenmm.F90 +++ b/test/block_krylov/kernel/dpdegenmm.F90 @@ -596,7 +596,7 @@ program pdegenmm ! solver parameters integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes real(psb_dpk_) :: err, eps - integer, parameter :: ntests=1, ngpu=50, ncnv=20 + integer, parameter :: ntests=50, ngpu=50, ncnv=20 type(psb_d_coo_sparse_mat), target :: acoo type(psb_d_csr_sparse_mat), target :: acsr type(psb_d_ell_sparse_mat), target :: aell @@ -612,7 +612,9 @@ program pdegenmm #if CUDA_SHORT_VERSION <= 10 type(psb_d_cuda_hybg_sparse_mat), target :: ahybg #endif + ! TODO HLG da fare (complesso) type(psb_d_cuda_hlg_sparse_mat), target :: ahlg + ! TODO HDIAG E DNSG non hanno nemmeno CSMM type(psb_d_cuda_hdiag_sparse_mat), target :: ahdiag type(psb_d_cuda_dnsg_sparse_mat), target :: adnsg #endif @@ -624,9 +626,7 @@ program pdegenmm character(len=20) :: name,ch_err character(len=40) :: fname - real(psb_dpk_), allocatable :: test(:,:), test1(:,:), test2(:) - - type(c_ptr) :: gpx, gpy + real(psb_dpk_) :: random_value info=psb_success_ @@ -804,30 +804,37 @@ program pdegenmm ! FIXME: cache flush needed here x1 = b_mv%get_vect() x2 = b_mv_g%get_vect() - write(*,*) + +! ! TODO test AXPBY + call psb_geall(xg,desc_a,info) + call psb_geasb(xg,desc_a,info,mold=tmold) + call xg%set(done) + call xg%sync() + call psb_geall(bg,desc_a,info) + call psb_geasb(bg,desc_a,info,mold=tmold) + !call bg%set(done+done) + +! ! TODO: Non funziona spgpuDaxpby (axpbyMultiVecDeviceDouble) + call psb_geaxpby(done,xg,dzero,bg,desc_a,info) + call psb_cuda_DeviceSync() + + write(*,*) 'BG ', bg%is_dev(), bg%is_host(), bg%is_sync() + call bg%sync() + write(*,*) 'BG ', bg%is_dev(), bg%is_host(), bg%is_sync() do i=1,8 - write(*,*) x1(i,:) + write(*,*) bg%v%v(i) end do - ! TODO test AXPBY e SPMM -! call psb_geall(xg,desc_a,info) -! call psb_geasb(xg,desc_a,info,mold=tmold) -! call xg%set(done) -! call xg%sync() -! call psb_geall(bg,desc_a,info) -! call psb_geasb(bg,desc_a,info,mold=tmold) -! call bg%set(done+done) + return -! ! TODO: Non funziona spgpuDaxpby (axpbyMultiVecDeviceDouble) -! call psb_geaxpby(done,xg,dzero,bg,desc_a,info) -! call psb_cuda_DeviceSync() -! call psb_barrier(ctxt) - -! write(*,*) 'BG ', bg%is_dev(), bg%is_host(), bg%is_sync() -! call bg%sync() -! write(*,*) 'BG ', bg%is_dev(), bg%is_host(), bg%is_sync() -! do i=1,8 -! write(*,*) bg%v%v(i) +! call x_mv_g%set(done) +! call x_mv_g%sync() + +! call psb_geaxpby(done,x_mv_g,dzero,b_mv_g,desc_a,info) + +! call b_mv_g%sync() +! do i=1,size(b_mv_g%v%v,1) +! write(*,*) b_mv_g%v%v(i,:) ! end do ! return @@ -848,10 +855,6 @@ program pdegenmm call psb_amx(ctxt,tt2) x1 = b_mv%get_vect() x2 = b_mv_g%get_vect() - write(*,*) - do i=1,size(x2,1) - write(*,*) x2(i,:) - end do nr = desc_a%get_local_rows() eps = maxval(abs(x1(1:nr,1:nrhs)-x2(1:nr,1:nrhs))) call psb_amx(ctxt,eps) @@ -879,10 +882,6 @@ program pdegenmm call b_mv_g%sync() x1 = b_mv%get_vect() x2 = b_mv_g%get_vect() - write(*,*) - do i=1,size(x2,1) - write(*,*) x2(i,:) - end do call psb_geaxpby(-done,b_mv_g,+done,b_mv,desc_a,info) eps = psb_geamax(b_mv,desc_a,info) diff --git a/test/cudakern/Makefile b/test/cudakern/Makefile old mode 100755 new mode 100644 diff --git a/test/cudakern/d_file_spmv.F90 b/test/cudakern/d_file_spmv.F90 index 2bbc0bc4..95ae323b 100644 --- a/test/cudakern/d_file_spmv.F90 +++ b/test/cudakern/d_file_spmv.F90 @@ -245,7 +245,7 @@ program d_file_spmv write(psb_out_unit,'("Partition type: graph")') write(psb_out_unit,'(" ")') ! write(psb_err_unit,'("Build type: graph")') - call build_mtpart(aux_a,np) + call build_mtpart(aux_a,int(np,psb_lpk_)) endif call psb_barrier(ctxt) call distr_mtpart(psb_root_,ctxt) @@ -274,7 +274,7 @@ program d_file_spmv nrg = desc_a%get_global_rows() call psb_geall(x_col,desc_a,info) do i=1, nr - call desc_a%l2g(i,ig,info) + call desc_a%l2g(int(i,psb_ipk_),ig,info) call psb_geins(ione,(/ig/),(/(done + (done*ig)/nrg)/),x_col,desc_a,info) end do call psb_geasb(x_col,desc_a,info)