From 0230fbb7afb9663287e2a94aa62b61e0511221a1 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Thu, 7 Dec 2023 12:09:00 +0100 Subject: [PATCH] Identified problems with CSRG. Will fix in a branch --- cuda/CUDA/psi_cuda_CopyCooToElg.cuh | 2 +- cuda/d_cusparse_mod.F90 | 2 + cuda/fcusparse_fct.h | 8 ++- cuda/impl/psb_c_cuda_cp_csrg_from_fmt.F90 | 2 +- cuda/impl/psb_d_cuda_cp_csrg_from_fmt.F90 | 2 +- cuda/impl/psb_d_cuda_csrg_to_gpu.F90 | 82 ++++++++++++++++++++++- cuda/impl/psb_s_cuda_cp_csrg_from_fmt.F90 | 2 +- cuda/impl/psb_z_cuda_cp_csrg_from_fmt.F90 | 2 +- 8 files changed, 93 insertions(+), 9 deletions(-) diff --git a/cuda/CUDA/psi_cuda_CopyCooToElg.cuh b/cuda/CUDA/psi_cuda_CopyCooToElg.cuh index 10a81a36..98aac050 100644 --- a/cuda/CUDA/psi_cuda_CopyCooToElg.cuh +++ b/cuda/CUDA/psi_cuda_CopyCooToElg.cuh @@ -89,7 +89,7 @@ GEN_PSI_FUNC_NAME(TYPE_SYMBOL) (spgpuHandle_t handle, int nr, int nc, int nza, int baseIdx, int hacksz, int ldv, int nzm, int *rS,int *devIdisp, int *devJa, VALUE_TYPE *devVal, int *idiag, int *rP, VALUE_TYPE *cM) -{ int i,j,k, nrws; +{ int i,j, nrws; //int maxNForACall = THREAD_BLOCK*handle->maxGridSizeX; int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX); diff --git a/cuda/d_cusparse_mod.F90 b/cuda/d_cusparse_mod.F90 index 509253e6..ae9bcceb 100644 --- a/cuda/d_cusparse_mod.F90 +++ b/cuda/d_cusparse_mod.F90 @@ -97,6 +97,7 @@ module d_cusparse_mod end function d_CSRGDeviceSetMatIndexBase end interface +#if CUDA_SHORT_VERSION <= 10 interface CSRGDeviceCsrsmAnalysis function d_CSRGDeviceCsrsmAnalysis(Mat) & & bind(c,name="d_CSRGDeviceCsrsmAnalysis") result(res) @@ -106,6 +107,7 @@ module d_cusparse_mod integer(c_int) :: res end function d_CSRGDeviceCsrsmAnalysis end interface +#endif interface CSRGDeviceAlloc function d_CSRGDeviceAlloc(Mat,nr,nc,nz) & diff --git a/cuda/fcusparse_fct.h b/cuda/fcusparse_fct.h index 5a3b1ac6..5afe410d 100644 --- a/cuda/fcusparse_fct.h +++ b/cuda/fcusparse_fct.h @@ -187,7 +187,7 @@ int 
T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, (void *) vY, CUSPARSE_BASE_TYPE, CUSPARSE_BASE_TYPE, (void *) cMat->mvbuffer)); -#else +#elif CUDA_VERSION <= 12030 cusparseDnVecDescr_t vecX, vecY; size_t bfsz; vX=x->v_; @@ -212,6 +212,8 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, cMat->mvbuffer)); CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) ); CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) ); +#else + fprintf(stderr,"Unsupported CUSPARSE version\n"); #endif } @@ -244,7 +246,7 @@ int T_spsvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, (const TYPE *) vX, (TYPE *) vY, CUSPARSE_SOLVE_POLICY_USE_LEVEL, (void *) cMat->svbuffer)); -#else +#elif CUDA_VERSION <= 12030 cusparseDnVecDescr_t vecX, vecY; size_t bfsz; vX=x->v_; @@ -285,6 +287,8 @@ int T_spsvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, *(cMat->spsvDescr))); CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) ); CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) ); +#else + fprintf(stderr,"Unsupported CUSPARSE version\n"); #endif } diff --git a/cuda/impl/psb_c_cuda_cp_csrg_from_fmt.F90 b/cuda/impl/psb_c_cuda_cp_csrg_from_fmt.F90 index 65b12a11..e9d42139 100644 --- a/cuda/impl/psb_c_cuda_cp_csrg_from_fmt.F90 +++ b/cuda/impl/psb_c_cuda_cp_csrg_from_fmt.F90 @@ -38,7 +38,7 @@ subroutine psb_c_cuda_cp_csrg_from_fmt(a,b,info) implicit none class(psb_c_cuda_csrg_sparse_mat), intent(inout) :: a - class(psb_c_base_sparse_mat), intent(inout) :: b + class(psb_c_base_sparse_mat), intent(in) :: b integer(psb_ipk_), intent(out) :: info diff --git a/cuda/impl/psb_d_cuda_cp_csrg_from_fmt.F90 b/cuda/impl/psb_d_cuda_cp_csrg_from_fmt.F90 index d030538e..8f8e8cbe 100644 --- a/cuda/impl/psb_d_cuda_cp_csrg_from_fmt.F90 +++ b/cuda/impl/psb_d_cuda_cp_csrg_from_fmt.F90 @@ -38,7 +38,7 @@ subroutine psb_d_cuda_cp_csrg_from_fmt(a,b,info) implicit none class(psb_d_cuda_csrg_sparse_mat), intent(inout) :: a - class(psb_d_base_sparse_mat), intent(inout) :: b + class(psb_d_base_sparse_mat), intent(in) :: b 
integer(psb_ipk_), intent(out) :: info diff --git a/cuda/impl/psb_d_cuda_csrg_to_gpu.F90 b/cuda/impl/psb_d_cuda_csrg_to_gpu.F90 index d1949421..16cb541d 100644 --- a/cuda/impl/psb_d_cuda_csrg_to_gpu.F90 +++ b/cuda/impl/psb_d_cuda_csrg_to_gpu.F90 @@ -227,7 +227,7 @@ subroutine psb_d_cuda_csrg_to_gpu(a,info,nzrm) endif -#else +#elif 0 if (a%is_unit()) then ! @@ -308,7 +308,85 @@ subroutine psb_d_cuda_csrg_to_gpu(a,info,nzrm) !!$ if ((info == 0) .and. a%is_triangle()) then !!$ info = CSRGDeviceCsrsmAnalysis(a%deviceMat) !!$ end if - +#else + + if (a%is_unit()) then + ! + ! CUSPARSE has the habit of storing the diagonal and then ignoring, + ! whereas we do not store it. Hence this adapter code. + ! + nzdi = nz + m + if (info == 0) info = CSRGDeviceAlloc(a%deviceMat,m,n,nzdi) + if (info == 0) then + if (a%is_unit()) then + info = CSRGDeviceSetMatDiagType(a%deviceMat,cusparse_diag_type_unit) + else + info = CSRGDeviceSetMatDiagType(a%deviceMat,cusparse_diag_type_non_unit) + end if + end if + !!! We are explicitly adding the diagonal + !! info = CSRGDeviceSetMatDiagType(a%deviceMat,cusparse_diag_type_non_unit) + if ((info == 0) .and. a%is_triangle()) then +!!$ info = CSRGDeviceSetMatType(a%deviceMat,cusparse_matrix_type_triangular) + if ((info == 0).and.a%is_upper()) then + info = CSRGDeviceSetMatFillMode(a%deviceMat,cusparse_fill_mode_upper) + else + info = CSRGDeviceSetMatFillMode(a%deviceMat,cusparse_fill_mode_lower) + end if + end if + if (info == 0) allocate(irpdi(m+1),jadi(nzdi),valdi(nzdi),stat=info) + if (info == 0) then + irpdi(1) = 1 + if (a%is_triangle().and.a%is_upper()) then + do i=1,m + j = irpdi(i) + jadi(j) = i + valdi(j) = done + nrz = a%irp(i+1)-a%irp(i) + jadi(j+1:j+nrz) = a%ja(a%irp(i):a%irp(i+1)-1) + valdi(j+1:j+nrz) = a%val(a%irp(i):a%irp(i+1)-1) + irpdi(i+1) = j + nrz + 1 + ! 
write(0,*) 'Row ',i,' : ',irpdi(i:i+1),':',jadi(j:j+nrz),valdi(j:j+nrz) + end do + else + do i=1,m + j = irpdi(i) + nrz = a%irp(i+1)-a%irp(i) + jadi(j+0:j+nrz-1) = a%ja(a%irp(i):a%irp(i+1)-1) + valdi(j+0:j+nrz-1) = a%val(a%irp(i):a%irp(i+1)-1) + jadi(j+nrz) = i + valdi(j+nrz) = done + irpdi(i+1) = j + nrz + 1 + ! write(0,*) 'Row ',i,' : ',irpdi(i:i+1),':',jadi(j:j+nrz),valdi(j:j+nrz) + end do + end if + end if + if (info == 0) info = CSRGHost2Device(a%deviceMat,m,n,nzdi,irpdi,jadi,valdi) + + else + + if (info == 0) info = CSRGDeviceAlloc(a%deviceMat,m,n,nz) + !info = CSRGDeviceSetMatType(a%deviceMat,cusparse_matrix_type_general) +!!$ if (info == 0) info = CSRGDeviceSetMatIndexBase(a%deviceMat,cusparse_index_base_one) +!!$ if (a%is_triangle()) then +!!$ if (info == 0) then +!!$ if (a%is_unit()) then +!!$ info = CSRGDeviceSetMatDiagType(a%deviceMat,cusparse_diag_type_unit) +!!$ else +!!$ info = CSRGDeviceSetMatDiagType(a%deviceMat,cusparse_diag_type_non_unit) +!!$ end if +!!$ end if +!!$ if ((info == 0) )then +!!$ if ((info == 0).and.a%is_upper()) then +!!$ info = CSRGDeviceSetMatFillMode(a%deviceMat,cusparse_fill_mode_upper) +!!$ else +!!$ info = CSRGDeviceSetMatFillMode(a%deviceMat,cusparse_fill_mode_lower) +!!$ end if +!!$ end if +!!$ end if + if (info == 0) info = CSRGHost2Device(a%deviceMat,m,n,nz,a%irp,a%ja,a%val) + endif + #endif call a%set_sync() diff --git a/cuda/impl/psb_s_cuda_cp_csrg_from_fmt.F90 b/cuda/impl/psb_s_cuda_cp_csrg_from_fmt.F90 index 29bbea6e..76871b59 100644 --- a/cuda/impl/psb_s_cuda_cp_csrg_from_fmt.F90 +++ b/cuda/impl/psb_s_cuda_cp_csrg_from_fmt.F90 @@ -38,7 +38,7 @@ subroutine psb_s_cuda_cp_csrg_from_fmt(a,b,info) implicit none class(psb_s_cuda_csrg_sparse_mat), intent(inout) :: a - class(psb_s_base_sparse_mat), intent(inout) :: b + class(psb_s_base_sparse_mat), intent(in) :: b integer(psb_ipk_), intent(out) :: info diff --git a/cuda/impl/psb_z_cuda_cp_csrg_from_fmt.F90 b/cuda/impl/psb_z_cuda_cp_csrg_from_fmt.F90 index 26490a15..e086c8a4 
100644 --- a/cuda/impl/psb_z_cuda_cp_csrg_from_fmt.F90 +++ b/cuda/impl/psb_z_cuda_cp_csrg_from_fmt.F90 @@ -38,7 +38,7 @@ subroutine psb_z_cuda_cp_csrg_from_fmt(a,b,info) implicit none class(psb_z_cuda_csrg_sparse_mat), intent(inout) :: a - class(psb_z_base_sparse_mat), intent(inout) :: b + class(psb_z_base_sparse_mat), intent(in) :: b integer(psb_ipk_), intent(out) :: info