From 409b51e609a6340f20e986064af0bf76f5fa095d Mon Sep 17 00:00:00 2001 From: gabrielequatrana Date: Tue, 9 Apr 2024 23:21:17 +0200 Subject: [PATCH] Try to fix SpMM --- base/modules/tools/psb_d_tools_mod.F90 | 2 +- cuda/cuda_util.c | 4 ++-- cuda/dvectordev.c | 4 ++-- cuda/fcusparse_fct.h | 7 +++---- cuda/impl/psb_d_cuda_csrg_csmm.F90 | 6 +++--- cuda/impl/psb_d_cuda_csrg_vect_mv.F90 | 1 - cuda/vectordev.c | 2 +- test/block_krylov/kernel/dpdegenmm.F90 | 22 +++++++++++++++++++--- 8 files changed, 31 insertions(+), 17 deletions(-) diff --git a/base/modules/tools/psb_d_tools_mod.F90 b/base/modules/tools/psb_d_tools_mod.F90 index 72f2a1fe..b84b3095 100644 --- a/base/modules/tools/psb_d_tools_mod.F90 +++ b/base/modules/tools/psb_d_tools_mod.F90 @@ -69,7 +69,7 @@ Module psb_d_tools_mod subroutine psb_dalloc_multivect_r2(x, desc_a,info,m,n,lb, dupl, bldmode) import implicit none - type(psb_d_multivect_type), intent(out) :: x(:) + type(psb_d_multivect_type), allocatable, intent(out) :: x(:) type(psb_desc_type), intent(in) :: desc_a integer(psb_ipk_),intent(out) :: info integer(psb_ipk_), optional, intent(in) :: m, n, lb diff --git a/cuda/cuda_util.c b/cuda/cuda_util.c index c66fb96f..de3e6686 100644 --- a/cuda/cuda_util.c +++ b/cuda/cuda_util.c @@ -184,7 +184,7 @@ int writeRemoteBuffer(void* hostSrc, void* buffer, int count) // TODO int writeRemoteBufferR2(void* hostSrc, void* buffer, int count, int pitch, int size) { - cudaError_t err = cudaMemcpy2D(buffer, pitch, hostSrc, count, count, size, cudaMemcpyHostToDevice); + cudaError_t err = cudaMemcpy2D(buffer, pitch, hostSrc, size, size, count, cudaMemcpyHostToDevice); if (err == cudaSuccess) return SPGPU_SUCCESS; @@ -223,7 +223,7 @@ int readRemoteBuffer(void* hostDest, void* buffer, int count) // TODO sistemare pitch e size (si possono gestire senza realloc su fortran) int readRemoteBufferR2(void* hostDest, void* buffer, int count, int pitch, int size) { - cudaError_t err = cudaMemcpy2D(hostDest, count, buffer, pitch, count, size, cudaMemcpyDeviceToHost); + cudaError_t err = cudaMemcpy2D(hostDest, size, buffer, pitch, size, count, cudaMemcpyDeviceToHost); if (err == cudaSuccess) return SPGPU_SUCCESS; diff --git a/cuda/dvectordev.c b/cuda/dvectordev.c index 28b9055a..27d13a3c 100644 --- a/cuda/dvectordev.c +++ b/cuda/dvectordev.c @@ -56,7 +56,7 @@ int writeMultiVecDeviceDouble(void* deviceVec, double* hostVec) int writeMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld) { int i; struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec; - i = writeRemoteBufferR2((void*) hostVec, (void *)devVec->v_, devVec->count_*sizeof(double), devVec->pitch_, devVec->size_); + i = writeRemoteBufferR2((void*) hostVec, (void *)devVec->v_, devVec->count_, devVec->pitch_*sizeof(double), devVec->size_*sizeof(double)); // i = writeMultiVecDeviceDouble(deviceVec, (void *) hostVec); fprintf(stderr,"From routine : %s : %p %p\n","writeMultiVecDeviceDoubleR2",devVec->v_,devVec->v_+devVec->pitch_); if (i != 0) { @@ -79,7 +79,7 @@ int readMultiVecDeviceDouble(void* deviceVec, double* hostVec) int readMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld) { int i; struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec; - i = readRemoteBufferR2((void *) hostVec, (void *)devVec->v_, devVec->count_*sizeof(double), devVec->pitch_, devVec->size_); + i = readRemoteBufferR2((void *) hostVec, (void *)devVec->v_, devVec->count_, devVec->pitch_*sizeof(double), devVec->size_*sizeof(double)); // i = readMultiVecDeviceDouble(deviceVec, hostVec); fprintf(stderr,"From routine : %s : %p \n","readMultiVecDeviceDoubleR2",devVec->v_); if (i != 0) { diff --git a/cuda/fcusparse_fct.h b/cuda/fcusparse_fct.h index 4d901134..118cbd7c 100644 --- a/cuda/fcusparse_fct.h +++ b/cuda/fcusparse_fct.h @@ -209,8 +209,8 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, // vX=x->v_; // vY=y->v_; // fprintf(stderr,"CUDA ENTERED %p %d %d %d %d %d\n", vX, pitch, y->size_, x->count_, alpha, beta); -// CHECK_CUSPARSE(cusparseCreateDnMat(&vecX, cMat->n, x->count_, pitch, vX, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL)); -// CHECK_CUSPARSE(cusparseCreateDnMat(&vecY, cMat->m, y->count_, pitch, vY, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL)); +// CHECK_CUSPARSE(cusparseCreateDnMat(&vecX, cMat->n, x->count_, y->size_, vX, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL)); +// CHECK_CUSPARSE(cusparseCreateDnMat(&vecY, cMat->m, y->count_, y->size_, vY, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL)); // CHECK_CUSPARSE(cusparseSpMM_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE, // CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha, // (*(cMat->spmvDescr)),vecX,&beta,vecY, @@ -236,7 +236,7 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, for(j=0;jcount_;j++) { vX=x->v_+pitch*j; vY=y->v_+pitch*j; - fprintf(stderr,"CUDA ENTERED 1 %d %p %p %d %d %d %d\n",j, vX, vY, pitch, y->size_, cMat->m, cMat->n); + fprintf(stderr,"CUDA ENTERED %d %p %p %d %d\n",j, vX, vY, pitch, y->size_); CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, cMat->m, vY, CUSPARSE_BASE_TYPE) ); CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, cMat->n, vX, CUSPARSE_BASE_TYPE) ); CHECK_CUSPARSE(cusparseSpMV_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -257,7 +257,6 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY, CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT, cMat->mvbuffer)); - fprintf(stderr,"CUDA ENTERED 2 %d %p %p %d %d %d %d\n",j, vX, vY, *((double*)vX), *((double*)vY), pitch, y->size_); CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) ); CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) ); } diff --git a/cuda/impl/psb_d_cuda_csrg_csmm.F90 b/cuda/impl/psb_d_cuda_csrg_csmm.F90 index ddac1373..4515eb31 100644 --- a/cuda/impl/psb_d_cuda_csrg_csmm.F90 +++ b/cuda/impl/psb_d_cuda_csrg_csmm.F90 @@ -100,16 +100,16 @@ subroutine psb_d_cuda_csrg_csmm(alpha,a,x,beta,y,info,trans) if (info == 0) & & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_double) if (info == 0) & - & info = writeMultiVecDevice(gpX,x,nxy) + & info = writeMultiVecDevice(gpX,x,size(x,1)) if (info == 0) & & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_double) if (info == 0) & - & info = writeMultiVecDevice(gpY,y,nxy) + & info = writeMultiVecDevice(gpY,y,size(y,1)) if (info == 0) & & info = spmvCSRGDevice(a%deviceMat,alpha,gpX,beta,gpY) if (info == 0) & - & info = readMultiVecDevice(gpY,y,nxy) + & info = readMultiVecDevice(gpY,y,size(y,1)) if (info /= 0) goto 9999 call freeMultiVecDevice(gpX) call freeMultiVecDevice(gpY) diff --git a/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 b/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 index a62e1d0f..b4a7ea53 100644 --- a/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 +++ b/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 @@ -173,7 +173,6 @@ subroutine psb_d_cuda_csrg_multivect_mv(alpha,a,x,beta,y,info,trans) if (yy%is_host()) call yy%sync() end if ! TODO - write(*,*) 'AAAAAAAAA' info = spmvCSRGDevice(a%deviceMat,alpha,xx%deviceVect,& & beta,yy%deviceVect) if (info /= 0) then diff --git a/cuda/vectordev.c b/cuda/vectordev.c index f947a73a..22039b5c 100644 --- a/cuda/vectordev.c +++ b/cuda/vectordev.c @@ -165,7 +165,7 @@ int FallocMultiVecDevice(void** deviceMultiVec, unsigned int count, p = getMultiVectorDeviceParams(count, size, elementType); i = allocMultiVecDevice(deviceMultiVec, &p); - fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size); + //fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size); //cudaSync(); if (i != 0) { fprintf(stderr,"From routine : %s : %d, %d %d \n","FallocMultiVecDevice",i, count, size); diff --git a/test/block_krylov/kernel/dpdegenmm.F90 b/test/block_krylov/kernel/dpdegenmm.F90 index 44c24365..98b1f2c8 100644 --- a/test/block_krylov/kernel/dpdegenmm.F90 +++ b/test/block_krylov/kernel/dpdegenmm.F90 @@ -624,7 +624,7 @@ program pdegenmm character(len=20) :: name,ch_err character(len=40) :: fname - real(psb_dpk_), allocatable :: test(:,:), test1(:,:), test2(:,:) + real(psb_dpk_), allocatable :: test(:,:), test1(:,:), test2(:) type(c_ptr) :: gpx, gpy @@ -856,10 +856,27 @@ program pdegenmm ! write(*,*) test(i,:) ! end do - write(*,*) 'TEST' +! TODO +! allocate(test(8,2),test1(8,2),test2(8)) +! do i=1,size(test,1) +! test(i,:) = i*done +! end do +! info = FallocMultiVecDevice(gpx,nrhs,size(test,1),spgpu_type_double) +! info = writeMultiVecDevice(gpx,test,size(test,1)) +! !info = FallocMultiVecDevice(gpy,nrhs,size(test1,1),spgpu_type_double) +! info = readMultiVecDevice(gpx,test1,size(test1,1)) + +! do i=1,size(test1,1) +! write(*,*) test1(i,:) +! end do + +! return + call x_mv_g%set(done) call x_mv_g%sync() + call b_mv_g%set(done) + call b_mv_g%sync() call psb_barrier(ctxt) tt1 = psb_wtime() do i=1,ntests @@ -876,7 +893,6 @@ program pdegenmm call psb_amx(ctxt,tt2) x1 = b_mv%get_vect() x2 = b_mv_g%get_vect() - write(*,*) 'MHANZ ', b_mv_g%get_nrows(), size(b_mv_g%v%v,1) write(*,*) 'X1 ', x1(1,:), ' X2 ', x2(1,:) do i=1,size(b_mv_g%v%v,1) write(*,*) b_mv_g%v%v(i,:)