diff --git a/base/modules/tools/psb_d_tools_mod.F90 b/base/modules/tools/psb_d_tools_mod.F90 index 72f2a1fe..b84b3095 100644 --- a/base/modules/tools/psb_d_tools_mod.F90 +++ b/base/modules/tools/psb_d_tools_mod.F90 @@ -69,7 +69,7 @@ Module psb_d_tools_mod subroutine psb_dalloc_multivect_r2(x, desc_a,info,m,n,lb, dupl, bldmode) import implicit none - type(psb_d_multivect_type), intent(out) :: x(:) + type(psb_d_multivect_type), allocatable, intent(out) :: x(:) type(psb_desc_type), intent(in) :: desc_a integer(psb_ipk_),intent(out) :: info integer(psb_ipk_), optional, intent(in) :: m, n, lb diff --git a/cuda/d_cusparse_mod.F90 b/cuda/d_cusparse_mod.F90 index 399ac085..d2ed65d8 100644 --- a/cuda/d_cusparse_mod.F90 +++ b/cuda/d_cusparse_mod.F90 @@ -166,6 +166,19 @@ module d_cusparse_mod integer(c_int) :: res end function d_spmvCSRGDevice end interface + + interface spmmCSRGDevice + function d_spmmCSRGDevice(Mat,alpha,x,beta,y) & + & bind(c,name="d_spmmCSRGDevice") result(res) + use iso_c_binding + import d_Cmat + type(d_Cmat) :: Mat + type(c_ptr), value :: x + type(c_ptr), value :: y + real(c_double), value :: alpha,beta + integer(c_int) :: res + end function d_spmmCSRGDevice + end interface interface CSRGHost2Device function d_CSRGHost2Device(Mat,m,n,nz,irp,ja,val) & diff --git a/cuda/dcusparse.c b/cuda/dcusparse.c index 657ca5be..30ee1dab 100644 --- a/cuda/dcusparse.c +++ b/cuda/dcusparse.c @@ -45,6 +45,7 @@ #define T_CSRGDeviceMat d_CSRGDeviceMat #define T_Cmat d_Cmat #define T_spmvCSRGDevice d_spmvCSRGDevice +#define T_spmmCSRGDevice d_spmmCSRGDevice #define T_spsvCSRGDevice d_spsvCSRGDevice #define T_CSRGDeviceAlloc d_CSRGDeviceAlloc #define T_CSRGDeviceFree d_CSRGDeviceFree diff --git a/cuda/dvectordev.c b/cuda/dvectordev.c index 9df0f625..384569e5 100644 --- a/cuda/dvectordev.c +++ b/cuda/dvectordev.c @@ -56,7 +56,6 @@ int writeMultiVecDeviceDouble(void* deviceVec, double* hostVec) int writeMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld) { int i; struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec; - double *hv, *dv; i = writeRemoteBufferR2((void*) hostVec, ld*sizeof(double), (void *)devVec->v_, (devVec->count_), sizeof(double)*(devVec->pitch_), (devVec->size_)*sizeof(double)); @@ -81,7 +80,6 @@ int readMultiVecDeviceDouble(void* deviceVec, double* hostVec) int readMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld) { int i; - double *hv, *dv; struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec; i = readRemoteBufferR2((void *) hostVec, ld*sizeof(double), diff --git a/cuda/fcusparse_fct.h b/cuda/fcusparse_fct.h index 4fabdd40..c28e579a 100644 --- a/cuda/fcusparse_fct.h +++ b/cuda/fcusparse_fct.h @@ -137,10 +137,8 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX; struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; void *vX, *vY; - int j,r,n; - int pitch; + int r,n; cusparseHandle_t *my_handle=getHandle(); - pitch = y->pitch_; TYPE ealpha=alpha, ebeta=beta; #if CUDA_SHORT_VERSION <= 10 /* getAddrMultiVecDevice(deviceX, &vX); */ @@ -198,73 +196,115 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, CUSPARSE_BASE_TYPE, (void *) cMat->mvbuffer)); #else -// cusparseDnVecDescr_t vecX, vecY; - cusparseDnMatDescr_t vecX, vecY; + cusparseDnVecDescr_t vecX, vecY; size_t bfsz; if (T_CSRGIsNullMvDescr(cMat)) { cMat->spmvDescr = (cusparseSpMatDescr_t *) malloc(sizeof(cusparseSpMatDescr_t *)); } T_CSRGCreateSpMVDescr(cMat); -// vX=x->v_; -// vY=y->v_; -// fprintf(stderr,"CUDA ENTERED %p %d %d %d %d %d\n", vX, pitch, y->size_, x->count_, alpha, beta); -// CHECK_CUSPARSE(cusparseCreateDnMat(&vecX, cMat->n, x->count_, pitch, vX, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL)); -// CHECK_CUSPARSE(cusparseCreateDnMat(&vecY, cMat->m, y->count_, pitch, vY, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL)); -// CHECK_CUSPARSE(cusparseSpMM_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE, -// CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha, -// (*(cMat->spmvDescr)),vecX,&beta,vecY, -// CUSPARSE_BASE_TYPE,CUSPARSE_SPMM_ALG_DEFAULT, -// &bfsz)); -// if (bfsz > cMat->mvbsize) { -// if (cMat->mvbuffer != NULL) { -// CHECK_CUDA(cudaFree(cMat->mvbuffer)); -// cMat->mvbuffer = NULL; -// } -// //CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz)); -// allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz); + vX=x->v_; + vY=y->v_; + CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, cMat->m, vY, CUSPARSE_BASE_TYPE) ); + CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, cMat->n, vX, CUSPARSE_BASE_TYPE) ); + CHECK_CUSPARSE(cusparseSpMV_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY, + CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT, + &bfsz)); + if (bfsz > cMat->mvbsize) { + if (cMat->mvbuffer != NULL) { + CHECK_CUDA(cudaFree(cMat->mvbuffer)); + cMat->mvbuffer = NULL; + } + //CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz)); + allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz); -// cMat->mvbsize = bfsz; -// } - -// CHECK_CUSPARSE(cusparseSpMM(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE, -// CUSPARSE_OPERATION_NON_TRANSPOSE, -// &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,CUSPARSE_BASE_TYPE, -// CUSPARSE_SPMM_ALG_DEFAULT,cMat->mvbuffer)); -// CHECK_CUSPARSE(cusparseDestroyDnMat(vecX)); -// CHECK_CUSPARSE(cusparseDestroyDnMat(vecY)); + cMat->mvbsize = bfsz; + } + CHECK_CUSPARSE(cusparseSpMV(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY, + CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT, + cMat->mvbuffer)); + CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) ); + CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) ); + CHECK_CUSPARSE(cusparseDestroySpMat(*(cMat->spmvDescr))); +#endif +} + +int T_spmmCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, + TYPE beta, void *deviceY) +{ + T_CSRGDeviceMat *cMat=Matrix->mat; + struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX; + struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; + void *vX, *vY; + int j,r,n; + cusparseHandle_t *my_handle=getHandle(); + TYPE ealpha=alpha, ebeta=beta; + cusparseDnMatDescr_t vecX, vecY; + size_t bfsz; + + if (T_CSRGIsNullMvDescr(cMat)) { + cMat->spmvDescr = (cusparseSpMatDescr_t *) malloc(sizeof(cusparseSpMatDescr_t *)); + } + T_CSRGCreateSpMVDescr(cMat); + // vX=x->v_; + // vY=y->v_; + // fprintf(stderr,"CUDA ENTERED %p %d %d %d %d %d\n", vX, pitch, y->size_, x->count_, alpha, beta); + // CHECK_CUSPARSE(cusparseCreateDnMat(&vecX, cMat->n, x->count_, pitch, vX, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL)); + // CHECK_CUSPARSE(cusparseCreateDnMat(&vecY, cMat->m, y->count_, pitch, vY, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL)); + // CHECK_CUSPARSE(cusparseSpMM_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE, + // CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha, + // (*(cMat->spmvDescr)),vecX,&beta,vecY, + // CUSPARSE_BASE_TYPE,CUSPARSE_SPMM_ALG_DEFAULT, + // &bfsz)); + // if (bfsz > cMat->mvbsize) { + // if (cMat->mvbuffer != NULL) { + // CHECK_CUDA(cudaFree(cMat->mvbuffer)); + // cMat->mvbuffer = NULL; + // } + // //CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz)); + // allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz); + + // cMat->mvbsize = bfsz; + // } + + // CHECK_CUSPARSE(cusparseSpMM(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE, + // CUSPARSE_OPERATION_NON_TRANSPOSE, + // &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,CUSPARSE_BASE_TYPE, + // CUSPARSE_SPMM_ALG_DEFAULT,cMat->mvbuffer)); + // CHECK_CUSPARSE(cusparseDestroyDnMat(vecX)); + // CHECK_CUSPARSE(cusparseDestroyDnMat(vecY)); for(j=0;jcount_;j++) { - vX=x->v_+pitch*j*sizeof(TYPE); - vY=y->v_+pitch*j*sizeof(TYPE); - fprintf(stderr,"CUDA ENTERED 1 %d %p %p %d %d %d %d\n",j, vX, vY, pitch, y->size_, cMat->m, cMat->n); + vX=(x->v_)+(x->pitch_)*j*sizeof(TYPE); + vY=(y->v_)+(y->pitch_)*j*sizeof(TYPE); + // fprintf(stderr,"CUDA ENTERED 1 %d %p %p %d %d %d %d\n",j, vX, vY, pitch, y->size_, cMat->m, cMat->n); CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, cMat->m, vY, CUSPARSE_BASE_TYPE) ); CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, cMat->n, vX, CUSPARSE_BASE_TYPE) ); CHECK_CUSPARSE(cusparseSpMV_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY, - CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT, - &bfsz)); + &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY, + CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT, + &bfsz)); + if (bfsz > cMat->mvbsize) { - if (cMat->mvbuffer != NULL) { - CHECK_CUDA(cudaFree(cMat->mvbuffer)); - cMat->mvbuffer = NULL; - } - //CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz)); - allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz); - - cMat->mvbsize = bfsz; + if (cMat->mvbuffer != NULL) { + CHECK_CUDA(cudaFree(cMat->mvbuffer)); + cMat->mvbuffer = NULL; + } + allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz); + cMat->mvbsize = bfsz; } CHECK_CUSPARSE(cusparseSpMV(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY, - CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT, - cMat->mvbuffer)); + &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY, + CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT, + cMat->mvbuffer)); //fprintf(stderr,"CUDA ENTERED 2 %d %p %p %d %d %d %d\n",j, vX, vY, *((double*)vX), *((double*)vY), pitch, y->size_); CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) ); CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) ); } - CHECK_CUSPARSE(cusparseDestroySpMat(*(cMat->spmvDescr))); -#endif + return(0); } - + int T_spsvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, TYPE beta, void *deviceY) { diff --git a/cuda/impl/psb_d_cuda_csrg_csmm.F90 b/cuda/impl/psb_d_cuda_csrg_csmm.F90 index ddac1373..4515eb31 100644 --- a/cuda/impl/psb_d_cuda_csrg_csmm.F90 +++ b/cuda/impl/psb_d_cuda_csrg_csmm.F90 @@ -100,16 +100,16 @@ subroutine psb_d_cuda_csrg_csmm(alpha,a,x,beta,y,info,trans) if (info == 0) & & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_double) if (info == 0) & - & info = writeMultiVecDevice(gpX,x,nxy) + & info = writeMultiVecDevice(gpX,x,size(x,1)) if (info == 0) & & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_double) if (info == 0) & - & info = writeMultiVecDevice(gpY,y,nxy) + & info = writeMultiVecDevice(gpY,y,size(y,1)) if (info == 0) & & info = spmvCSRGDevice(a%deviceMat,alpha,gpX,beta,gpY) if (info == 0) & - & info = readMultiVecDevice(gpY,y,nxy) + & info = readMultiVecDevice(gpY,y,size(y,1)) if (info /= 0) goto 9999 call freeMultiVecDevice(gpX) call freeMultiVecDevice(gpY) diff --git a/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 b/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 index a62e1d0f..03526be8 100644 --- a/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 +++ b/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 @@ -172,13 +172,11 @@ subroutine psb_d_cuda_csrg_multivect_mv(alpha,a,x,beta,y,info,trans) if (beta /= dzero) then if (yy%is_host()) call yy%sync() end if - ! TODO - write(*,*) 'AAAAAAAAA' - info = spmvCSRGDevice(a%deviceMat,alpha,xx%deviceVect,& + info = spmmCSRGDevice(a%deviceMat,alpha,xx%deviceVect,& & beta,yy%deviceVect) if (info /= 0) then call psb_errpush(psb_err_from_subroutine_ai_,name,& - & a_err='spmvCSRGDevice',i_err=(/info,izero,izero,izero,izero/)) + & a_err='spmmCSRGDevice',i_err=(/info,izero,izero,izero,izero/)) info = psb_err_from_subroutine_ai_ goto 9999 end if diff --git a/cuda/vectordev.c b/cuda/vectordev.c index f947a73a..22039b5c 100644 --- a/cuda/vectordev.c +++ b/cuda/vectordev.c @@ -165,7 +165,7 @@ int FallocMultiVecDevice(void** deviceMultiVec, unsigned int count, p = getMultiVectorDeviceParams(count, size, elementType); i = allocMultiVecDevice(deviceMultiVec, &p); - fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size); + //fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size); //cudaSync(); if (i != 0) { fprintf(stderr,"From routine : %s : %d, %d %d \n","FallocMultiVecDevice",i, count, size); diff --git a/test/block_krylov/kernel/dpdegenmm.F90 b/test/block_krylov/kernel/dpdegenmm.F90 index 44c24365..2fdc3aa1 100644 --- a/test/block_krylov/kernel/dpdegenmm.F90 +++ b/test/block_krylov/kernel/dpdegenmm.F90 @@ -624,7 +624,7 @@ program pdegenmm character(len=20) :: name,ch_err character(len=40) :: fname - real(psb_dpk_), allocatable :: test(:,:), test1(:,:), test2(:,:) + real(psb_dpk_), allocatable :: test(:,:), test1(:,:), test2(:) type(c_ptr) :: gpx, gpy @@ -789,7 +789,6 @@ program pdegenmm #endif end do - call x_mv%set(done) call psb_barrier(ctxt) t1 = psb_wtime() do i=1,ntests @@ -805,6 +804,7 @@ program pdegenmm ! FIXME: cache flush needed here x1 = b_mv%get_vect() x2 = b_mv_g%get_vect() + write(*,*) do i=1,8 write(*,*) x1(i,:) end do @@ -818,9 +818,6 @@ program pdegenmm ! call psb_geasb(bg,desc_a,info,mold=tmold) ! call bg%set(done+done) -! call psb_spmm(done,agpu,xg,dzero,bg,desc_a,info) -! call psb_cuda_DeviceSync() - ! ! TODO: Non funziona spgpuDaxpby (axpbyMultiVecDeviceDouble) ! call psb_geaxpby(done,xg,dzero,bg,desc_a,info) ! call psb_cuda_DeviceSync() @@ -835,35 +832,10 @@ program pdegenmm ! return - - ! TODO Test NRM2 AMAX -! call b_mv_g%set(done) -! test = psb_genrm2(b_mv_g,desc_a,info) -! write(*,*) 'AMAX ', psb_geamax(b_mv_g,desc_a,info) -! do i=1,nrhs -! write(*,*) test(i) -! end do - - ! TODO SpMM da fare con vettori GPU su mod csrg - ! TODO SpMM da fare a parte dopo - ! TODO Da cambiare WRITE READ R2 devono usare Memcopy 2D - ! TODO Test DDOT -! call x_mv_g%set(done) -! call b_mv_g%set(done+done) -! test = psb_gedot(b_mv_g,test2,desc_a,info) -! write(*,*) 'SIZE ', size(test,1), size(test,2) -! do i=1,size(test,1) -! write(*,*) test(i,:) -! end do - - write(*,*) 'TEST' - call x_mv_g%set(done) - call x_mv_g%sync() - call psb_barrier(ctxt) tt1 = psb_wtime() do i=1,ntests - call psb_spmm(done,agpu,x_mv_g,dzero,b_mv_g,desc_a,info) + call psb_spmm(done,agpu,x_mv,dzero,b_mv_g,desc_a,info) if ((info /= 0).or.(psb_get_errstatus()/=0)) then write(0,*) 'From 1 spmm',info,i,ntests call psb_error() @@ -876,18 +848,15 @@ program pdegenmm call psb_amx(ctxt,tt2) x1 = b_mv%get_vect() x2 = b_mv_g%get_vect() - write(*,*) 'MHANZ ', b_mv_g%get_nrows(), size(b_mv_g%v%v,1) - write(*,*) 'X1 ', x1(1,:), ' X2 ', x2(1,:) - do i=1,size(b_mv_g%v%v,1) - write(*,*) b_mv_g%v%v(i,:) + write(*,*) + do i=1,size(x2,1) + write(*,*) x2(i,:) end do nr = desc_a%get_local_rows() eps = maxval(abs(x1(1:nr,1:nrhs)-x2(1:nr,1:nrhs))) call psb_amx(ctxt,eps) if (iam==0) write(*,*) 'Max diff on xGPU',eps - return - ! FIXME: cache flush needed here call x_mv_g%set(x0) call x_mv_g%sync() @@ -910,12 +879,13 @@ program pdegenmm call b_mv_g%sync() x1 = b_mv%get_vect() x2 = b_mv_g%get_vect() - write(*,*) 'X1 ', x1(1,:), ' X2 ', x2(1,:) + write(*,*) + do i=1,size(x2,1) + write(*,*) x2(i,:) + end do call psb_geaxpby(-done,b_mv_g,+done,b_mv,desc_a,info) eps = psb_geamax(b_mv,desc_a,info) - return - call psb_amx(ctxt,t2) eps = maxval(abs(x1(1:nr,1:nrhs)-x2(1:nr,1:nrhs))) call psb_amx(ctxt,eps)