Fixed SpMM for CSRG

psblas-bgmres
gabrielequatrana 10 months ago
parent 409b51e609
commit 82715fec9b

@ -166,6 +166,19 @@ module d_cusparse_mod
integer(c_int) :: res
end function d_spmvCSRGDevice
end interface
interface spmmCSRGDevice
function d_spmmCSRGDevice(Mat,alpha,x,beta,y) &
& bind(c,name="d_spmmCSRGDevice") result(res)
use iso_c_binding
import d_Cmat
type(d_Cmat) :: Mat
type(c_ptr), value :: x
type(c_ptr), value :: y
real(c_double), value :: alpha,beta
integer(c_int) :: res
end function d_spmmCSRGDevice
end interface
interface CSRGHost2Device
function d_CSRGHost2Device(Mat,m,n,nz,irp,ja,val) &

@ -45,6 +45,7 @@
#define T_CSRGDeviceMat d_CSRGDeviceMat
#define T_Cmat d_Cmat
#define T_spmvCSRGDevice d_spmvCSRGDevice
#define T_spmmCSRGDevice d_spmmCSRGDevice
#define T_spsvCSRGDevice d_spsvCSRGDevice
#define T_CSRGDeviceAlloc d_CSRGDeviceAlloc
#define T_CSRGDeviceFree d_CSRGDeviceFree

@ -57,8 +57,6 @@ int writeMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld)
{ int i;
struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec;
i = writeRemoteBufferR2((void*) hostVec, (void *)devVec->v_, devVec->count_, devVec->pitch_*sizeof(double), devVec->size_*sizeof(double));
// i = writeMultiVecDeviceDouble(deviceVec, (void *) hostVec);
fprintf(stderr,"From routine : %s : %p %p\n","writeMultiVecDeviceDoubleR2",devVec->v_,devVec->v_+devVec->pitch_);
if (i != 0) {
fprintf(stderr,"From routine : %s : %d \n","writeMultiVecDeviceDoubleR2",i);
}
@ -80,8 +78,6 @@ int readMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld)
{ int i;
struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec;
i = readRemoteBufferR2((void *) hostVec, (void *)devVec->v_, devVec->count_, devVec->pitch_*sizeof(double), devVec->size_*sizeof(double));
// i = readMultiVecDeviceDouble(deviceVec, hostVec);
fprintf(stderr,"From routine : %s : %p \n","readMultiVecDeviceDoubleR2",devVec->v_);
if (i != 0) {
fprintf(stderr,"From routine : %s : %d \n","readMultiVecDeviceDoubleR2",i);
}

@ -137,10 +137,8 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
void *vX, *vY;
int j,r,n;
int pitch;
int r,n;
cusparseHandle_t *my_handle=getHandle();
pitch = y->pitch_;
TYPE ealpha=alpha, ebeta=beta;
#if CUDA_SHORT_VERSION <= 10
/* getAddrMultiVecDevice(deviceX, &vX); */
@ -198,72 +196,86 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
CUSPARSE_BASE_TYPE, (void *) cMat->mvbuffer));
#else
// cusparseDnVecDescr_t vecX, vecY;
cusparseDnMatDescr_t vecX, vecY;
cusparseDnVecDescr_t vecX, vecY;
size_t bfsz;
if (T_CSRGIsNullMvDescr(cMat)) {
cMat->spmvDescr = (cusparseSpMatDescr_t *) malloc(sizeof(cusparseSpMatDescr_t *));
}
T_CSRGCreateSpMVDescr(cMat);
// vX=x->v_;
// vY=y->v_;
// fprintf(stderr,"CUDA ENTERED %p %d %d %d %d %d\n", vX, pitch, y->size_, x->count_, alpha, beta);
// CHECK_CUSPARSE(cusparseCreateDnMat(&vecX, cMat->n, x->count_, y->size_, vX, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
// CHECK_CUSPARSE(cusparseCreateDnMat(&vecY, cMat->m, y->count_, y->size_, vY, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
// CHECK_CUSPARSE(cusparseSpMM_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
// CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,
// (*(cMat->spmvDescr)),vecX,&beta,vecY,
// CUSPARSE_BASE_TYPE,CUSPARSE_SPMM_ALG_DEFAULT,
// &bfsz));
// if (bfsz > cMat->mvbsize) {
// if (cMat->mvbuffer != NULL) {
// CHECK_CUDA(cudaFree(cMat->mvbuffer));
// cMat->mvbuffer = NULL;
// }
// //CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz));
// allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz);
// cMat->mvbsize = bfsz;
// }
// CHECK_CUSPARSE(cusparseSpMM(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
// CUSPARSE_OPERATION_NON_TRANSPOSE,
// &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,CUSPARSE_BASE_TYPE,
// CUSPARSE_SPMM_ALG_DEFAULT,cMat->mvbuffer));
// CHECK_CUSPARSE(cusparseDestroyDnMat(vecX));
// CHECK_CUSPARSE(cusparseDestroyDnMat(vecY));
for(j=0;j<y->count_;j++) {
vX=x->v_+pitch*j;
vY=y->v_+pitch*j;
fprintf(stderr,"CUDA ENTERED %d %p %p %d %d\n",j, vX, vY, pitch, y->size_);
CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, cMat->m, vY, CUSPARSE_BASE_TYPE) );
CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, cMat->n, vX, CUSPARSE_BASE_TYPE) );
CHECK_CUSPARSE(cusparseSpMV_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
vX=x->v_;
vY=y->v_;
CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, cMat->m, vY, CUSPARSE_BASE_TYPE) );
CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, cMat->n, vX, CUSPARSE_BASE_TYPE) );
CHECK_CUSPARSE(cusparseSpMV_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,
CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT,
&bfsz));
if (bfsz > cMat->mvbsize) {
if (cMat->mvbuffer != NULL) {
CHECK_CUDA(cudaFree(cMat->mvbuffer));
cMat->mvbuffer = NULL;
}
//CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz));
allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz);
cMat->mvbsize = bfsz;
if (bfsz > cMat->mvbsize) {
if (cMat->mvbuffer != NULL) {
CHECK_CUDA(cudaFree(cMat->mvbuffer));
cMat->mvbuffer = NULL;
}
CHECK_CUSPARSE(cusparseSpMV(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
//CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz));
allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz);
cMat->mvbsize = bfsz;
}
CHECK_CUSPARSE(cusparseSpMV(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,
CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT,
cMat->mvbuffer));
CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) );
CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) );
}
CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) );
CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) );
CHECK_CUSPARSE(cusparseDestroySpMat(*(cMat->spmvDescr)));
#endif
}
int T_spmmCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
TYPE beta, void *deviceY)
{
T_CSRGDeviceMat *cMat=Matrix->mat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
void *vX, *vY;
int j,r,n;
cusparseHandle_t *my_handle=getHandle();
TYPE ealpha=alpha, ebeta=beta;
cusparseDnMatDescr_t vecX, vecY;
size_t bfsz;
if (T_CSRGIsNullMvDescr(cMat)) {
cMat->spmvDescr = (cusparseSpMatDescr_t *) malloc(sizeof(cusparseSpMatDescr_t *));
}
T_CSRGCreateSpMVDescr(cMat);
vX=x->v_;
vY=y->v_;
CHECK_CUSPARSE(cusparseCreateDnMat(&vecX, cMat->n, x->count_, x->pitch_, vX, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
CHECK_CUSPARSE(cusparseCreateDnMat(&vecY, cMat->m, y->count_, y->pitch_, vY, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
CHECK_CUSPARSE(cusparseSpMM_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,
(*(cMat->spmvDescr)),vecX,&beta,vecY,
CUSPARSE_BASE_TYPE,CUSPARSE_SPMM_ALG_DEFAULT,
&bfsz));
if (bfsz > cMat->mvbsize) {
if (cMat->mvbuffer != NULL) {
CHECK_CUDA(cudaFree(cMat->mvbuffer));
cMat->mvbuffer = NULL;
}
allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz);
cMat->mvbsize = bfsz;
}
CHECK_CUSPARSE(cusparseSpMM(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,CUSPARSE_BASE_TYPE,
CUSPARSE_SPMM_ALG_DEFAULT,cMat->mvbuffer));
CHECK_CUSPARSE(cusparseDestroyDnMat(vecX));
CHECK_CUSPARSE(cusparseDestroyDnMat(vecY));
CHECK_CUSPARSE(cusparseDestroySpMat(*(cMat->spmvDescr)));
}
int T_spsvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
TYPE beta, void *deviceY)
{

@ -172,12 +172,11 @@ subroutine psb_d_cuda_csrg_multivect_mv(alpha,a,x,beta,y,info,trans)
if (beta /= dzero) then
if (yy%is_host()) call yy%sync()
end if
! TODO
info = spmvCSRGDevice(a%deviceMat,alpha,xx%deviceVect,&
info = spmmCSRGDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmvCSRGDevice',i_err=(/info,izero,izero,izero,izero/))
& a_err='spmmCSRGDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if

@ -789,7 +789,6 @@ program pdegenmm
#endif
end do
call x_mv%set(done)
call psb_barrier(ctxt)
t1 = psb_wtime()
do i=1,ntests
@ -805,6 +804,7 @@ program pdegenmm
! FIXME: cache flush needed here
x1 = b_mv%get_vect()
x2 = b_mv_g%get_vect()
write(*,*)
do i=1,8
write(*,*) x1(i,:)
end do
@ -818,9 +818,6 @@ program pdegenmm
! call psb_geasb(bg,desc_a,info,mold=tmold)
! call bg%set(done+done)
! call psb_spmm(done,agpu,xg,dzero,bg,desc_a,info)
! call psb_cuda_DeviceSync()
! ! TODO: Non funziona spgpuDaxpby (axpbyMultiVecDeviceDouble)
! call psb_geaxpby(done,xg,dzero,bg,desc_a,info)
! call psb_cuda_DeviceSync()
@ -835,52 +832,10 @@ program pdegenmm
! return
! TODO Test NRM2 AMAX
! call b_mv_g%set(done)
! test = psb_genrm2(b_mv_g,desc_a,info)
! write(*,*) 'AMAX ', psb_geamax(b_mv_g,desc_a,info)
! do i=1,nrhs
! write(*,*) test(i)
! end do
! TODO SpMM da fare con vettori GPU su mod csrg
! TODO SpMM da fare a parte dopo
! TODO Da cambiare WRITE READ R2 devono usare Memcopy 2D
! TODO Test DDOT
! call x_mv_g%set(done)
! call b_mv_g%set(done+done)
! test = psb_gedot(b_mv_g,test2,desc_a,info)
! write(*,*) 'SIZE ', size(test,1), size(test,2)
! do i=1,size(test,1)
! write(*,*) test(i,:)
! end do
! TODO
! allocate(test(8,2),test1(8,2),test2(8))
! do i=1,size(test,1)
! test(i,:) = i*done
! end do
! info = FallocMultiVecDevice(gpx,nrhs,size(test,1),spgpu_type_double)
! info = writeMultiVecDevice(gpx,test,size(test,1))
! !info = FallocMultiVecDevice(gpy,nrhs,size(test1,1),spgpu_type_double)
! info = readMultiVecDevice(gpx,test1,size(test1,1))
! do i=1,size(test1,1)
! write(*,*) test1(i,:)
! end do
! return
call x_mv_g%set(done)
call x_mv_g%sync()
call b_mv_g%set(done)
call b_mv_g%sync()
call psb_barrier(ctxt)
tt1 = psb_wtime()
do i=1,ntests
call psb_spmm(done,agpu,x_mv_g,dzero,b_mv_g,desc_a,info)
call psb_spmm(done,agpu,x_mv,dzero,b_mv_g,desc_a,info)
if ((info /= 0).or.(psb_get_errstatus()/=0)) then
write(0,*) 'From 1 spmm',info,i,ntests
call psb_error()
@ -893,17 +848,15 @@ program pdegenmm
call psb_amx(ctxt,tt2)
x1 = b_mv%get_vect()
x2 = b_mv_g%get_vect()
write(*,*) 'X1 ', x1(1,:), ' X2 ', x2(1,:)
do i=1,size(b_mv_g%v%v,1)
write(*,*) b_mv_g%v%v(i,:)
write(*,*)
do i=1,size(x2,1)
write(*,*) x2(i,:)
end do
nr = desc_a%get_local_rows()
eps = maxval(abs(x1(1:nr,1:nrhs)-x2(1:nr,1:nrhs)))
call psb_amx(ctxt,eps)
if (iam==0) write(*,*) 'Max diff on xGPU',eps
return
! FIXME: cache flush needed here
call x_mv_g%set(x0)
call x_mv_g%sync()
@ -926,12 +879,13 @@ program pdegenmm
call b_mv_g%sync()
x1 = b_mv%get_vect()
x2 = b_mv_g%get_vect()
write(*,*) 'X1 ', x1(1,:), ' X2 ', x2(1,:)
write(*,*)
do i=1,size(x2,1)
write(*,*) x2(i,:)
end do
call psb_geaxpby(-done,b_mv_g,+done,b_mv,desc_a,info)
eps = psb_geamax(b_mv,desc_a,info)
return
call psb_amx(ctxt,t2)
eps = maxval(abs(x1(1:nr,1:nrhs)-x2(1:nr,1:nrhs)))
call psb_amx(ctxt,eps)

Loading…
Cancel
Save