Read/Write multivect fixed (SpMM bug)

psblas-bgmres
gabrielequatrana 10 months ago
parent a624b7098b
commit ee140bc8dd

@ -3400,7 +3400,7 @@ contains
implicit none implicit none
class(psb_d_base_multivect_type), intent(inout) :: x class(psb_d_base_multivect_type), intent(inout) :: x
integer(psb_ipk_), intent(in) :: nr integer(psb_ipk_), intent(in) :: nr
real(psb_dpk_), allocatable :: res real(psb_dpk_) :: res
integer(psb_ipk_) :: j, nc integer(psb_ipk_) :: j, nc
if (x%is_dev()) call x%sync() if (x%is_dev()) call x%sync()

@ -181,6 +181,20 @@ int writeRemoteBuffer(void* hostSrc, void* buffer, int count)
} }
} }
// TODO
int writeRemoteBufferR2(void* hostSrc, void* buffer, int count, int pitch, int size)
{
cudaError_t err = cudaMemcpy2D(buffer, pitch, hostSrc, count, count, size, cudaMemcpyHostToDevice);
if (err == cudaSuccess)
return SPGPU_SUCCESS;
else {
fprintf(stderr,"CUDA Error writeRemoteBuffer: %s %p %p %d %d %d\n",
cudaGetErrorString(err),buffer, hostSrc, count, pitch, size);
return SPGPU_UNSPECIFIED;
}
}
int readRemoteBuffer(void* hostDest, void* buffer, int count) int readRemoteBuffer(void* hostDest, void* buffer, int count)
{ {
@ -206,6 +220,20 @@ int readRemoteBuffer(void* hostDest, void* buffer, int count)
} }
} }
// TODO sistemare pitch e size (si possono gestire senza realloc su fortran)
int readRemoteBufferR2(void* hostDest, void* buffer, int count, int pitch, int size)
{
cudaError_t err = cudaMemcpy2D(hostDest, count, buffer, pitch, count, size, cudaMemcpyDeviceToHost);
if (err == cudaSuccess)
return SPGPU_SUCCESS;
else {
fprintf(stderr,"CUDA Error readRemoteBuffer: %s %p %p %d %d\n",
cudaGetErrorString(err),hostDest,buffer,count,err);
return SPGPU_UNSPECIFIED;
}
}
int freeRemoteBuffer(void* buffer) int freeRemoteBuffer(void* buffer)
{ {
cudaError_t err = cudaFree(buffer); cudaError_t err = cudaFree(buffer);

@ -49,7 +49,9 @@ int allocMappedMemory(void **buffer, void **dp, int size);
int registerMappedMemory(void *buffer, void **dp, int size); int registerMappedMemory(void *buffer, void **dp, int size);
int unregisterMappedMemory(void *buffer); int unregisterMappedMemory(void *buffer);
int writeRemoteBuffer(void* hostSrc, void* buffer, int count); int writeRemoteBuffer(void* hostSrc, void* buffer, int count);
int writeRemoteBufferR2(void* hostSrc, void* buffer, int count, int pitch, int size);
int readRemoteBuffer(void* hostDest, void* buffer, int count); int readRemoteBuffer(void* hostDest, void* buffer, int count);
int readRemoteBufferR2(void* hostDest, void* buffer, int count, int pitch, int size);
int freeRemoteBuffer(void* buffer); int freeRemoteBuffer(void* buffer);
int gpuInit(int dev); int gpuInit(int dev);
int getDeviceCount(); int getDeviceCount();

@ -55,7 +55,10 @@ int writeMultiVecDeviceDouble(void* deviceVec, double* hostVec)
int writeMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld) int writeMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld)
{ int i; { int i;
i = writeMultiVecDeviceDouble(deviceVec, (void *) hostVec); struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec;
i = writeRemoteBufferR2((void*) hostVec, (void *)devVec->v_, devVec->count_*sizeof(double), devVec->pitch_, devVec->size_);
// i = writeMultiVecDeviceDouble(deviceVec, (void *) hostVec);
fprintf(stderr,"From routine : %s : %p %p\n","writeMultiVecDeviceDoubleR2",devVec->v_,devVec->v_+devVec->pitch_);
if (i != 0) { if (i != 0) {
fprintf(stderr,"From routine : %s : %d \n","writeMultiVecDeviceDoubleR2",i); fprintf(stderr,"From routine : %s : %d \n","writeMultiVecDeviceDoubleR2",i);
} }
@ -75,7 +78,10 @@ int readMultiVecDeviceDouble(void* deviceVec, double* hostVec)
int readMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld) int readMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld)
{ int i; { int i;
i = readMultiVecDeviceDouble(deviceVec, hostVec); struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec;
i = readRemoteBufferR2((void *) hostVec, (void *)devVec->v_, devVec->count_*sizeof(double), devVec->pitch_, devVec->size_);
// i = readMultiVecDeviceDouble(deviceVec, hostVec);
fprintf(stderr,"From routine : %s : %p \n","readMultiVecDeviceDoubleR2",devVec->v_);
if (i != 0) { if (i != 0) {
fprintf(stderr,"From routine : %s : %d \n","readMultiVecDeviceDoubleR2",i); fprintf(stderr,"From routine : %s : %d \n","readMultiVecDeviceDoubleR2",i);
} }
@ -236,6 +242,7 @@ int axpbyMultiVecDeviceDouble(int n,double alpha, void* devMultiVecX,
return SPGPU_UNSUPPORTED; return SPGPU_UNSUPPORTED;
for(j=0;j<devVecY->count_;j++) for(j=0;j<devVecY->count_;j++)
fprintf(stderr,"CUDA ENTERED %d %d %d %d \n",j, n, pitch, devVecY->size_);
spgpuDaxpby(handle,(double*)devVecY->v_+pitch*j, n, beta, spgpuDaxpby(handle,(double*)devVecY->v_+pitch*j, n, beta,
(double*)devVecY->v_+pitch*j, alpha,(double*) devVecX->v_+pitch*j); (double*)devVecY->v_+pitch*j, alpha,(double*) devVecX->v_+pitch*j);
return(i); return(i);

@ -137,8 +137,10 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX; struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
void *vX, *vY; void *vX, *vY;
int r,n; int j,r,n;
int pitch;
cusparseHandle_t *my_handle=getHandle(); cusparseHandle_t *my_handle=getHandle();
pitch = y->pitch_;
TYPE ealpha=alpha, ebeta=beta; TYPE ealpha=alpha, ebeta=beta;
#if CUDA_SHORT_VERSION <= 10 #if CUDA_SHORT_VERSION <= 10
/* getAddrMultiVecDevice(deviceX, &vX); */ /* getAddrMultiVecDevice(deviceX, &vX); */
@ -196,37 +198,69 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
CUSPARSE_BASE_TYPE, (void *) cMat->mvbuffer)); CUSPARSE_BASE_TYPE, (void *) cMat->mvbuffer));
#else #else
cusparseDnVecDescr_t vecX, vecY; // cusparseDnVecDescr_t vecX, vecY;
cusparseDnMatDescr_t vecX, vecY;
size_t bfsz; size_t bfsz;
if (T_CSRGIsNullMvDescr(cMat)) { if (T_CSRGIsNullMvDescr(cMat)) {
cMat->spmvDescr = (cusparseSpMatDescr_t *) malloc(sizeof(cusparseSpMatDescr_t *)); cMat->spmvDescr = (cusparseSpMatDescr_t *) malloc(sizeof(cusparseSpMatDescr_t *));
} }
T_CSRGCreateSpMVDescr(cMat); T_CSRGCreateSpMVDescr(cMat);
vX=x->v_; // vX=x->v_;
vY=y->v_; // vY=y->v_;
CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, cMat->m, vY, CUSPARSE_BASE_TYPE) ); // fprintf(stderr,"CUDA ENTERED %p %d %d %d %d %d\n", vX, pitch, y->size_, x->count_, alpha, beta);
CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, cMat->n, vX, CUSPARSE_BASE_TYPE) ); // CHECK_CUSPARSE(cusparseCreateDnMat(&vecX, cMat->n, x->count_, pitch, vX, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
CHECK_CUSPARSE(cusparseSpMV_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE, // CHECK_CUSPARSE(cusparseCreateDnMat(&vecY, cMat->m, y->count_, pitch, vY, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
&alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY, // CHECK_CUSPARSE(cusparseSpMM_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT, // CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,
&bfsz)); // (*(cMat->spmvDescr)),vecX,&beta,vecY,
if (bfsz > cMat->mvbsize) { // CUSPARSE_BASE_TYPE,CUSPARSE_SPMM_ALG_DEFAULT,
if (cMat->mvbuffer != NULL) { // &bfsz));
CHECK_CUDA(cudaFree(cMat->mvbuffer)); // if (bfsz > cMat->mvbsize) {
cMat->mvbuffer = NULL; // if (cMat->mvbuffer != NULL) {
// CHECK_CUDA(cudaFree(cMat->mvbuffer));
// cMat->mvbuffer = NULL;
// }
// //CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz));
// allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz);
// cMat->mvbsize = bfsz;
// }
// CHECK_CUSPARSE(cusparseSpMM(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
// CUSPARSE_OPERATION_NON_TRANSPOSE,
// &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,CUSPARSE_BASE_TYPE,
// CUSPARSE_SPMM_ALG_DEFAULT,cMat->mvbuffer));
// CHECK_CUSPARSE(cusparseDestroyDnMat(vecX));
// CHECK_CUSPARSE(cusparseDestroyDnMat(vecY));
for(j=0;j<y->count_;j++) {
vX=x->v_+pitch*j;
vY=y->v_+pitch*j;
fprintf(stderr,"CUDA ENTERED 1 %d %p %p %d %d %d %d\n",j, vX, vY, pitch, y->size_, cMat->m, cMat->n);
CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, cMat->m, vY, CUSPARSE_BASE_TYPE) );
CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, cMat->n, vX, CUSPARSE_BASE_TYPE) );
CHECK_CUSPARSE(cusparseSpMV_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,
CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT,
&bfsz));
if (bfsz > cMat->mvbsize) {
if (cMat->mvbuffer != NULL) {
CHECK_CUDA(cudaFree(cMat->mvbuffer));
cMat->mvbuffer = NULL;
}
//CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz));
allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz);
cMat->mvbsize = bfsz;
} }
//CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz)); CHECK_CUSPARSE(cusparseSpMV(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz); &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,
CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT,
cMat->mvbsize = bfsz; cMat->mvbuffer));
fprintf(stderr,"CUDA ENTERED 2 %d %p %p %d %d %d %d\n",j, vX, vY, *((double*)vX), *((double*)vY), pitch, y->size_);
CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) );
CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) );
} }
CHECK_CUSPARSE(cusparseSpMV(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,
CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT,
cMat->mvbuffer));
CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) );
CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) );
CHECK_CUSPARSE(cusparseDestroySpMat(*(cMat->spmvDescr))); CHECK_CUSPARSE(cusparseDestroySpMat(*(cMat->spmvDescr)));
#endif #endif
} }

@ -115,3 +115,92 @@ subroutine psb_d_cuda_csrg_vect_mv(alpha,a,x,beta,y,info,trans)
return return
end subroutine psb_d_cuda_csrg_vect_mv end subroutine psb_d_cuda_csrg_vect_mv
subroutine psb_d_cuda_csrg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use cusparse_mod
use elldev_mod
use psb_vectordev_mod
use psb_d_cuda_csrg_mat_mod, psb_protect_name => psb_d_cuda_csrg_multivect_mv
use psb_d_cuda_multivect_mod
implicit none
class(psb_d_cuda_csrg_sparse_mat), intent(in) :: a
real(psb_dpk_), intent(in) :: alpha, beta
class(psb_d_base_multivect_type), intent(inout) :: x
class(psb_d_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
real(psb_dpk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='d_cuda_csrg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (.not.x%is_host()) call x%sync()
if (beta /= dzero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_d_csr_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_d_multivect_cuda)
select type(yy => y)
type is (psb_d_multivect_cuda)
if (xx%is_host()) call xx%sync()
if (beta /= dzero) then
if (yy%is_host()) call yy%sync()
end if
! TODO
write(*,*) 'AAAAAAAAA'
info = spmvCSRGDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmvCSRGDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
rx = xx%get_vect()
ry = y%get_vect()
call a%psb_d_csr_sparse_mat%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
rx = x%get_vect()
ry = y%get_vect()
call a%psb_d_csr_sparse_mat%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_d_cuda_csrg_multivect_mv

@ -55,6 +55,7 @@ module psb_d_cuda_csrg_mat_mod
procedure, nopass :: get_fmt => d_cuda_csrg_get_fmt procedure, nopass :: get_fmt => d_cuda_csrg_get_fmt
procedure, pass(a) :: sizeof => d_cuda_csrg_sizeof procedure, pass(a) :: sizeof => d_cuda_csrg_sizeof
procedure, pass(a) :: vect_mv => psb_d_cuda_csrg_vect_mv procedure, pass(a) :: vect_mv => psb_d_cuda_csrg_vect_mv
procedure, pass(a) :: multivect_mv => psb_d_cuda_csrg_multivect_mv
procedure, pass(a) :: in_vect_sv => psb_d_cuda_csrg_inner_vect_sv procedure, pass(a) :: in_vect_sv => psb_d_cuda_csrg_inner_vect_sv
procedure, pass(a) :: csmm => psb_d_cuda_csrg_csmm procedure, pass(a) :: csmm => psb_d_cuda_csrg_csmm
procedure, pass(a) :: csmv => psb_d_cuda_csrg_csmv procedure, pass(a) :: csmv => psb_d_cuda_csrg_csmv
@ -109,6 +110,15 @@ module psb_d_cuda_csrg_mat_mod
integer(psb_ipk_), intent(out) :: info integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans character, optional, intent(in) :: trans
end subroutine psb_d_cuda_csrg_vect_mv end subroutine psb_d_cuda_csrg_vect_mv
subroutine psb_d_cuda_csrg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_d_cuda_csrg_sparse_mat, psb_dpk_, psb_d_base_multivect_type, psb_ipk_
class(psb_d_cuda_csrg_sparse_mat), intent(in) :: a
real(psb_dpk_), intent(in) :: alpha, beta
class(psb_d_base_multivect_type), intent(inout) :: x
class(psb_d_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_d_cuda_csrg_multivect_mv
end interface end interface
interface interface

@ -1357,9 +1357,8 @@ module psb_d_cuda_multivect_mod
procedure, pass(x) :: get_nrows => d_cuda_multi_get_nrows procedure, pass(x) :: get_nrows => d_cuda_multi_get_nrows
procedure, pass(x) :: get_ncols => d_cuda_multi_get_ncols procedure, pass(x) :: get_ncols => d_cuda_multi_get_ncols
procedure, nopass :: get_fmt => d_cuda_multi_get_fmt procedure, nopass :: get_fmt => d_cuda_multi_get_fmt
! TODO procedure, pass(x) :: dot_v => d_cuda_multi_dot_v
!!$ procedure, pass(x) :: dot_v => d_cuda_multi_dot_v procedure, pass(x) :: dot_a => d_cuda_multi_dot_a
!!$ procedure, pass(x) :: dot_a => d_cuda_multi_dot_a
procedure, pass(y) :: axpby_v => d_cuda_multi_axpby_v procedure, pass(y) :: axpby_v => d_cuda_multi_axpby_v
procedure, pass(y) :: axpby_a => d_cuda_multi_axpby_a procedure, pass(y) :: axpby_a => d_cuda_multi_axpby_a
!!$ procedure, pass(y) :: mlt_v => d_cuda_multi_mlt_v !!$ procedure, pass(y) :: mlt_v => d_cuda_multi_mlt_v
@ -1369,7 +1368,7 @@ module psb_d_cuda_multivect_mod
!!$ procedure, pass(x) :: scal => d_cuda_multi_scal !!$ procedure, pass(x) :: scal => d_cuda_multi_scal
procedure, pass(x) :: nrm2 => d_cuda_multi_nrm2 procedure, pass(x) :: nrm2 => d_cuda_multi_nrm2
procedure, pass(x) :: amax => d_cuda_multi_amax procedure, pass(x) :: amax => d_cuda_multi_amax
!!$ procedure, pass(x) :: asum => d_cuda_multi_asum procedure, pass(x) :: asum => d_cuda_multi_asum
procedure, pass(x) :: all => d_cuda_multi_all procedure, pass(x) :: all => d_cuda_multi_all
procedure, pass(x) :: zero => d_cuda_multi_zero procedure, pass(x) :: zero => d_cuda_multi_zero
procedure, pass(x) :: asb => d_cuda_multi_asb procedure, pass(x) :: asb => d_cuda_multi_asb
@ -1608,11 +1607,11 @@ contains
res = 'dGPU' res = 'dGPU'
end function d_cuda_multi_get_fmt end function d_cuda_multi_get_fmt
function d_cuda_multi_dot_v(n,x,y) result(res) function d_cuda_multi_dot_v(nr,x,y) result(res)
implicit none implicit none
class(psb_d_multivect_cuda), intent(inout) :: x class(psb_d_multivect_cuda), intent(inout) :: x
class(psb_d_base_multivect_type), intent(inout) :: y class(psb_d_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(in) :: n integer(psb_ipk_), intent(in) :: nr
real(psb_dpk_), allocatable :: res(:,:) real(psb_dpk_), allocatable :: res(:,:)
real(psb_dpk_), external :: ddot real(psb_dpk_), external :: ddot
integer(psb_ipk_) :: info integer(psb_ipk_) :: info
@ -1623,38 +1622,41 @@ contains
! When we get here, we are sure that X is of ! When we get here, we are sure that X is of
! TYPE psb_d_vect ! TYPE psb_d_vect
! !
! TODO tra
select type(yy => y) select type(yy => y)
type is (psb_d_multivect_cuda) type is (psb_d_multivect_cuda)
if (x%is_host()) call x%sync() if (x%is_host()) call x%sync()
if (yy%is_host()) call yy%sync() if (yy%is_host()) call yy%sync()
info = dotMultiVecDevice(res,n,x%deviceVect,yy%deviceVect,x%get_ncols()) info = dotMultiVecDevice(res,nr,x%deviceVect,yy%deviceVect,x%get_ncols())
if (info /= 0) then if (info /= 0) then
info = psb_err_internal_error_ info = psb_err_internal_error_
call psb_errpush(info,'d_cuda_multi_dot_v') call psb_errpush(info,'d_cuda_multi_dot_v')
end if end if
! TODO
class default class default
! y%sync is done in dot_a ! y%sync is done in dot_a
call x%sync() call x%sync()
res = y%dot(n,x%v) res = y%dot(nr,x%v)
end select end select
end function d_cuda_multi_dot_v end function d_cuda_multi_dot_v
! TODO function d_cuda_multi_dot_a(nr,x,y) result(res)
function d_cuda_multi_dot_a(n,x,y) result(res)
implicit none implicit none
class(psb_d_multivect_cuda), intent(inout) :: x class(psb_d_multivect_cuda), intent(inout) :: x
real(psb_dpk_), intent(in) :: y(:) real(psb_dpk_), intent(in) :: y(:,:)
integer(psb_ipk_), intent(in) :: n integer(psb_ipk_), intent(in) :: nr
real(psb_dpk_), allocatable :: res(:,:) real(psb_dpk_), allocatable :: res(:,:)
real(psb_dpk_), external :: ddot real(psb_dpk_), external :: ddot
integer(psb_ipk_) :: i, j, n_x, n_y
if (x%is_dev()) call x%sync() if (x%is_dev()) call x%sync()
allocate(res(2,2)) n_x = size(x%v,2)
res = ddot(n,y,1,x%v,1) n_y = size(y,2_psb_ipk_)
allocate(res(n_x,n_y))
do i=1,n_x
do j=1,n_y
res(i,j) = ddot(nr,x%v(1:nr,i),1,y(1:nr,j),1)
end do
end do
end function d_cuda_multi_dot_a end function d_cuda_multi_dot_a
@ -1980,7 +1982,7 @@ contains
nh=0 nh=0
end if end if
if (c_associated(x%deviceVect)) then if (c_associated(x%deviceVect)) then
md = getMultiVecDevicePitch(x%deviceVect) md = getMultiVecDeviceSize(x%deviceVect)
nd = getMultiVecDeviceCount(x%deviceVect) nd = getMultiVecDeviceCount(x%deviceVect)
if ((md < mh).or.(nd<nh)) then if ((md < mh).or.(nd<nh)) then
call freeMultiVecDevice(x%deviceVect) call freeMultiVecDevice(x%deviceVect)
@ -1991,7 +1993,7 @@ contains
if (.not.c_associated(x%deviceVect)) then if (.not.c_associated(x%deviceVect)) then
info = FallocMultiVecDevice(x%deviceVect,nh,mh,spgpu_type_double) info = FallocMultiVecDevice(x%deviceVect,nh,mh,spgpu_type_double)
if (info == 0) & if (info == 0) &
& call psb_realloc(getMultiVecDevicePitch(x%deviceVect),& & call psb_realloc(getMultiVecDeviceSize(x%deviceVect),&
& getMultiVecDeviceCount(x%deviceVect),x%v,info,pad=dzero) & getMultiVecDeviceCount(x%deviceVect),x%v,info,pad=dzero)
if (info /= 0) then if (info /= 0) then
!!$ write(0,*) 'Error from FallocMultiVecDevice',info,n !!$ write(0,*) 'Error from FallocMultiVecDevice',info,n
@ -2010,10 +2012,10 @@ contains
mh=0 mh=0
nh=0 nh=0
end if end if
md = getMultiVecDevicePitch(x%deviceVect) md = getMultiVecDeviceSize(x%deviceVect)
nd = getMultiVecDeviceCount(x%deviceVect) nd = getMultiVecDeviceCount(x%deviceVect)
if ((mh /= md).or.(nh /= nd)) then if ((mh /= md).or.(nh /= nd)) then
call psb_realloc(getMultiVecDevicePitch(x%deviceVect),& call psb_realloc(getMultiVecDeviceSize(x%deviceVect),&
& getMultiVecDeviceCount(x%deviceVect),x%v,info,pad=dzero) & getMultiVecDeviceCount(x%deviceVect),x%v,info,pad=dzero)
end if end if

@ -165,6 +165,7 @@ int FallocMultiVecDevice(void** deviceMultiVec, unsigned int count,
p = getMultiVectorDeviceParams(count, size, elementType); p = getMultiVectorDeviceParams(count, size, elementType);
i = allocMultiVecDevice(deviceMultiVec, &p); i = allocMultiVecDevice(deviceMultiVec, &p);
fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size);
//cudaSync(); //cudaSync();
if (i != 0) { if (i != 0) {
fprintf(stderr,"From routine : %s : %d, %d %d \n","FallocMultiVecDevice",i, count, size); fprintf(stderr,"From routine : %s : %d, %d %d \n","FallocMultiVecDevice",i, count, size);

@ -226,10 +226,6 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
! BGMRES algorithm ! BGMRES algorithm
! TODO Con tanti ITRS e tanti NRHS si ottengono NaN, deflazione e restart dopo aver trovato una colonna, difficile...
! TODO Provare a compilare su GPU remota (Vedere REC)
! STEP 1: Compute R(0) = B - A*X(0) ! STEP 1: Compute R(0) = B - A*X(0)
! Store B in V(1) ! Store B in V(1)
@ -284,7 +280,7 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
idx_i = (i-1)*nrhs+1 idx_i = (i-1)*nrhs+1
! STEP 6: Compute H(i,j) = (V(i)**T)*W ! STEP 6: Compute H(i,j) = (V(i)**T)*W
h(idx_i:idx_i+n_add,idx_j:idx_j+n_add) = psb_geprod(v(i),w,desc_a,info,trans=.true.) h(idx_i:idx_i+n_add,idx_j:idx_j+n_add) = psb_gedot(v(i),w,desc_a,info)
if (info /= psb_success_) then if (info /= psb_success_) then
info=psb_err_from_subroutine_non_ info=psb_err_from_subroutine_non_
call psb_errpush(info,name) call psb_errpush(info,name)
@ -366,13 +362,13 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
errnum = rmn2 errnum = rmn2
errden = r0n2 errden = r0n2
do col=1,nrhs ! do col=1,nrhs
write(*,*) rmn2(col), r0n2(col) ! write(*,*) rmn2(col), r0n2(col)
end do ! end do
end if end if
! TODO Check convergence (max o media) ! Check convergence (max o media)
if (all(errnum.le.(eps*errden))) then if (maxval(errnum).le.(eps*maxval(errden))) then
! Exit algorithm ! Exit algorithm
exit outer exit outer
@ -394,7 +390,6 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
! END algorithm ! END algorithm
! TODO Versione finale che stampa errore massimo (si può usare log_conv con questo)
if (itrace_ > 0) call log_conv(methdname,me,itx,ione,maxval(errnum),maxval(errden),deps) if (itrace_ > 0) call log_conv(methdname,me,itx,ione,maxval(errnum),maxval(errden),deps)
call log_end(methdname,me,itx,itrace_,maxval(errnum),maxval(errden),deps,err=derr,iter=iter) call log_end(methdname,me,itx,itrace_,maxval(errnum),maxval(errden),deps,err=derr,iter=iter)
@ -445,8 +440,6 @@ contains
beta_e1 = dzero beta_e1 = dzero
beta_e1(1:nrhs,1:nrhs) = beta beta_e1(1:nrhs,1:nrhs) = beta
! TODO DGELS ha anche i residui (con i residui fai come MATLAB e poi si prova se esce uguale)
! Compute min Frobenius norm ! Compute min Frobenius norm
call dgels('N',m_h,n_h,nrhs,h_temp(1:m_h,1:n_h),m_h,beta_e1,m_h,work,lwork,info) call dgels('N',m_h,n_h,nrhs,h_temp(1:m_h,1:n_h),m_h,beta_e1,m_h,work,lwork,info)

@ -37,4 +37,3 @@ lib:
(cd ../../; make library) (cd ../../; make library)
verycleanlib: verycleanlib:
(cd ../../; make veryclean) (cd ../../; make veryclean)

@ -624,7 +624,9 @@ program pdegenmm
character(len=20) :: name,ch_err character(len=20) :: name,ch_err
character(len=40) :: fname character(len=40) :: fname
real(psb_dpk_), allocatable :: test(:,:) real(psb_dpk_), allocatable :: test(:,:), test1(:,:), test2(:,:)
type(c_ptr) :: gpx, gpy
info=psb_success_ info=psb_success_
@ -787,7 +789,7 @@ program pdegenmm
#endif #endif
end do end do
call x_mv%set(x0) call x_mv%set(done)
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
do i=1,ntests do i=1,ntests
@ -803,7 +805,11 @@ program pdegenmm
! FIXME: cache flush needed here ! FIXME: cache flush needed here
x1 = b_mv%get_vect() x1 = b_mv%get_vect()
x2 = b_mv_g%get_vect() x2 = b_mv_g%get_vect()
do i=1,8
write(*,*) x1(i,:)
end do
! TODO test AXPBY e SPMM
! call psb_geall(xg,desc_a,info) ! call psb_geall(xg,desc_a,info)
! call psb_geasb(xg,desc_a,info,mold=tmold) ! call psb_geasb(xg,desc_a,info,mold=tmold)
! call xg%set(done) ! call xg%set(done)
@ -838,21 +844,26 @@ program pdegenmm
! write(*,*) test(i) ! write(*,*) test(i)
! end do ! end do
! TODO SpMM da fare con vettori GPU su mod csrg
! TODO SpMM da fare a parte dopo
! TODO Da cambiare WRITE READ R2 devono usare Memcopy 2D
! TODO Test DDOT ! TODO Test DDOT
call x_mv_g%set(done) ! call x_mv_g%set(done)
call b_mv_g%set(done+done) ! call b_mv_g%set(done+done)
test = b_mv_g%dot(8,x_mv_g) ! test = psb_gedot(b_mv_g,test2,desc_a,info)
write(*,*) 'SIZE ', size(test,1), size(test,2) ! write(*,*) 'SIZE ', size(test,1), size(test,2)
do i=1,nrhs ! do i=1,size(test,1)
write(*,*) test(i,:) ! write(*,*) test(i,:)
end do ! end do
return write(*,*) 'TEST'
call x_mv_g%set(done)
call x_mv_g%sync()
call psb_barrier(ctxt) call psb_barrier(ctxt)
tt1 = psb_wtime() tt1 = psb_wtime()
do i=1,ntests do i=1,ntests
call psb_spmm(done,agpu,x_mv,dzero,b_mv_g,desc_a,info) call psb_spmm(done,agpu,x_mv_g,dzero,b_mv_g,desc_a,info)
if ((info /= 0).or.(psb_get_errstatus()/=0)) then if ((info /= 0).or.(psb_get_errstatus()/=0)) then
write(0,*) 'From 1 spmm',info,i,ntests write(0,*) 'From 1 spmm',info,i,ntests
call psb_error() call psb_error()
@ -903,6 +914,8 @@ program pdegenmm
call psb_geaxpby(-done,b_mv_g,+done,b_mv,desc_a,info) call psb_geaxpby(-done,b_mv_g,+done,b_mv,desc_a,info)
eps = psb_geamax(b_mv,desc_a,info) eps = psb_geamax(b_mv,desc_a,info)
return
call psb_amx(ctxt,t2) call psb_amx(ctxt,t2)
eps = maxval(abs(x1(1:nr,1:nrhs)-x2(1:nr,1:nrhs))) eps = maxval(abs(x1(1:nr,1:nrhs)-x2(1:nr,1:nrhs)))
call psb_amx(ctxt,eps) call psb_amx(ctxt,eps)

Loading…
Cancel
Save