Try to fix SpMM

psblas-bgmres
gabrielequatrana 10 months ago
parent ee140bc8dd
commit 409b51e609

@ -69,7 +69,7 @@ Module psb_d_tools_mod
subroutine psb_dalloc_multivect_r2(x, desc_a,info,m,n,lb, dupl, bldmode)
import
implicit none
type(psb_d_multivect_type), intent(out) :: x(:)
type(psb_d_multivect_type), allocatable, intent(out) :: x(:)
type(psb_desc_type), intent(in) :: desc_a
integer(psb_ipk_),intent(out) :: info
integer(psb_ipk_), optional, intent(in) :: m, n, lb

@ -184,7 +184,7 @@ int writeRemoteBuffer(void* hostSrc, void* buffer, int count)
// TODO
int writeRemoteBufferR2(void* hostSrc, void* buffer, int count, int pitch, int size)
{
cudaError_t err = cudaMemcpy2D(buffer, pitch, hostSrc, count, count, size, cudaMemcpyHostToDevice);
cudaError_t err = cudaMemcpy2D(buffer, pitch, hostSrc, size, size, count, cudaMemcpyHostToDevice);
if (err == cudaSuccess)
return SPGPU_SUCCESS;
@ -223,7 +223,7 @@ int readRemoteBuffer(void* hostDest, void* buffer, int count)
// TODO: fix pitch and size handling (they can be managed without a realloc on the Fortran side)
int readRemoteBufferR2(void* hostDest, void* buffer, int count, int pitch, int size)
{
cudaError_t err = cudaMemcpy2D(hostDest, count, buffer, pitch, count, size, cudaMemcpyDeviceToHost);
cudaError_t err = cudaMemcpy2D(hostDest, size, buffer, pitch, size, count, cudaMemcpyDeviceToHost);
if (err == cudaSuccess)
return SPGPU_SUCCESS;

@ -56,7 +56,7 @@ int writeMultiVecDeviceDouble(void* deviceVec, double* hostVec)
int writeMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld)
{ int i;
struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec;
i = writeRemoteBufferR2((void*) hostVec, (void *)devVec->v_, devVec->count_*sizeof(double), devVec->pitch_, devVec->size_);
i = writeRemoteBufferR2((void*) hostVec, (void *)devVec->v_, devVec->count_, devVec->pitch_*sizeof(double), devVec->size_*sizeof(double));
// i = writeMultiVecDeviceDouble(deviceVec, (void *) hostVec);
fprintf(stderr,"From routine : %s : %p %p\n","writeMultiVecDeviceDoubleR2",devVec->v_,devVec->v_+devVec->pitch_);
if (i != 0) {
@ -79,7 +79,7 @@ int readMultiVecDeviceDouble(void* deviceVec, double* hostVec)
int readMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld)
{ int i;
struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec;
i = readRemoteBufferR2((void *) hostVec, (void *)devVec->v_, devVec->count_*sizeof(double), devVec->pitch_, devVec->size_);
i = readRemoteBufferR2((void *) hostVec, (void *)devVec->v_, devVec->count_, devVec->pitch_*sizeof(double), devVec->size_*sizeof(double));
// i = readMultiVecDeviceDouble(deviceVec, hostVec);
fprintf(stderr,"From routine : %s : %p \n","readMultiVecDeviceDoubleR2",devVec->v_);
if (i != 0) {

@ -209,8 +209,8 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
// vX=x->v_;
// vY=y->v_;
// fprintf(stderr,"CUDA ENTERED %p %d %d %d %d %d\n", vX, pitch, y->size_, x->count_, alpha, beta);
// CHECK_CUSPARSE(cusparseCreateDnMat(&vecX, cMat->n, x->count_, pitch, vX, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
// CHECK_CUSPARSE(cusparseCreateDnMat(&vecY, cMat->m, y->count_, pitch, vY, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
// CHECK_CUSPARSE(cusparseCreateDnMat(&vecX, cMat->n, x->count_, y->size_, vX, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
// CHECK_CUSPARSE(cusparseCreateDnMat(&vecY, cMat->m, y->count_, y->size_, vY, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
// CHECK_CUSPARSE(cusparseSpMM_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
// CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,
// (*(cMat->spmvDescr)),vecX,&beta,vecY,
@ -236,7 +236,7 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
for(j=0;j<y->count_;j++) {
vX=x->v_+pitch*j;
vY=y->v_+pitch*j;
fprintf(stderr,"CUDA ENTERED 1 %d %p %p %d %d %d %d\n",j, vX, vY, pitch, y->size_, cMat->m, cMat->n);
fprintf(stderr,"CUDA ENTERED %d %p %p %d %d\n",j, vX, vY, pitch, y->size_);
CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, cMat->m, vY, CUSPARSE_BASE_TYPE) );
CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, cMat->n, vX, CUSPARSE_BASE_TYPE) );
CHECK_CUSPARSE(cusparseSpMV_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
@ -257,7 +257,6 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
&alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,
CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT,
cMat->mvbuffer));
fprintf(stderr,"CUDA ENTERED 2 %d %p %p %d %d %d %d\n",j, vX, vY, *((double*)vX), *((double*)vY), pitch, y->size_);
CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) );
CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) );
}

@ -100,16 +100,16 @@ subroutine psb_d_cuda_csrg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_double)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_double)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvCSRGDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@ -173,7 +173,6 @@ subroutine psb_d_cuda_csrg_multivect_mv(alpha,a,x,beta,y,info,trans)
if (yy%is_host()) call yy%sync()
end if
! TODO
write(*,*) 'AAAAAAAAA'
info = spmvCSRGDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then

@ -165,7 +165,7 @@ int FallocMultiVecDevice(void** deviceMultiVec, unsigned int count,
p = getMultiVectorDeviceParams(count, size, elementType);
i = allocMultiVecDevice(deviceMultiVec, &p);
fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size);
//fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size);
//cudaSync();
if (i != 0) {
fprintf(stderr,"From routine : %s : %d, %d %d \n","FallocMultiVecDevice",i, count, size);

@ -624,7 +624,7 @@ program pdegenmm
character(len=20) :: name,ch_err
character(len=40) :: fname
real(psb_dpk_), allocatable :: test(:,:), test1(:,:), test2(:,:)
real(psb_dpk_), allocatable :: test(:,:), test1(:,:), test2(:)
type(c_ptr) :: gpx, gpy
@ -856,10 +856,27 @@ program pdegenmm
! write(*,*) test(i,:)
! end do
write(*,*) 'TEST'
! TODO
! allocate(test(8,2),test1(8,2),test2(8))
! do i=1,size(test,1)
! test(i,:) = i*done
! end do
! info = FallocMultiVecDevice(gpx,nrhs,size(test,1),spgpu_type_double)
! info = writeMultiVecDevice(gpx,test,size(test,1))
! !info = FallocMultiVecDevice(gpy,nrhs,size(test1,1),spgpu_type_double)
! info = readMultiVecDevice(gpx,test1,size(test1,1))
! do i=1,size(test1,1)
! write(*,*) test1(i,:)
! end do
! return
call x_mv_g%set(done)
call x_mv_g%sync()
call b_mv_g%set(done)
call b_mv_g%sync()
call psb_barrier(ctxt)
tt1 = psb_wtime()
do i=1,ntests
@ -876,7 +893,6 @@ program pdegenmm
call psb_amx(ctxt,tt2)
x1 = b_mv%get_vect()
x2 = b_mv_g%get_vect()
write(*,*) 'MHANZ ', b_mv_g%get_nrows(), size(b_mv_g%v%v,1)
write(*,*) 'X1 ', x1(1,:), ' X2 ', x2(1,:)
do i=1,size(b_mv_g%v%v,1)
write(*,*) b_mv_g%v%v(i,:)

Loading…
Cancel
Save