From 409b51e609a6340f20e986064af0bf76f5fa095d Mon Sep 17 00:00:00 2001
From: gabrielequatrana <gabriele.quatrana@outlook.com>
Date: Tue, 9 Apr 2024 23:21:17 +0200
Subject: [PATCH] Try to fix SpMM

---
 base/modules/tools/psb_d_tools_mod.F90 |  2 +-
 cuda/cuda_util.c                       |  4 ++--
 cuda/dvectordev.c                      |  4 ++--
 cuda/fcusparse_fct.h                   |  7 +++----
 cuda/impl/psb_d_cuda_csrg_csmm.F90     |  6 +++---
 cuda/impl/psb_d_cuda_csrg_vect_mv.F90  |  1 -
 cuda/vectordev.c                       |  2 +-
 test/block_krylov/kernel/dpdegenmm.F90 | 22 +++++++++++++++++++---
 8 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/base/modules/tools/psb_d_tools_mod.F90 b/base/modules/tools/psb_d_tools_mod.F90
index 72f2a1fe..b84b3095 100644
--- a/base/modules/tools/psb_d_tools_mod.F90
+++ b/base/modules/tools/psb_d_tools_mod.F90
@@ -69,7 +69,7 @@ Module psb_d_tools_mod
     subroutine psb_dalloc_multivect_r2(x, desc_a,info,m,n,lb, dupl, bldmode)
       import
       implicit none
-      type(psb_d_multivect_type), intent(out)  :: x(:)
+      type(psb_d_multivect_type), allocatable, intent(out)  :: x(:)
       type(psb_desc_type), intent(in) :: desc_a
       integer(psb_ipk_),intent(out)             :: info
       integer(psb_ipk_), optional, intent(in)   :: m, n, lb
diff --git a/cuda/cuda_util.c b/cuda/cuda_util.c
index c66fb96f..de3e6686 100644
--- a/cuda/cuda_util.c
+++ b/cuda/cuda_util.c
@@ -184,7 +184,7 @@ int writeRemoteBuffer(void* hostSrc, void* buffer, int count)
 // TODO
 int writeRemoteBufferR2(void* hostSrc, void* buffer, int count, int pitch, int size)
 {
-  cudaError_t err = cudaMemcpy2D(buffer, pitch, hostSrc, count, count, size, cudaMemcpyHostToDevice);
+  cudaError_t err = cudaMemcpy2D(buffer, pitch, hostSrc, size, size, count, cudaMemcpyHostToDevice);
 
   if (err == cudaSuccess)
     return SPGPU_SUCCESS;	
@@ -223,7 +223,7 @@ int readRemoteBuffer(void* hostDest, void* buffer, int count)
 // TODO sistemare pitch e size (si possono gestire senza realloc su fortran)
 int readRemoteBufferR2(void* hostDest, void* buffer, int count, int pitch, int size)
 {
-  cudaError_t err = cudaMemcpy2D(hostDest, count, buffer, pitch, count, size, cudaMemcpyDeviceToHost);
+  cudaError_t err = cudaMemcpy2D(hostDest, size, buffer, pitch, size, count, cudaMemcpyDeviceToHost);
 
   if (err == cudaSuccess)
     return SPGPU_SUCCESS;	
diff --git a/cuda/dvectordev.c b/cuda/dvectordev.c
index 28b9055a..27d13a3c 100644
--- a/cuda/dvectordev.c
+++ b/cuda/dvectordev.c
@@ -56,7 +56,7 @@ int writeMultiVecDeviceDouble(void* deviceVec, double* hostVec)
 int writeMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld)
 { int i;
   struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec;
-  i = writeRemoteBufferR2((void*) hostVec, (void *)devVec->v_, devVec->count_*sizeof(double), devVec->pitch_, devVec->size_);
+  i = writeRemoteBufferR2((void*) hostVec, (void *)devVec->v_, devVec->count_, devVec->pitch_*sizeof(double), devVec->size_*sizeof(double));
 //   i = writeMultiVecDeviceDouble(deviceVec, (void *) hostVec);
   fprintf(stderr,"From routine : %s : %p %p\n","writeMultiVecDeviceDoubleR2",devVec->v_,devVec->v_+devVec->pitch_);
   if (i != 0) {
@@ -79,7 +79,7 @@ int readMultiVecDeviceDouble(void* deviceVec, double* hostVec)
 int readMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld)
 { int i;
   struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec;
-  i = readRemoteBufferR2((void *) hostVec, (void *)devVec->v_, devVec->count_*sizeof(double), devVec->pitch_, devVec->size_);
+  i = readRemoteBufferR2((void *) hostVec, (void *)devVec->v_, devVec->count_, devVec->pitch_*sizeof(double), devVec->size_*sizeof(double));
 //   i = readMultiVecDeviceDouble(deviceVec, hostVec);
   fprintf(stderr,"From routine : %s : %p \n","readMultiVecDeviceDoubleR2",devVec->v_);
   if (i != 0) {
diff --git a/cuda/fcusparse_fct.h b/cuda/fcusparse_fct.h
index 4d901134..118cbd7c 100644
--- a/cuda/fcusparse_fct.h
+++ b/cuda/fcusparse_fct.h
@@ -209,8 +209,8 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
 //   vX=x->v_;
 //   vY=y->v_;
 //   fprintf(stderr,"CUDA ENTERED %p %d %d %d %d %d\n", vX, pitch, y->size_, x->count_, alpha, beta);
-//   CHECK_CUSPARSE(cusparseCreateDnMat(&vecX, cMat->n, x->count_, pitch, vX, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
-//   CHECK_CUSPARSE(cusparseCreateDnMat(&vecY, cMat->m, y->count_, pitch, vY, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
+//   CHECK_CUSPARSE(cusparseCreateDnMat(&vecX, cMat->n, x->count_, y->size_, vX, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
+//   CHECK_CUSPARSE(cusparseCreateDnMat(&vecY, cMat->m, y->count_, y->size_, vY, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
 //   CHECK_CUSPARSE(cusparseSpMM_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
 //                         CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,
 //                         (*(cMat->spmvDescr)),vecX,&beta,vecY,
@@ -236,7 +236,7 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
   for(j=0;j<y->count_;j++) {
     vX=x->v_+pitch*j;
     vY=y->v_+pitch*j;
-    fprintf(stderr,"CUDA ENTERED 1 %d %p %p %d %d %d %d\n",j, vX, vY, pitch, y->size_, cMat->m, cMat->n);
+    fprintf(stderr,"CUDA ENTERED %d %p %p %d %d\n",j, vX, vY, pitch, y->size_);
     CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, cMat->m, vY, CUSPARSE_BASE_TYPE) );
     CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, cMat->n, vX, CUSPARSE_BASE_TYPE) );
     CHECK_CUSPARSE(cusparseSpMV_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
@@ -257,7 +257,6 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
                     &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,
                     CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT,
                     cMat->mvbuffer));
-    fprintf(stderr,"CUDA ENTERED 2 %d %p %p %d %d %d %d\n",j, vX, vY, *((double*)vX), *((double*)vY), pitch, y->size_);
     CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) );
     CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) );
   }
diff --git a/cuda/impl/psb_d_cuda_csrg_csmm.F90 b/cuda/impl/psb_d_cuda_csrg_csmm.F90
index ddac1373..4515eb31 100644
--- a/cuda/impl/psb_d_cuda_csrg_csmm.F90
+++ b/cuda/impl/psb_d_cuda_csrg_csmm.F90
@@ -100,16 +100,16 @@ subroutine psb_d_cuda_csrg_csmm(alpha,a,x,beta,y,info,trans)
     if (info == 0) &
          & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_double)
     if (info == 0) &
-         & info = writeMultiVecDevice(gpX,x,nxy)
+         & info = writeMultiVecDevice(gpX,x,size(x,1))
     if (info == 0) &
          & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_double)
     if (info == 0) &
-         & info = writeMultiVecDevice(gpY,y,nxy)
+         & info = writeMultiVecDevice(gpY,y,size(y,1))
 
     if (info == 0)  &
          & info = spmvCSRGDevice(a%deviceMat,alpha,gpX,beta,gpY)
     if (info == 0) &
-         & info = readMultiVecDevice(gpY,y,nxy)
+         & info = readMultiVecDevice(gpY,y,size(y,1))
     if (info /= 0) goto 9999
     call freeMultiVecDevice(gpX)
     call freeMultiVecDevice(gpY)
diff --git a/cuda/impl/psb_d_cuda_csrg_vect_mv.F90 b/cuda/impl/psb_d_cuda_csrg_vect_mv.F90
index a62e1d0f..b4a7ea53 100644
--- a/cuda/impl/psb_d_cuda_csrg_vect_mv.F90
+++ b/cuda/impl/psb_d_cuda_csrg_vect_mv.F90
@@ -173,7 +173,6 @@ subroutine psb_d_cuda_csrg_multivect_mv(alpha,a,x,beta,y,info,trans)
           if (yy%is_host()) call yy%sync()
         end if
         ! TODO
-        write(*,*) 'AAAAAAAAA'
         info = spmvCSRGDevice(a%deviceMat,alpha,xx%deviceVect,&
              & beta,yy%deviceVect)
         if (info /= 0) then 
diff --git a/cuda/vectordev.c b/cuda/vectordev.c
index f947a73a..22039b5c 100644
--- a/cuda/vectordev.c
+++ b/cuda/vectordev.c
@@ -165,7 +165,7 @@ int FallocMultiVecDevice(void** deviceMultiVec, unsigned int count,
 
   p = getMultiVectorDeviceParams(count, size, elementType);
   i = allocMultiVecDevice(deviceMultiVec, &p);
-  fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size);
+  //fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size);
   //cudaSync();
   if (i != 0) {
     fprintf(stderr,"From routine : %s : %d, %d %d \n","FallocMultiVecDevice",i, count, size);
diff --git a/test/block_krylov/kernel/dpdegenmm.F90 b/test/block_krylov/kernel/dpdegenmm.F90
index 44c24365..98b1f2c8 100644
--- a/test/block_krylov/kernel/dpdegenmm.F90
+++ b/test/block_krylov/kernel/dpdegenmm.F90
@@ -624,7 +624,7 @@ program pdegenmm
   character(len=20)  :: name,ch_err
   character(len=40)  :: fname
 
-  real(psb_dpk_), allocatable :: test(:,:), test1(:,:), test2(:,:)
+  real(psb_dpk_), allocatable :: test(:,:), test1(:,:), test2(:)
 
   type(c_ptr) :: gpx, gpy
 
@@ -856,10 +856,27 @@ program pdegenmm
 !     write(*,*) test(i,:)
 !   end do
 
-  write(*,*) 'TEST'
+! TODO
+!   allocate(test(8,2),test1(8,2),test2(8))
+!   do i=1,size(test,1)
+!     test(i,:) = i*done
+!   end do
+!   info = FallocMultiVecDevice(gpx,nrhs,size(test,1),spgpu_type_double)
+!   info = writeMultiVecDevice(gpx,test,size(test,1))
+!   !info = FallocMultiVecDevice(gpy,nrhs,size(test1,1),spgpu_type_double)
+!   info = readMultiVecDevice(gpx,test1,size(test1,1))
+
+!   do i=1,size(test1,1)
+!     write(*,*) test1(i,:)
+!   end do
+
+!   return
+
   call x_mv_g%set(done)
   call x_mv_g%sync()
 
+  call b_mv_g%set(done)
+  call b_mv_g%sync()
   call psb_barrier(ctxt)
   tt1 = psb_wtime()
   do i=1,ntests 
@@ -876,7 +893,6 @@ program pdegenmm
   call psb_amx(ctxt,tt2)
   x1 = b_mv%get_vect()
   x2 = b_mv_g%get_vect()
-  write(*,*) 'MHANZ ', b_mv_g%get_nrows(), size(b_mv_g%v%v,1)
   write(*,*) 'X1 ', x1(1,:), ' X2 ', x2(1,:)
   do i=1,size(b_mv_g%v%v,1)
     write(*,*) b_mv_g%v%v(i,:)