From 173ffec2d3e5129a2149d578be944bdbbd9d76ad Mon Sep 17 00:00:00 2001
From: Salvatore Filippone
Date: Mon, 27 May 2024 07:02:00 -0400
Subject: [PATCH] First working version of CSGA. To be tested and refined.

---
 cuda/d_csga_mod.F90           |  3 +-
 cuda/fcusparse_fct.h          |  2 +-
 cuda/spgpu/kernels/dcsgamv.cu | 77 ++++++++++++++++++++++-------------
 test/cudakern/Makefile        |  2 +-
 test/cudakern/dpdegenmv.F90   |  6 +++
 5 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/cuda/d_csga_mod.F90 b/cuda/d_csga_mod.F90
index e86288fe..9055a010 100644
--- a/cuda/d_csga_mod.F90
+++ b/cuda/d_csga_mod.F90
@@ -93,7 +93,7 @@ module d_csga_mod
   end interface

   interface spmvCSGADevice
-    function d_spmvCSGADevice(Mat,alpha,x,beta,y) &
+    function d_spmvCSGADevice(Mat,alpha,x,beta,y,rb) &
          & bind(c,name="d_spmvCSGADevice") result(res)
       use iso_c_binding
       import d_Cmat
@@ -101,6 +101,7 @@ module d_csga_mod
       type(c_ptr), value :: x
       type(c_ptr), value :: y
       real(c_double), value :: alpha,beta
+      integer(c_int) :: rb(*)
       integer(c_int) :: res
     end function d_spmvCSGADevice
   end interface
diff --git a/cuda/fcusparse_fct.h b/cuda/fcusparse_fct.h
index 7e757b2e..a90331ea 100644
--- a/cuda/fcusparse_fct.h
+++ b/cuda/fcusparse_fct.h
@@ -251,7 +251,7 @@ int T_CSRGDeviceAlloc(T_Cmat *Matrix,int nr, int nc, int nz)
   cMat->m = nr;
   cMat->n = nc;
   cMat->nz = nz;
-  fprintf(stderr,"Setting M %d N %d \n",cMat->m,cMat->n);
+  //fprintf(stderr,"Setting M %d N %d \n",cMat->m,cMat->n);
   if (nr1 == 0) nr1 = 1;
   if (nz1 == 0) nz1 = 1;
   if ((rc= allocRemoteBuffer(((void **) &(cMat->irp)), ((nr1+1)*sizeof(int)))) != 0)
diff --git a/cuda/spgpu/kernels/dcsgamv.cu b/cuda/spgpu/kernels/dcsgamv.cu
index 5c229ef7..f4d0afdd 100644
--- a/cuda/spgpu/kernels/dcsgamv.cu
+++ b/cuda/spgpu/kernels/dcsgamv.cu
@@ -23,9 +23,9 @@ __device__ double warp_reduce(double val){
   return val;
 }

-__global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
-                             int m, int n, int col_multivector,
-                             int* rowBlocks, double* resultData, int baseIndex){
+__global__ void dCSGAmvINNER(double alpha, const double* as, const int* ja, const int* irp,
+                             const double* multivector, int m, int n, int col_multivector,
+                             const int* rowBlocks, double beta, double* resultData, int baseIndex){

   __shared__ double vals[MAX_NNZ_PER_WG];
   __shared__ int cols[MAX_NNZ_PER_WG];
@@ -34,6 +34,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
   long int numRows = stopRow - startRow;
   int nnz = irp[stopRow]-irp[startRow];
   int tid = threadIdx.x; // thread index within the block
+
   if (numRows > 1){
     //CSR-Stream
     //printf("csr stream\n");
@@ -46,6 +47,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
         //vals[i] *= multivector[ja[localCol]*col_multivector+j];
         cols[i] = ja[localCol];
       }
+      //return;

     int firstCol = irp[startRow];
     __syncthreads();
@@ -56,7 +58,11 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
         for (int i = irp[localRow]-firstCol; i < irp[localRow+1]-firstCol; i++){
           temp += vals[i]*multivector[cols[i]*col_multivector + j];
         }
-        resultData[localRow*col_multivector +j] = temp;
+        if (beta == 0.0) {
+          resultData[localRow*col_multivector +j] = alpha*temp;
+        } else {
+          resultData[localRow*col_multivector +j] = alpha*temp + beta*resultData[localRow*col_multivector +j];
+        }
       }
     __syncthreads();

@@ -65,7 +71,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
     //CSR-Vector
     //printf("csr vector\n");
     int warpId = tid / 32; // Global warp index
-    int lane = tid &(32-1); // thread index within the warp
+    int lane = tid &(0xFFFF); // thread index within the warp
     //one warp per row
     double val;
     int col;
@@ -78,6 +84,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
         cols[i] = ja[localCol];
       }
     }
+    //return;
     __syncthreads();
     if (warpId < col_multivector){
       for (int col_m = warpId; col_m < col_multivector; col_m +=32){
@@ -93,7 +100,12 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
         }
         sum[col_m] = warp_reduce(sum[col_m]);
         if (lane == 0){
-          resultData[startRow*col_multivector + col_m] = sum[col_m];
+          if (beta == 0.0) {
+            resultData[startRow*col_multivector + col_m] = alpha*sum[col_m];
+          } else {
+            resultData[startRow*col_multivector + col_m] = alpha*sum[col_m]
+              + beta*resultData[startRow*col_multivector + col_m];
+          }
         }
       }
     }
   }
@@ -102,44 +114,51 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,

 __host__
 int
 dCSGAMV(spgpuHandle_t handle,
-        double beta,
-        double* y,
-        double alpha,
-        const double* as,
-        const int* ja,
-        const int* irp,
-        int m,
-        int n,
-        int numBlocks,
-        const int* rowBlocks,
-        const double *x,
-        int baseIndex)
-{
-  int maxBForACall = max(handle->maxGridSizeX, numBlocks);
+        double beta,
+        double* y,
+        double alpha,
+        const double* as,
+        const int* ja,
+        const int* irp,
+        int m,
+        int n,
+        int ncol,
+        int numBlocks,
+        const int* rowBlocks,
+        const double *x,
+        int baseIndex,
+        int *rb)
+{
+  // fprintf(stderr," dcsgamv %d \n",numBlocks);
+  int maxBForACall = min(handle->maxGridSizeX, numBlocks);
+  //int maxBForACall = 1024;
   int blockX = THREAD_BLOCK;
   int gridX = maxBForACall;
   int rp,rows, blcks, bp, numb;
   dim3 blockSize(blockX);
   dim3 gridSize(gridX);
-
-  fprintf(stderr," dcsgamv %d %d \n",numBlocks,rowBlocks[0],rowBlocks[1]);
+  //fprintf(stderr," dcsgamv start %d %d %d \n",m,n,ncol);
   bp = 0;
   rp = 0;
   numb = numBlocks;
   while (numb > maxBForACall) {//managing large vectors
     blcks = maxBForACall;
-    rp = rowBlocks[bp];
-    rows = rowBlocks[bp+blcks]-rp;
-    fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp);
+    rp = rb[bp]-baseIndex;
+    rows = rb[bp+blcks]-rp;
+    fprintf(stderr," rp %d rows %d bp %d blcks %d\n",rp,rows,bp,blcks);
+    dCSGAmvINNER<<<blcks, blockSize>>>(alpha,as,ja,irp,x,rows,n,ncol,
+                                       &(rowBlocks[bp]),beta,&(y[rp]),baseIndex);
     bp += blcks;
     numb -= blcks;
   }
   blcks = numb;
-  rp = rowBlocks[bp];
-  rows = rowBlocks[bp+blcks]-rp;
-  fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp);
+  rp = rb[bp]-baseIndex;
+  rows = rb[bp+blcks]-rp;
+  //fprintf(stderr," rp %d rows %d bp %d blcks %d\n",rp,rows,bp,blcks);
+  dCSGAmvINNER<<<blcks, blockSize>>>(alpha,as,ja,irp,x,rows,n,ncol,
+                                     &(rowBlocks[bp]),beta,&(y[rp]),baseIndex);
   rp += rows;
-  fprintf(stderr," Final rows %d \n",rows);
+  //fprintf(stderr," Final rows %d %d \n",rows,rp);
   return(SPGPU_SUCCESS);
 }
diff --git a/test/cudakern/Makefile b/test/cudakern/Makefile
index 5d938973..2653decd 100755
--- a/test/cudakern/Makefile
+++ b/test/cudakern/Makefile
@@ -32,7 +32,7 @@ dir:
 	(if test ! -d $(EXEDIR); then mkdir $(EXEDIR); fi)

 dpdegenmv: $(DPGOBJS)
-	$(FLINK) $(LOPT) $(DPGOBJS) -fopenmp -o dpdegenmv $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS)
+	$(FLINK) $(LOPT) $(DPGOBJS) -o dpdegenmv $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS)
 	/bin/mv dpdegenmv $(EXEDIR)
 spdegenmv: $(SPGOBJS)
 	$(FLINK) $(LOPT) $(SPGOBJS) -o spdegenmv $(PSBLAS_LIB) $(LDLIBS)
diff --git a/test/cudakern/dpdegenmv.F90 b/test/cudakern/dpdegenmv.F90
index 7f01051c..f5fedbe2 100644
--- a/test/cudakern/dpdegenmv.F90
+++ b/test/cudakern/dpdegenmv.F90
@@ -596,6 +596,7 @@ program pdgenmv
   integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes
   real(psb_dpk_)    :: err, eps
   integer, parameter :: ntests=200, ngpu=50, ncnv=20
+  !integer, parameter :: ntests=1, ngpu=1, ncnv=1
   type(psb_d_coo_sparse_mat), target   :: acoo
   type(psb_d_csr_sparse_mat), target   :: acsr
   type(psb_d_ell_sparse_mat), target   :: aell
@@ -853,6 +854,11 @@ program pdgenmv
   eps = maxval(abs(x1(1:nr)-x2(1:nr)))
   call psb_amx(ctxt,eps)
   if (iam==0) write(*,*) 'Max diff on GPU',eps
+  if (.false.) then
+    do i=1,nr
+      write(0,*) i,x1(i),x2(i)
+    end do
+  end if
   if (dump) then
     write(fname,'(a,i3.3,a,i3.3,a)')'XCPU-out-',iam,'-',np,'.mtx'
     call mm_array_write(x1(1:nr),'Local part CPU',info,filename=fname)
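
Note for reviewers: the reworked dCSGAMV entry point now takes both the device-resident rowBlocks array (indexed inside the kernel) and a host-side copy rb of the same list, which is read on the CPU to derive the starting row rp and the row count of each launch chunk. The sketch below is only an illustration of how a caller could drive the new signature; it is not part of the patch. The wrapper function and all buffer names (as_d, ja_d, irp_d, x_d, y_d, rowBlocks_d, rb_h) are hypothetical, and it assumes the usual spgpu headers defining spgpuHandle_t are in scope. Only the dCSGAMV prototype is taken from the diff above.

    /* Prototype as introduced by this patch (normally provided by the
     * corresponding spgpu header). */
    int dCSGAMV(spgpuHandle_t handle, double beta, double *y, double alpha,
                const double *as, const int *ja, const int *irp,
                int m, int n, int ncol, int numBlocks,
                const int *rowBlocks, const double *x, int baseIndex, int *rb);

    /* Computes y = alpha*A*x + beta*y for a CSGA matrix split into
     * numBlocks row blocks.  rowBlocks_d is the device copy used by the
     * kernel; rb_h is the host copy used to partition the work into
     * launches no larger than handle->maxGridSizeX blocks. */
    static int spmv_csga_example(spgpuHandle_t handle,
                                 double alpha, double beta,
                                 const double *as_d, const int *ja_d, const int *irp_d,
                                 int m, int n, int ncol, int numBlocks,
                                 const int *rowBlocks_d, /* device copy, read by the kernel */
                                 int *rb_h,              /* host copy, read on the CPU      */
                                 const double *x_d, double *y_d,
                                 int baseIndex)
    {
      return dCSGAMV(handle, beta, y_d, alpha, as_d, ja_d, irp_d,
                     m, n, ncol, numBlocks, rowBlocks_d, x_d,
                     baseIndex, rb_h);
    }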