First working version of CSGA. To be tested and refined.

10 months ago · 173ffec2d3
parent 3ba2002e60
commit 173ffec2d3
5 changed files with 58 additions and 32 deletions
--- a/cuda/d_csga_mod.F90
+++ b/cuda/d_csga_mod.F90
@ -93,7 +93,7 @@ module d_csga_mod
  end interface
 
  interface spmvCSGADevice
-    function d_spmvCSGADevice(Mat,alpha,x,beta,y) &
+    function d_spmvCSGADevice(Mat,alpha,x,beta,y,rb) &
         &  bind(c,name="d_spmvCSGADevice") result(res)
      use iso_c_binding
      import  d_Cmat
@ -101,6 +101,7 @@ module d_csga_mod
      type(c_ptr), value    :: x
      type(c_ptr), value    :: y
      real(c_double), value :: alpha,beta
+      integer(c_int)        :: rb(*)
      integer(c_int)        :: res
    end function d_spmvCSGADevice
  end interface
--- a/cuda/fcusparse_fct.h
+++ b/cuda/fcusparse_fct.h
@ -251,7 +251,7 @@ int T_CSRGDeviceAlloc(T_Cmat *Matrix,int nr, int nc, int nz)
  cMat->m  = nr;
  cMat->n  = nc;
  cMat->nz = nz;
-  fprintf(stderr,"Setting M %d  N %d \n",cMat->m,cMat->n);
+  //fprintf(stderr,"Setting M %d  N %d \n",cMat->m,cMat->n);
  if (nr1 == 0) nr1 = 1;
  if (nz1 == 0) nz1 = 1;
  if ((rc= allocRemoteBuffer(((void **) &(cMat->irp)), ((nr1+1)*sizeof(int)))) != 0)
--- a/cuda/spgpu/kernels/dcsgamv.cu
+++ b/cuda/spgpu/kernels/dcsgamv.cu
@ -23,9 +23,9 @@ __device__ double warp_reduce(double val){
  return val;
 }

-__global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
-			     int m, int n, int col_multivector,
-			     int* rowBlocks, double* resultData, int baseIndex){
+__global__ void dCSGAmvINNER(double alpha, const double* as, const int* ja, const int* irp,
+			     const double* multivector, int m, int n, int col_multivector,
+			     const int* rowBlocks, double beta, double* resultData, int baseIndex){
  __shared__ double vals[MAX_NNZ_PER_WG];
  __shared__ int cols[MAX_NNZ_PER_WG];
  
@ -34,6 +34,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
  long int numRows = stopRow - startRow;
  int nnz = irp[stopRow]-irp[startRow];
  int tid = threadIdx.x; // indice del thread nel blocco
+
  if (numRows > 1){
    //CSR-Stream
    //printf("csr stream\n");
@ -46,6 +47,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
      //vals[i] *= multivector[ja[localCol]*col_multivector+j];
      cols[i] = ja[localCol];
    }
+    //return;
    int firstCol = irp[startRow];
    
    __syncthreads();
@ -56,7 +58,11 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
      for (int i = irp[localRow]-firstCol; i < irp[localRow+1]-firstCol; i++){
 	temp += vals[i]*multivector[cols[i]*col_multivector + j];
      }
-      resultData[localRow*col_multivector +j] = temp;
+      if (beta == 0.0) {
+	resultData[localRow*col_multivector +j] = alpha*temp;
+      } else {
+	resultData[localRow*col_multivector +j] = alpha*temp + 	beta*resultData[localRow*col_multivector +j];
+      }
    }
    
    __syncthreads();    
@ -65,7 +71,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
    //CSR-Vector
    //printf("csr vector\n");
    int warpId = tid / 32; // Global warp index
-    int lane = tid &(32-1); // thread index within the warp
+    int lane = tid &(0xFFFF); // thread index within the warp
    //one warp per row
    double val; 
    int col;
@ -78,6 +84,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
 	cols[i] = ja[localCol];
      }
    }
+    //return;
    __syncthreads();
    if (warpId < col_multivector){
      for (int col_m = warpId; col_m < col_multivector; col_m +=32){
@ -93,7 +100,12 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
 	}
 	sum[col_m] = warp_reduce(sum[col_m]);
 	if (lane == 0){
-	  resultData[startRow*col_multivector + col_m] = sum[col_m];   
+	  if (beta == 0.0) {
+	    resultData[startRow*col_multivector + col_m] = alpha*sum[col_m];   
+	  } else {
+	    resultData[startRow*col_multivector + col_m] = alpha*sum[col_m] +
+	      beta*resultData[startRow*col_multivector + col_m];
+	  }	  
 	}
      }
    }
@ -102,44 +114,51 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,


 __host__ int dCSGAMV(spgpuHandle_t handle, 
-	    double beta,
-	    double* y, 
-	    double alpha, 
-	    const double* as, 
-	    const int* ja,
-	    const int* irp,
-	    int m,
-	    int n,
-	    int  numBlocks,
-	    const int* rowBlocks,
-	    const double *x,
-	    int baseIndex)
-{
-  int maxBForACall = max(handle->maxGridSizeX, numBlocks);
+		     double beta,
+		     double* y, 
+		     double alpha, 
+		     const double* as, 
+		     const int* ja,
+		     const int* irp,
+		     int m,
+		     int n,
+		     int ncol,
+		     int  numBlocks,
+		     const int* rowBlocks,
+		     const double *x,
+		     int baseIndex,
+		     int *rb)
+{ 
+  //  fprintf(stderr," dcsgamv  %d   \n",numBlocks);
+  int maxBForACall = min(handle->maxGridSizeX, numBlocks);
+  //int maxBForACall = 1024;
  int blockX = THREAD_BLOCK;
  int gridX = maxBForACall;
  int rp,rows, blcks, bp, numb;
  dim3 blockSize(blockX);
  dim3 gridSize(gridX);
-
-  fprintf(stderr," dcsgamv  %d  %d \n",numBlocks,rowBlocks[0],rowBlocks[1]);
+  //fprintf(stderr," dcsgamv  start %d %d %d  \n",m,n,ncol);

  bp = 0;
  rp = 0;
  numb = numBlocks;  
  while (numb > maxBForACall) {//managing large vectors
    blcks = maxBForACall;
-    rp = rowBlocks[bp];
-    rows = rowBlocks[bp+blcks]-rp;
-    fprintf(stderr,"  rp %d  rows %d  bp %d  \n",rp,rows,bp);
+    rp = rb[bp]-baseIndex;
+    rows = rb[bp+blcks]-rp;
+    fprintf(stderr,"  rp %d  rows %d  bp %d  blcks %d\n",rp,rows,bp,blcks);
+    dCSGAmvINNER<<<gridSize,blockSize>>>(alpha,as,ja,irp,x,rows,n,ncol,
+					 &(rowBlocks[bp]),beta,&(y[rp]),baseIndex);
    bp   += blcks;
    numb -= blcks;
  }
  blcks = numb;
-  rp = rowBlocks[bp];
-  rows = rowBlocks[bp+blcks]-rp;
-  fprintf(stderr,"  rp %d  rows %d  bp %d  \n",rp,rows,bp);
+  rp = rb[bp]-baseIndex;
+  rows = rb[bp+blcks]-rp;
+  //fprintf(stderr,"  rp %d  rows %d  bp %d  blcks %d\n",rp,rows,bp,blcks);
+  dCSGAmvINNER<<<gridSize,blockSize>>>(alpha,as,ja,irp,x,rows,n,ncol,
+				       &(rowBlocks[bp]),beta,&(y[rp]),baseIndex);
  rp += rows;
-  fprintf(stderr,"  Final  rows %d    \n",rows);
+  //fprintf(stderr,"  Final  rows %d  %d  \n",rows,rp);
  return(SPGPU_SUCCESS);
 }
--- a/test/cudakern/Makefile
+++ b/test/cudakern/Makefile
@ -32,7 +32,7 @@ dir:
 	(if test ! -d $(EXEDIR); then mkdir $(EXEDIR); fi)

 dpdegenmv: $(DPGOBJS)
-	$(FLINK) $(LOPT) $(DPGOBJS) -fopenmp -o dpdegenmv $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS) 
+	$(FLINK) $(LOPT) $(DPGOBJS)  -o dpdegenmv $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS) 
 	/bin/mv dpdegenmv $(EXEDIR) 
 spdegenmv: $(SPGOBJS)
 	$(FLINK) $(LOPT) $(SPGOBJS) -o spdegenmv $(PSBLAS_LIB) $(LDLIBS) 
--- a/test/cudakern/dpdegenmv.F90
+++ b/test/cudakern/dpdegenmv.F90
@ -596,6 +596,7 @@ program pdgenmv
  integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes
  real(psb_dpk_)   :: err, eps
  integer, parameter :: ntests=200, ngpu=50, ncnv=20
+  !integer, parameter :: ntests=1, ngpu=1, ncnv=1
  type(psb_d_coo_sparse_mat), target   :: acoo
  type(psb_d_csr_sparse_mat), target   :: acsr
  type(psb_d_ell_sparse_mat), target   :: aell
@ -853,6 +854,11 @@ program pdgenmv
  eps = maxval(abs(x1(1:nr)-x2(1:nr)))
  call psb_amx(ctxt,eps)
  if (iam==0) write(*,*) 'Max diff on GPU',eps
+  if (.false.) then
+    do i=1,nr
+      write(0,*) i,x1(i),x2(i)
+    end do
+  end if
  if (dump) then 
    write(fname,'(a,i3.3,a,i3.3,a)')'XCPU-out-',iam,'-',np,'.mtx'
    call mm_array_write(x1(1:nr),'Local part CPU',info,filename=fname)