From 173ffec2d3e5129a2149d578be944bdbbd9d76ad Mon Sep 17 00:00:00 2001
From: Salvatore Filippone
Date: Mon, 27 May 2024 07:02:00 -0400
Subject: [PATCH] First working version of CSGA. To be tested and refined.

---
 cuda/d_csga_mod.F90           |  3 +-
 cuda/fcusparse_fct.h          |  2 +-
 cuda/spgpu/kernels/dcsgamv.cu | 77 ++++++++++++++++++++++-------------
 test/cudakern/Makefile        |  2 +-
 test/cudakern/dpdegenmv.F90   |  6 +++
 5 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/cuda/d_csga_mod.F90 b/cuda/d_csga_mod.F90
index e86288fe..9055a010 100644
--- a/cuda/d_csga_mod.F90
+++ b/cuda/d_csga_mod.F90
@@ -93,7 +93,7 @@ module d_csga_mod
   end interface

   interface spmvCSGADevice
-    function d_spmvCSGADevice(Mat,alpha,x,beta,y) &
+    function d_spmvCSGADevice(Mat,alpha,x,beta,y,rb) &
          & bind(c,name="d_spmvCSGADevice") result(res)
       use iso_c_binding
       import d_Cmat
@@ -101,6 +101,7 @@ module d_csga_mod
       type(c_ptr), value :: x
       type(c_ptr), value :: y
       real(c_double), value :: alpha,beta
+      integer(c_int) :: rb(*)
       integer(c_int) :: res
     end function d_spmvCSGADevice
   end interface
diff --git a/cuda/fcusparse_fct.h b/cuda/fcusparse_fct.h
index 7e757b2e..a90331ea 100644
--- a/cuda/fcusparse_fct.h
+++ b/cuda/fcusparse_fct.h
@@ -251,7 +251,7 @@ int T_CSRGDeviceAlloc(T_Cmat *Matrix,int nr, int nc, int nz)
   cMat->m = nr;
   cMat->n = nc;
   cMat->nz = nz;
-  fprintf(stderr,"Setting M %d N %d \n",cMat->m,cMat->n);
+  //fprintf(stderr,"Setting M %d N %d \n",cMat->m,cMat->n);
   if (nr1 == 0) nr1 = 1;
   if (nz1 == 0) nz1 = 1;
   if ((rc= allocRemoteBuffer(((void **) &(cMat->irp)), ((nr1+1)*sizeof(int)))) != 0)
diff --git a/cuda/spgpu/kernels/dcsgamv.cu b/cuda/spgpu/kernels/dcsgamv.cu
index 5c229ef7..f4d0afdd 100644
--- a/cuda/spgpu/kernels/dcsgamv.cu
+++ b/cuda/spgpu/kernels/dcsgamv.cu
@@ -23,9 +23,9 @@ __device__ double warp_reduce(double val){
   return val;
 }

-__global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
-                             int m, int n, int col_multivector,
-                             int* rowBlocks, double* resultData, int baseIndex){
+__global__ void dCSGAmvINNER(double alpha, const double* as, const int* ja, const int* irp,
+                             const double* multivector, int m, int n, int col_multivector,
+                             const int* rowBlocks, double beta, double* resultData, int baseIndex){

   __shared__ double vals[MAX_NNZ_PER_WG];
   __shared__ int cols[MAX_NNZ_PER_WG];
@@ -34,6 +34,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
   long int numRows = stopRow - startRow;
   int nnz = irp[stopRow]-irp[startRow];
   int tid = threadIdx.x; // thread index within the block
+
   if (numRows > 1){
     //CSR-Stream
     //printf("csr stream\n");
@@ -46,6 +47,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
         //vals[i] *= multivector[ja[localCol]*col_multivector+j];
         cols[i] = ja[localCol];
       }
+      //return;

     int firstCol = irp[startRow];
     __syncthreads();
@@ -56,7 +58,11 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
         for (int i = irp[localRow]-firstCol; i < irp[localRow+1]-firstCol; i++){
           temp += vals[i]*multivector[cols[i]*col_multivector + j];
         }
-        resultData[localRow*col_multivector +j] = temp;
+        if (beta == 0.0) {
+          resultData[localRow*col_multivector +j] = alpha*temp;
+        } else {
+          resultData[localRow*col_multivector +j] = alpha*temp + beta*resultData[localRow*col_multivector +j];
+        }
       }
     __syncthreads();

@@ -65,7 +71,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
     //CSR-Vector
     //printf("csr vector\n");
     int warpId = tid / 32; // Global warp index
-    int lane = tid &(32-1); // thread index within the warp
+    int lane = tid &(0xFFFF); // thread index within the warp
     //one warp per row
     double val;
     int col;
@@ -78,6 +84,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
         cols[i] = ja[localCol];
       }
     }
+    //return;
     __syncthreads();
     if (warpId < col_multivector){
       for (int col_m = warpId; col_m < col_multivector; col_m +=32){
@@ -93,7 +100,12 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
         }
         sum[col_m] = warp_reduce(sum[col_m]);
         if (lane == 0){
-          resultData[startRow*col_multivector + col_m] = sum[col_m];
+          if (beta == 0.0) {
+            resultData[startRow*col_multivector + col_m] = alpha*sum[col_m];
+          } else {
+            resultData[startRow*col_multivector + col_m] = alpha*sum[col_m]
+              + beta*resultData[startRow*col_multivector + col_m];
+          }
         }
       }
     }
   }
@@ -102,44 +114,51 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,

 __host__
 int
 dCSGAMV(spgpuHandle_t handle,
-        double beta,
-        double* y,
-        double alpha,
-        const double* as,
-        const int* ja,
-        const int* irp,
-        int m,
-        int n,
-        int numBlocks,
-        const int* rowBlocks,
-        const double *x,
-        int baseIndex)
-{
-  int maxBForACall = max(handle->maxGridSizeX, numBlocks);
+        double beta,
+        double* y,
+        double alpha,
+        const double* as,
+        const int* ja,
+        const int* irp,
+        int m,
+        int n,
+        int ncol,
+        int numBlocks,
+        const int* rowBlocks,
+        const double *x,
+        int baseIndex,
+        int *rb)
+{
+  // fprintf(stderr," dcsgamv %d \n",numBlocks);
+  int maxBForACall = min(handle->maxGridSizeX, numBlocks);
+  //int maxBForACall = 1024;
   int blockX = THREAD_BLOCK;
   int gridX = maxBForACall;
   int rp,rows, blcks, bp, numb;
   dim3 blockSize(blockX);
   dim3 gridSize(gridX);
-
-  fprintf(stderr," dcsgamv %d %d \n",numBlocks,rowBlocks[0],rowBlocks[1]);
+  //fprintf(stderr," dcsgamv start %d %d %d \n",m,n,ncol);
   bp = 0;
   rp = 0;
   numb = numBlocks;
   while (numb > maxBForACall) {//managing large vectors
     blcks = maxBForACall;
-    rp = rowBlocks[bp];
-    rows = rowBlocks[bp+blcks]-rp;
-    fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp);
+    rp = rb[bp]-baseIndex;
+    rows = rb[bp+blcks]-rp;
+    fprintf(stderr," rp %d rows %d bp %d blcks %d\n",rp,rows,bp,blcks);
+    dCSGAmvINNER<<<blcks, blockSize>>>(alpha,as,ja,irp,x,rows,n,ncol,
+                                       &(rowBlocks[bp]),beta,&(y[rp]),baseIndex);
     bp += blcks;
     numb -= blcks;
   }
   blcks = numb;
-  rp = rowBlocks[bp];
-  rows = rowBlocks[bp+blcks]-rp;
-  fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp);
+  rp = rb[bp]-baseIndex;
+  rows = rb[bp+blcks]-rp;
+  //fprintf(stderr," rp %d rows %d bp %d blcks %d\n",rp,rows,bp,blcks);
+  dCSGAmvINNER<<<blcks, blockSize>>>(alpha,as,ja,irp,x,rows,n,ncol,
+                                     &(rowBlocks[bp]),beta,&(y[rp]),baseIndex);
   rp += rows;
-  fprintf(stderr," Final rows %d \n",rows);
+  //fprintf(stderr," Final rows %d %d \n",rows,rp);
   return(SPGPU_SUCCESS);
 }
diff --git a/test/cudakern/Makefile b/test/cudakern/Makefile
index 5d938973..2653decd 100755
--- a/test/cudakern/Makefile
+++ b/test/cudakern/Makefile
@@ -32,7 +32,7 @@ dir:
 	(if test ! -d $(EXEDIR); then mkdir $(EXEDIR); fi)

 dpdegenmv: $(DPGOBJS)
-	$(FLINK) $(LOPT) $(DPGOBJS) -fopenmp -o dpdegenmv $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS)
+	$(FLINK) $(LOPT) $(DPGOBJS) -o dpdegenmv $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS)
 	/bin/mv dpdegenmv $(EXEDIR)
 spdegenmv: $(SPGOBJS)
 	$(FLINK) $(LOPT) $(SPGOBJS) -o spdegenmv $(PSBLAS_LIB) $(LDLIBS)
diff --git a/test/cudakern/dpdegenmv.F90 b/test/cudakern/dpdegenmv.F90
index 7f01051c..f5fedbe2 100644
--- a/test/cudakern/dpdegenmv.F90
+++ b/test/cudakern/dpdegenmv.F90
@@ -596,6 +596,7 @@ program pdgenmv
   integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes
   real(psb_dpk_)    :: err, eps
   integer, parameter :: ntests=200, ngpu=50, ncnv=20
+  !integer, parameter :: ntests=1, ngpu=1, ncnv=1
   type(psb_d_coo_sparse_mat), target   :: acoo
   type(psb_d_csr_sparse_mat), target   :: acsr
   type(psb_d_ell_sparse_mat), target   :: aell
@@ -853,6 +854,11 @@ program pdgenmv
   eps = maxval(abs(x1(1:nr)-x2(1:nr)))
   call psb_amx(ctxt,eps)
   if (iam==0) write(*,*) 'Max diff on GPU',eps
+  if (.false.) then
+    do i=1,nr
+      write(0,*) i,x1(i),x2(i)
+    end do
+  end if
   if (dump) then
     write(fname,'(a,i3.3,a,i3.3,a)')'XCPU-out-',iam,'-',np,'.mtx'
     call mm_array_write(x1(1:nr),'Local part CPU',info,filename=fname)
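
Note for reviewers: the reworked dCSGAMV entry point now takes both the device-resident rowBlocks array (indexed inside the kernel) and a host-side copy rb of the same list, which is read on the CPU to derive the starting row rp and the row count of each launch chunk. The sketch below is only an illustration of how a caller could drive the new signature; it is not part of the patch. The wrapper function and all buffer names (as_d, ja_d, irp_d, x_d, y_d, rowBlocks_d, rb_h) are hypothetical, and it assumes the usual spgpu headers defining spgpuHandle_t are in scope. Only the dCSGAMV prototype is taken from the diff above.

    /* Prototype as introduced by this patch (normally provided by the
     * corresponding spgpu header). */
    int dCSGAMV(spgpuHandle_t handle, double beta, double *y, double alpha,
                const double *as, const int *ja, const int *irp,
                int m, int n, int ncol, int numBlocks,
                const int *rowBlocks, const double *x, int baseIndex, int *rb);

    /* Computes y = alpha*A*x + beta*y for a CSGA matrix split into
     * numBlocks row blocks.  rowBlocks_d is the device copy used by the
     * kernel; rb_h is the host copy used to partition the work into
     * launches no larger than handle->maxGridSizeX blocks. */
    static int spmv_csga_example(spgpuHandle_t handle,
                                 double alpha, double beta,
                                 const double *as_d, const int *ja_d, const int *irp_d,
                                 int m, int n, int ncol, int numBlocks,
                                 const int *rowBlocks_d, /* device copy, read by the kernel */
                                 int *rb_h,              /* host copy, read on the CPU      */
                                 const double *x_d, double *y_d,
                                 int baseIndex)
    {
      return dCSGAMV(handle, beta, y_d, alpha, as_d, ja_d, irp_d,
                     m, n, ncol, numBlocks, rowBlocks_d, x_d,
                     baseIndex, rb_h);
    }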