From 3ba2002e60c91c9dfe78c65997cea00538636076 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Fri, 24 May 2024 17:03:46 +0200 Subject: [PATCH] First attempt at running CSGA. To be debugged. --- cuda/Makefile | 3 +- cuda/d_csga_mod.F90 | 76 +++++---- cuda/fcusparse_dat.h | 2 + cuda/fcusparse_fct.h | 7 +- cuda/impl/Makefile | 6 +- cuda/psb_cuda_mod.F90 | 2 + cuda/psb_d_cuda_csga_mat_mod.F90 | 127 +++++++-------- cuda/psb_d_cuda_csrg_mat_mod.F90 | 11 +- cuda/spgpu/kernels/Makefile | 2 +- cuda/spgpu/kernels/dcsgamv.cu | 145 ++++++++++++++++++ cuda/spgpu/kernels/ell_spmv_base_template.cuh | 4 +- test/cudakern/dpdegenmv.F90 | 3 + 12 files changed, 267 insertions(+), 121 deletions(-) create mode 100644 cuda/spgpu/kernels/dcsgamv.cu diff --git a/cuda/Makefile b/cuda/Makefile index 395b6151..6e08eaa1 100755 --- a/cuda/Makefile +++ b/cuda/Makefile @@ -120,7 +120,8 @@ psb_cuda_mod.o: psb_cuda_env_mod.o psb_i_cuda_vect_mod.o\ psb_d_cuda_diag_mat_mod.o psb_d_cuda_csga_mat_mod.o\ psb_d_cuda_hdiag_mat_mod.o psb_s_cuda_hdiag_mat_mod.o\ psb_s_cuda_dnsg_mat_mod.o psb_d_cuda_dnsg_mat_mod.o \ - psb_c_cuda_dnsg_mat_mod.o psb_z_cuda_dnsg_mat_mod.o + psb_c_cuda_dnsg_mat_mod.o psb_z_cuda_dnsg_mat_mod.o \ + psb_d_cuda_csga_mat_mod.o iobjs: $(FOBJS) $(MAKE) -C impl objs diff --git a/cuda/d_csga_mod.F90 b/cuda/d_csga_mod.F90 index f2cfd745..e86288fe 100644 --- a/cuda/d_csga_mod.F90 +++ b/cuda/d_csga_mod.F90 @@ -33,16 +33,13 @@ module d_csga_mod integer, parameter :: MAX_NNZ_PER_WG = 4096 integer, parameter :: MAX_GRID_SIZE = 65536 - type, bind(c) :: d_CAmat - type(c_ptr) :: Mat = c_null_ptr - end type d_CAmat interface CSGADeviceFree function d_CSGADeviceFree(Mat) & & bind(c,name="d_CSGADeviceFree") result(res) use iso_c_binding - import d_CAmat - type(d_CAmat) :: Mat + import d_CMat + type(d_CMat) :: Mat integer(c_int) :: res end function d_CSGADeviceFree end interface @@ -51,62 +48,61 @@ module d_csga_mod function d_CSGADeviceAlloc(Mat,nr,nc,nz) & & bind(c,name="d_CSGADeviceAlloc") result(res) use iso_c_binding - import d_CAmat - type(d_CAmat) :: Mat + import d_CMat + type(d_CMat) :: Mat integer(c_int), value :: nr, nc, nz integer(c_int) :: res end function d_CSGADeviceAlloc end interface - - interface CSGADeviceSetMatDiagType - function d_CSGADeviceSetMatDiagType(Mat,type) & - & bind(c,name="d_CSGADeviceSetMatDiagType") result(res) - use iso_c_binding - import d_CAmat - type(d_CAmat) :: Mat - integer(c_int),value :: type - integer(c_int) :: res - end function d_CSGADeviceSetMatDiagType - end interface - interface CSGADeviceSetMatFillMode - function d_CSGADeviceSetMatFillMode(Mat,type) & - & bind(c,name="d_CSGADeviceSetMatFillMode") result(res) - use iso_c_binding - import d_CAmat - type(d_CAmat) :: Mat - integer(c_int),value :: type - integer(c_int) :: res - end function d_CSGADeviceSetMatFillMode - end interface - interface CSGAHost2Device - function d_CSGAHost2Device(Mat,m,n,nz,irp,ja,val,rowBlocks) & + function d_CSGAHost2Device(Mat,m,n,nz,irp,ja,val,numBlocks,rowBlocks) & & bind(c,name="d_CSGAHost2Device") result(res) use iso_c_binding - import d_CAmat - type(d_CAmat) :: Mat - integer(c_int), value :: m,n,nz + import d_CMat + type(d_CMat) :: Mat + integer(c_int), value :: m,n,nz,numBlocks integer(c_int) :: irp(*), ja(*), rowBlocks(*) - real(c_double) :: val(*) + real(c_double) :: val(*) integer(c_int) :: res end function d_CSGAHost2Device end interface interface CSGADevice2Host - function d_CSGADevice2Host(Mat,m,n,nz,irp,ja,val,rowBlocks) & + function d_CSGADevice2Host(Mat,m,n,nz,irp,ja,val,numBlocks,rowBlocks) & & bind(c,name="d_CSGADevice2Host") result(res) use iso_c_binding - import d_CAmat - type(d_CAmat) :: Mat + import d_CMat + type(d_CMat) :: Mat integer(c_int), value :: m,n,nz + integer(c_int) :: numBlocks integer(c_int) :: irp(*), ja(*), rowBlocks(*) - real(c_double) :: val(*) + real(c_double) :: val(*) integer(c_int) :: res end function d_CSGADevice2Host end interface - - + interface CSGAComputeRowBlocks + subroutine d_CSGAComputeRowBlocks(m,irp,numBlocks,rowBlocks) & + & bind(c,name="d_CSGAComputeRowBlocks") + use iso_c_binding + integer(c_int), value :: m + integer(c_int) :: numBlocks + integer(c_int) :: irp(*), rowBlocks(*) + end subroutine d_CSGAComputeRowBlocks + end interface + + interface spmvCSGADevice + function d_spmvCSGADevice(Mat,alpha,x,beta,y) & + & bind(c,name="d_spmvCSGADevice") result(res) + use iso_c_binding + import d_Cmat + type(d_Cmat) :: Mat + type(c_ptr), value :: x + type(c_ptr), value :: y + real(c_double), value :: alpha,beta + integer(c_int) :: res + end function d_spmvCSGADevice + end interface end module d_csga_mod diff --git a/cuda/fcusparse_dat.h b/cuda/fcusparse_dat.h index 4b7cd6ce..879d15c1 100644 --- a/cuda/fcusparse_dat.h +++ b/cuda/fcusparse_dat.h @@ -51,6 +51,8 @@ typedef struct T_CSRGDeviceMat TYPE *val; int *irp; int *ja; + int *rowBlocks; + int numBlocks; } T_CSRGDeviceMat; /* Interoperability: type coming from Fortran side to distinguish D/S/C/Z. */ diff --git a/cuda/fcusparse_fct.h b/cuda/fcusparse_fct.h index 12be21bd..7e757b2e 100644 --- a/cuda/fcusparse_fct.h +++ b/cuda/fcusparse_fct.h @@ -96,7 +96,6 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, #else cusparseDnVecDescr_t vecX, vecY; size_t bfsz; - if (T_CSRGIsNullMvDescr(cMat)) { cMat->spmvDescr = (cusparseSpMatDescr_t *) malloc(sizeof(cusparseSpMatDescr_t *)); } @@ -246,9 +245,13 @@ int T_CSRGDeviceAlloc(T_Cmat *Matrix,int nr, int nc, int nz) return((int) CUSPARSE_STATUS_INVALID_VALUE); if ((cMat=(T_CSRGDeviceMat *) malloc(sizeof(T_CSRGDeviceMat)))==NULL) return((int) CUSPARSE_STATUS_ALLOC_FAILED); + + cMat->numBlocks = 0; + cMat->rowBlocks = NULL; cMat->m = nr; cMat->n = nc; cMat->nz = nz; + fprintf(stderr,"Setting M %d N %d \n",cMat->m,cMat->n); if (nr1 == 0) nr1 = 1; if (nz1 == 0) nz1 = 1; if ((rc= allocRemoteBuffer(((void **) &(cMat->irp)), ((nr1+1)*sizeof(int)))) != 0) @@ -344,6 +347,8 @@ int T_CSRGDeviceFree(T_Cmat *Matrix) cMat->svbuffer=NULL; cMat->svbsize=0; #endif + if (cMat->rowBlocks != NULL) freeRemoteBuffer(cMat->rowBlocks); + cMat->numBlocks = 0; free(cMat); Matrix->mat = NULL; } diff --git a/cuda/impl/Makefile b/cuda/impl/Makefile index 12bf0747..38bff7fd 100755 --- a/cuda/impl/Makefile +++ b/cuda/impl/Makefile @@ -286,7 +286,11 @@ psb_s_cuda_hdiag_vect_mv.o \ psb_s_cuda_dnsg_mat_impl.o \ psb_d_cuda_dnsg_mat_impl.o \ psb_c_cuda_dnsg_mat_impl.o \ -psb_z_cuda_dnsg_mat_impl.o +psb_z_cuda_dnsg_mat_impl.o \ +psb_d_cuda_csga_to_gpu.o \ +psb_d_cuda_csga_from_gpu.o \ +psb_d_cuda_csga_mold.o \ +psb_d_cuda_csga_vect_mv.o objs: $(OBJS) diff --git a/cuda/psb_cuda_mod.F90 b/cuda/psb_cuda_mod.F90 index 81ce3e31..494e1bcd 100644 --- a/cuda/psb_cuda_mod.F90 +++ b/cuda/psb_cuda_mod.F90 @@ -63,6 +63,8 @@ module psb_cuda_mod use psb_c_cuda_hlg_mat_mod use psb_z_hll_mat_mod use psb_z_cuda_hlg_mat_mod + + use psb_d_cuda_csga_mat_mod use psb_s_cuda_csrg_mat_mod use psb_d_cuda_csrg_mat_mod diff --git a/cuda/psb_d_cuda_csga_mat_mod.F90 b/cuda/psb_d_cuda_csga_mat_mod.F90 index d6edf7f9..f0a7ecdf 100644 --- a/cuda/psb_d_cuda_csga_mat_mod.F90 +++ b/cuda/psb_d_cuda_csga_mat_mod.F90 @@ -41,11 +41,8 @@ module psb_d_cuda_csga_mat_mod ! ! Format for CSR Adaptive. ! - type(d_CAmat) :: deviceAMat - integer(psb_ipk_), allocatable :: rowBlocks(:) contains procedure, nopass :: get_fmt => d_cuda_csga_get_fmt - procedure, pass(a) :: sizeof => d_cuda_csga_sizeof procedure, pass(a) :: vect_mv => psb_d_cuda_csga_vect_mv !!$ procedure, pass(a) :: in_vect_sv => psb_d_cuda_csga_inner_vect_sv !!$ procedure, pass(a) :: csmm => psb_d_cuda_csga_csmm @@ -56,11 +53,11 @@ module psb_d_cuda_csga_mat_mod !!$ procedure, pass(a) :: allocate_mnnz => psb_d_cuda_csga_allocate_mnnz ! Note: we do *not* need the TO methods, because the parent type ! methods will work. - procedure, pass(a) :: cp_from_coo => psb_d_cuda_cp_csga_from_coo - procedure, pass(a) :: cp_from_fmt => psb_d_cuda_cp_csga_from_fmt - procedure, pass(a) :: mv_from_coo => psb_d_cuda_mv_csga_from_coo - procedure, pass(a) :: mv_from_fmt => psb_d_cuda_mv_csga_from_fmt - procedure, pass(a) :: free => d_cuda_csga_free +!!$ procedure, pass(a) :: cp_from_coo => psb_d_cuda_cp_csga_from_coo +!!$ procedure, pass(a) :: cp_from_fmt => psb_d_cuda_cp_csga_from_fmt +!!$ procedure, pass(a) :: mv_from_coo => psb_d_cuda_mv_csga_from_coo +!!$ procedure, pass(a) :: mv_from_fmt => psb_d_cuda_mv_csga_from_fmt +!!$ procedure, pass(a) :: free => d_cuda_csga_free procedure, pass(a) :: mold => psb_d_cuda_csga_mold !!$ procedure, pass(a) :: is_host => d_cuda_csga_is_host !!$ procedure, pass(a) :: is_dev => d_cuda_csga_is_dev @@ -74,7 +71,7 @@ module psb_d_cuda_csga_mat_mod final :: d_cuda_csga_finalize end type psb_d_cuda_csga_sparse_mat - private :: d_cuda_csga_free, d_cuda_csga_get_fmt, d_cuda_csga_sizeof + private :: d_cuda_csga_get_fmt !!$ !!$ interface @@ -145,41 +142,41 @@ module psb_d_cuda_csga_mat_mod end subroutine psb_d_cuda_csga_from_gpu end interface - interface - subroutine psb_d_cuda_cp_csga_from_coo(a,b,info) - import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_ - class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a - class(psb_d_coo_sparse_mat), intent(in) :: b - integer(psb_ipk_), intent(out) :: info - end subroutine psb_d_cuda_cp_csga_from_coo - end interface - - interface - subroutine psb_d_cuda_cp_csga_from_fmt(a,b,info) - import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_ - class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a - class(psb_d_base_sparse_mat), intent(in) :: b - integer(psb_ipk_), intent(out) :: info - end subroutine psb_d_cuda_cp_csga_from_fmt - end interface - - interface - subroutine psb_d_cuda_mv_csga_from_coo(a,b,info) - import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_ - class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a - class(psb_d_coo_sparse_mat), intent(inout) :: b - integer(psb_ipk_), intent(out) :: info - end subroutine psb_d_cuda_mv_csga_from_coo - end interface - - interface - subroutine psb_d_cuda_mv_csga_from_fmt(a,b,info) - import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_ - class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a - class(psb_d_base_sparse_mat), intent(inout) :: b - integer(psb_ipk_), intent(out) :: info - end subroutine psb_d_cuda_mv_csga_from_fmt - end interface +!!$ interface +!!$ subroutine psb_d_cuda_cp_csga_from_coo(a,b,info) +!!$ import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_ +!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a +!!$ class(psb_d_coo_sparse_mat), intent(in) :: b +!!$ integer(psb_ipk_), intent(out) :: info +!!$ end subroutine psb_d_cuda_cp_csga_from_coo +!!$ end interface +!!$ +!!$ interface +!!$ subroutine psb_d_cuda_cp_csga_from_fmt(a,b,info) +!!$ import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_ +!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a +!!$ class(psb_d_base_sparse_mat), intent(in) :: b +!!$ integer(psb_ipk_), intent(out) :: info +!!$ end subroutine psb_d_cuda_cp_csga_from_fmt +!!$ end interface +!!$ +!!$ interface +!!$ subroutine psb_d_cuda_mv_csga_from_coo(a,b,info) +!!$ import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_ +!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a +!!$ class(psb_d_coo_sparse_mat), intent(inout) :: b +!!$ integer(psb_ipk_), intent(out) :: info +!!$ end subroutine psb_d_cuda_mv_csga_from_coo +!!$ end interface +!!$ +!!$ interface +!!$ subroutine psb_d_cuda_mv_csga_from_fmt(a,b,info) +!!$ import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_ +!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a +!!$ class(psb_d_base_sparse_mat), intent(inout) :: b +!!$ integer(psb_ipk_), intent(out) :: info +!!$ end subroutine psb_d_cuda_mv_csga_from_fmt +!!$ end interface !!$ interface !!$ subroutine psb_d_cuda_csga_csmv(alpha,a,x,beta,y,info,trans) @@ -237,22 +234,6 @@ contains ! == =================================== - function d_cuda_csga_sizeof(a) result(res) - implicit none - class(psb_d_cuda_csga_sparse_mat), intent(in) :: a - integer(psb_epk_) :: res - if (a%is_dev()) call a%sync() - res = 8 - res = res + psb_sizeof_dp * size(a%val) - res = res + psb_sizeof_ip * size(a%irp) - res = res + psb_sizeof_ip * size(a%ja) - res = res + psb_sizeof_ip * size(a%rowBlocks) - ! Should we account for the shadow data structure - ! on the GPU device side? - ! res = 2*res - - end function d_cuda_csga_sizeof - function d_cuda_csga_get_fmt() result(res) implicit none character(len=5) :: res @@ -337,18 +318,18 @@ contains !!$ !!$ end subroutine d_cuda_csga_sync - subroutine d_cuda_csga_free(a) - implicit none - integer(psb_ipk_) :: info - - class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a - - info = CSGADeviceFree(a%deviceAMat) - call a%psb_d_csr_sparse_mat%free() - - return - - end subroutine d_cuda_csga_free +!!$ subroutine d_cuda_csga_free(a) +!!$ implicit none +!!$ integer(psb_ipk_) :: info +!!$ +!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a +!!$ +!!$ info = CSGADeviceFree(a%deviceMat) +!!$ call a%psb_d_csr_sparse_mat%free() +!!$ +!!$ return +!!$ +!!$ end subroutine d_cuda_csga_free subroutine d_cuda_csga_finalize(a) implicit none @@ -356,7 +337,7 @@ contains type(psb_d_cuda_csga_sparse_mat), intent(inout) :: a - info = CSGADeviceFree(a%deviceAMat) + info = CSRGDeviceFree(a%deviceMat) return diff --git a/cuda/psb_d_cuda_csrg_mat_mod.F90 b/cuda/psb_d_cuda_csrg_mat_mod.F90 index 101959bd..d6fadec2 100644 --- a/cuda/psb_d_cuda_csrg_mat_mod.F90 +++ b/cuda/psb_d_cuda_csrg_mat_mod.F90 @@ -49,6 +49,8 @@ module psb_d_cuda_csrg_mat_mod ! ! type(d_Cmat) :: deviceMat + integer(psb_ipk_), allocatable :: rowBlocks(:) + integer(psb_ipk_) :: numBlocks=0 integer(psb_ipk_) :: devstate = is_host contains @@ -252,9 +254,10 @@ contains integer(psb_epk_) :: res if (a%is_dev()) call a%sync() res = 8 - res = res + psb_sizeof_dp * size(a%val) - res = res + psb_sizeof_ip * size(a%irp) - res = res + psb_sizeof_ip * size(a%ja) + res = res + psb_sizeof_dp * psb_size(a%val) + res = res + psb_sizeof_ip * psb_size(a%irp) + res = res + psb_sizeof_ip * psb_size(a%ja) + res = res + psb_sizeof_ip * psb_size(a%rowBlocks) ! Should we account for the shadow data structure ! on the GPU device side? ! res = 2*res @@ -353,6 +356,8 @@ contains class(psb_d_cuda_csrg_sparse_mat), intent(inout) :: a info = CSRGDeviceFree(a%deviceMat) + if (allocated(a%rowBlocks)) deallocate(a%rowBlocks) + a%numBlocks=0 call a%psb_d_csr_sparse_mat%free() return diff --git a/cuda/spgpu/kernels/Makefile b/cuda/spgpu/kernels/Makefile index c4ae38a0..cf8ab939 100644 --- a/cuda/spgpu/kernels/Makefile +++ b/cuda/spgpu/kernels/Makefile @@ -20,7 +20,7 @@ OBJS=cabs.o camax.o casum.o caxpby.o caxy.o cdot.o cgath.o \ hell_sspmv.o hell_zspmv.o igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \ saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o sabgdxyz.o\ zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o zabgdxyz.o \ - sxyzw.o cxyzw.o dxyzw.o zxyzw.o + sxyzw.o cxyzw.o dxyzw.o zxyzw.o dcsgamv.o objs: $(OBJS) lib: objs diff --git a/cuda/spgpu/kernels/dcsgamv.cu b/cuda/spgpu/kernels/dcsgamv.cu new file mode 100644 index 00000000..5c229ef7 --- /dev/null +++ b/cuda/spgpu/kernels/dcsgamv.cu @@ -0,0 +1,145 @@ +#include "stdio.h" +#include "cudalang.h" +#include "cudadebug.h" +extern "C" +{ +#include "core.h" +#include "csga.h" +} + +#include "debug.h" + +//#define MAX_NNZ_PER_WG 6144 +#define MAX_NNZ_PER_WG 4096 +#define THREAD_BLOCK 1024 +#define MAX_GRID_SIZE 65536 +#define WARP_SIZE 32 + + +__device__ double warp_reduce(double val){ + for(int offset=warpSize/2; offset>0; offset/=2){ + val += __shfl_down_sync(0xffffffff,val, offset); + } + return val; +} + +__global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector, + int m, int n, int col_multivector, + int* rowBlocks, double* resultData, int baseIndex){ + __shared__ double vals[MAX_NNZ_PER_WG]; + __shared__ int cols[MAX_NNZ_PER_WG]; + + int startRow = rowBlocks[blockIdx.x]; + int stopRow = rowBlocks[blockIdx.x+1]; + long int numRows = stopRow - startRow; + int nnz = irp[stopRow]-irp[startRow]; + int tid = threadIdx.x; // indice del thread nel blocco + if (numRows > 1){ + //CSR-Stream + //printf("csr stream\n"); + + int localCol; + + for (int i = tid; i < nnz; i+= blockDim.x){ + localCol = irp[startRow]+i; + vals[i] = as[localCol]; + //vals[i] *= multivector[ja[localCol]*col_multivector+j]; + cols[i] = ja[localCol]; + } + int firstCol = irp[startRow]; + + __syncthreads(); + for (int t = tid; t < numRows*col_multivector; t += blockDim.x){ + int localRow = startRow + t/col_multivector; + int j = t%col_multivector; + double temp = 0; + for (int i = irp[localRow]-firstCol; i < irp[localRow+1]-firstCol; i++){ + temp += vals[i]*multivector[cols[i]*col_multivector + j]; + } + resultData[localRow*col_multivector +j] = temp; + } + + __syncthreads(); + + } else { + //CSR-Vector + //printf("csr vector\n"); + int warpId = tid / 32; // Global warp index + int lane = tid &(32-1); // thread index within the warp + //one warp per row + double val; + int col; + double sum[64] = {0}; + if (nnz < 4096){ + int localCol; + for (int i = tid; i < nnz; i+= blockDim.x){ + localCol = irp[startRow]+i; + vals[i] = as[localCol]; + cols[i] = ja[localCol]; + } + } + __syncthreads(); + if (warpId < col_multivector){ + for (int col_m = warpId; col_m < col_multivector; col_m +=32){ + for (int i = irp[startRow] + lane; i < irp[startRow+1]; i +=32){ + if (nnz < 4096){ + val = vals[i-irp[startRow]]; + col = cols[i-irp[startRow]]; + } else { + val = as[i]; + col = ja[i]; + } + sum[col_m] += val*multivector[col*col_multivector + col_m]; + } + sum[col_m] = warp_reduce(sum[col_m]); + if (lane == 0){ + resultData[startRow*col_multivector + col_m] = sum[col_m]; + } + } + } + } +} + + +__host__ int dCSGAMV(spgpuHandle_t handle, + double beta, + double* y, + double alpha, + const double* as, + const int* ja, + const int* irp, + int m, + int n, + int numBlocks, + const int* rowBlocks, + const double *x, + int baseIndex) +{ + int maxBForACall = max(handle->maxGridSizeX, numBlocks); + int blockX = THREAD_BLOCK; + int gridX = maxBForACall; + int rp,rows, blcks, bp, numb; + dim3 blockSize(blockX); + dim3 gridSize(gridX); + + fprintf(stderr," dcsgamv %d %d \n",numBlocks,rowBlocks[0],rowBlocks[1]); + + bp = 0; + rp = 0; + numb = numBlocks; + while (numb > maxBForACall) {//managing large vectors + blcks = maxBForACall; + rp = rowBlocks[bp]; + rows = rowBlocks[bp+blcks]-rp; + fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp); + bp += blcks; + numb -= blcks; + } + blcks = numb; + rp = rowBlocks[bp]; + rows = rowBlocks[bp+blcks]-rp; + fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp); + rp += rows; + fprintf(stderr," Final rows %d \n",rows); + return(SPGPU_SUCCESS); +} diff --git a/cuda/spgpu/kernels/ell_spmv_base_template.cuh b/cuda/spgpu/kernels/ell_spmv_base_template.cuh index fa39d8a6..964b9cd9 100644 --- a/cuda/spgpu/kernels/ell_spmv_base_template.cuh +++ b/cuda/spgpu/kernels/ell_spmv_base_template.cuh @@ -21,7 +21,9 @@ __device__ void CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_4) (int i, VALUE_TYPE yVal, int outRow, - VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch, const int* rS, int rows, const VALUE_TYPE *x, VALUE_TYPE beta, int baseIndex) + VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* cM, + const int* rP, int cMPitch, int rPPitch, const int* rS, int rows, + const VALUE_TYPE *x, VALUE_TYPE beta, int baseIndex) { VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)(); diff --git a/test/cudakern/dpdegenmv.F90 b/test/cudakern/dpdegenmv.F90 index 85059e81..7f01051c 100644 --- a/test/cudakern/dpdegenmv.F90 +++ b/test/cudakern/dpdegenmv.F90 @@ -608,6 +608,7 @@ program pdgenmv #ifdef HAVE_CUDA type(psb_d_cuda_elg_sparse_mat), target :: aelg type(psb_d_cuda_csrg_sparse_mat), target :: acsrg + type(psb_d_cuda_csga_sparse_mat), target :: acsga #if CUDA_SHORT_VERSION <= 10 type(psb_d_cuda_hybg_sparse_mat), target :: ahybg #endif @@ -719,6 +720,8 @@ program pdgenmv agmold => ahdiag case('CSRG') agmold => acsrg + case('CSGA') + agmold => acsga case('DNSG') agmold => adnsg #if CUDA_SHORT_VERSION <= 10