First attempt at running CSGA. To be debugged.

repack-csga
sfilippone 8 months ago
parent c35b3b9ef3
commit 3ba2002e60

@@ -120,7 +120,8 @@ psb_cuda_mod.o: psb_cuda_env_mod.o psb_i_cuda_vect_mod.o\
 psb_d_cuda_diag_mat_mod.o psb_d_cuda_csga_mat_mod.o\
 psb_d_cuda_hdiag_mat_mod.o psb_s_cuda_hdiag_mat_mod.o\
 psb_s_cuda_dnsg_mat_mod.o psb_d_cuda_dnsg_mat_mod.o \
-psb_c_cuda_dnsg_mat_mod.o psb_z_cuda_dnsg_mat_mod.o
+psb_c_cuda_dnsg_mat_mod.o psb_z_cuda_dnsg_mat_mod.o \
+psb_d_cuda_csga_mat_mod.o
 iobjs: $(FOBJS)
 	$(MAKE) -C impl objs

@@ -33,16 +33,13 @@ module d_csga_mod
 integer, parameter :: MAX_NNZ_PER_WG = 4096
 integer, parameter :: MAX_GRID_SIZE = 65536
-type, bind(c) :: d_CAmat
-type(c_ptr) :: Mat = c_null_ptr
-end type d_CAmat
 interface CSGADeviceFree
 function d_CSGADeviceFree(Mat) &
 & bind(c,name="d_CSGADeviceFree") result(res)
 use iso_c_binding
-import d_CAmat
-type(d_CAmat) :: Mat
+import d_CMat
+type(d_CMat) :: Mat
 integer(c_int) :: res
 end function d_CSGADeviceFree
 end interface
@@ -51,62 +48,61 @@ module d_csga_mod
 function d_CSGADeviceAlloc(Mat,nr,nc,nz) &
 & bind(c,name="d_CSGADeviceAlloc") result(res)
 use iso_c_binding
-import d_CAmat
-type(d_CAmat) :: Mat
+import d_CMat
+type(d_CMat) :: Mat
 integer(c_int), value :: nr, nc, nz
 integer(c_int) :: res
 end function d_CSGADeviceAlloc
 end interface
 interface CSGADeviceSetMatDiagType
 function d_CSGADeviceSetMatDiagType(Mat,type) &
 & bind(c,name="d_CSGADeviceSetMatDiagType") result(res)
 use iso_c_binding
 import d_CAmat
 type(d_CAmat) :: Mat
 integer(c_int),value :: type
 integer(c_int) :: res
 end function d_CSGADeviceSetMatDiagType
 end interface
 interface CSGADeviceSetMatFillMode
 function d_CSGADeviceSetMatFillMode(Mat,type) &
 & bind(c,name="d_CSGADeviceSetMatFillMode") result(res)
 use iso_c_binding
 import d_CAmat
 type(d_CAmat) :: Mat
 integer(c_int),value :: type
 integer(c_int) :: res
 end function d_CSGADeviceSetMatFillMode
 end interface
 interface CSGAHost2Device
-function d_CSGAHost2Device(Mat,m,n,nz,irp,ja,val,rowBlocks) &
+function d_CSGAHost2Device(Mat,m,n,nz,irp,ja,val,numBlocks,rowBlocks) &
 & bind(c,name="d_CSGAHost2Device") result(res)
 use iso_c_binding
-import d_CAmat
-type(d_CAmat) :: Mat
-integer(c_int), value :: m,n,nz
+import d_CMat
+type(d_CMat) :: Mat
+integer(c_int), value :: m,n,nz,numBlocks
 integer(c_int) :: irp(*), ja(*), rowBlocks(*)
 real(c_double) :: val(*)
 integer(c_int) :: res
 end function d_CSGAHost2Device
 end interface
 interface CSGADevice2Host
-function d_CSGADevice2Host(Mat,m,n,nz,irp,ja,val,rowBlocks) &
+function d_CSGADevice2Host(Mat,m,n,nz,irp,ja,val,numBlocks,rowBlocks) &
 & bind(c,name="d_CSGADevice2Host") result(res)
 use iso_c_binding
-import d_CAmat
-type(d_CAmat) :: Mat
+import d_CMat
+type(d_CMat) :: Mat
 integer(c_int), value :: m,n,nz
+integer(c_int) :: numBlocks
 integer(c_int) :: irp(*), ja(*), rowBlocks(*)
 real(c_double) :: val(*)
 integer(c_int) :: res
 end function d_CSGADevice2Host
 end interface
 interface CSGAComputeRowBlocks
 subroutine d_CSGAComputeRowBlocks(m,irp,numBlocks,rowBlocks) &
 & bind(c,name="d_CSGAComputeRowBlocks")
 use iso_c_binding
 integer(c_int), value :: m
 integer(c_int) :: numBlocks
 integer(c_int) :: irp(*), rowBlocks(*)
 end subroutine d_CSGAComputeRowBlocks
 end interface
 interface spmvCSGADevice
 function d_spmvCSGADevice(Mat,alpha,x,beta,y) &
 & bind(c,name="d_spmvCSGADevice") result(res)
 use iso_c_binding
 import d_Cmat
 type(d_Cmat) :: Mat
 type(c_ptr), value :: x
 type(c_ptr), value :: y
 real(c_double), value :: alpha,beta
 integer(c_int) :: res
 end function d_spmvCSGADevice
 end interface
 end module d_csga_mod
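For reference, the C entry points these bind(c) interfaces assume would look roughly as follows. This is a sketch reconstructed from the interface block above, not the actual C header (which is not part of this commit); the layout of d_CMat as a single opaque pointer is an assumption.

/* Hypothetical C-side declarations matching the bind(c) names above. */
typedef struct d_CMat { void *mat; } d_CMat;

int  d_CSGADeviceFree(d_CMat *Mat);
int  d_CSGADeviceAlloc(d_CMat *Mat, int nr, int nc, int nz);
int  d_CSGAHost2Device(d_CMat *Mat, int m, int n, int nz,
                       int *irp, int *ja, double *val,
                       int numBlocks, int *rowBlocks);
int  d_CSGADevice2Host(d_CMat *Mat, int m, int n, int nz,
                       int *irp, int *ja, double *val,
                       int *numBlocks, int *rowBlocks);
void d_CSGAComputeRowBlocks(int m, int *irp, int *numBlocks, int *rowBlocks);
int  d_spmvCSGADevice(d_CMat *Mat, double alpha, void *x, double beta, void *y);

Note how numBlocks goes by value into d_CSGAHost2Device but by reference out of d_CSGADevice2Host and d_CSGAComputeRowBlocks, mirroring the value attributes in the Fortran interfaces.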

@@ -51,6 +51,8 @@ typedef struct T_CSRGDeviceMat
 TYPE *val;
 int *irp;
 int *ja;
+int *rowBlocks;
+int numBlocks;
 } T_CSRGDeviceMat;
 /* Interoperability: type coming from Fortran side to distinguish D/S/C/Z. */
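The rowBlocks/numBlocks fields added here carry the CSR-Adaptive partition consumed by the spmv kernel: rowBlocks[b] and rowBlocks[b+1] delimit the rows assigned to thread block b, cut so that no chunk holds more than MAX_NNZ_PER_WG nonzeros. A minimal host-side sketch of how such a partition can be built (the helper name is hypothetical and 0-based irp is assumed; d_CSGAComputeRowBlocks itself is not shown in this diff):

/* Walk irp and emit a cut whenever the running nonzero count would
 * exceed MAX_NNZ_PER_WG. irp has m+1 entries; rowBlocks must have room
 * for m+1 entries in the worst case (one row per block). */
#define MAX_NNZ_PER_WG 4096
static void csga_compute_row_blocks(int m, const int *irp,
                                    int *numBlocks, int *rowBlocks)
{
  int nb = 0, start = 0;
  rowBlocks[nb++] = 0;
  for (int i = 1; i <= m; i++) {
    if (irp[i] - irp[start] > MAX_NNZ_PER_WG) {
      /* close the block at row i-1; a single row over budget
       * becomes a block of its own (handled by CSR-Vector). */
      int cut = (i - 1 > start) ? (i - 1) : i;
      rowBlocks[nb++] = cut;
      start = cut;
      i = cut;
    }
  }
  if (rowBlocks[nb - 1] != m) rowBlocks[nb++] = m;
  *numBlocks = nb - 1;
}

A block spanning several rows takes the kernel's CSR-Stream path; a block holding one oversized row takes the CSR-Vector path.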

@@ -96,7 +96,6 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
 #else
 cusparseDnVecDescr_t vecX, vecY;
 size_t bfsz;
 if (T_CSRGIsNullMvDescr(cMat)) {
 cMat->spmvDescr = (cusparseSpMatDescr_t *) malloc(sizeof(cusparseSpMatDescr_t *));
 }
@@ -246,9 +245,13 @@ int T_CSRGDeviceAlloc(T_Cmat *Matrix,int nr, int nc, int nz)
 return((int) CUSPARSE_STATUS_INVALID_VALUE);
 if ((cMat=(T_CSRGDeviceMat *) malloc(sizeof(T_CSRGDeviceMat)))==NULL)
 return((int) CUSPARSE_STATUS_ALLOC_FAILED);
+cMat->numBlocks = 0;
+cMat->rowBlocks = NULL;
 cMat->m = nr;
 cMat->n = nc;
 cMat->nz = nz;
+fprintf(stderr,"Setting M %d N %d \n",cMat->m,cMat->n);
 if (nr1 == 0) nr1 = 1;
 if (nz1 == 0) nz1 = 1;
 if ((rc= allocRemoteBuffer(((void **) &(cMat->irp)), ((nr1+1)*sizeof(int)))) != 0)
@@ -344,6 +347,8 @@ int T_CSRGDeviceFree(T_Cmat *Matrix)
 cMat->svbuffer=NULL;
 cMat->svbsize=0;
 #endif
+if (cMat->rowBlocks != NULL) freeRemoteBuffer(cMat->rowBlocks);
+cMat->numBlocks = 0;
 free(cMat);
 Matrix->mat = NULL;
 }

@@ -286,7 +286,11 @@ psb_s_cuda_hdiag_vect_mv.o \
 psb_s_cuda_dnsg_mat_impl.o \
 psb_d_cuda_dnsg_mat_impl.o \
 psb_c_cuda_dnsg_mat_impl.o \
-psb_z_cuda_dnsg_mat_impl.o
+psb_z_cuda_dnsg_mat_impl.o \
+psb_d_cuda_csga_to_gpu.o \
+psb_d_cuda_csga_from_gpu.o \
+psb_d_cuda_csga_mold.o \
+psb_d_cuda_csga_vect_mv.o
 objs: $(OBJS)

@@ -63,6 +63,8 @@ module psb_cuda_mod
 use psb_c_cuda_hlg_mat_mod
 use psb_z_hll_mat_mod
 use psb_z_cuda_hlg_mat_mod
+use psb_d_cuda_csga_mat_mod
 use psb_s_cuda_csrg_mat_mod
 use psb_d_cuda_csrg_mat_mod

@@ -41,11 +41,8 @@ module psb_d_cuda_csga_mat_mod
 !
 ! Format for CSR Adaptive.
 !
-type(d_CAmat) :: deviceAMat
-integer(psb_ipk_), allocatable :: rowBlocks(:)
 contains
 procedure, nopass :: get_fmt => d_cuda_csga_get_fmt
-procedure, pass(a) :: sizeof => d_cuda_csga_sizeof
 procedure, pass(a) :: vect_mv => psb_d_cuda_csga_vect_mv
 !!$ procedure, pass(a) :: in_vect_sv => psb_d_cuda_csga_inner_vect_sv
 !!$ procedure, pass(a) :: csmm => psb_d_cuda_csga_csmm
@@ -56,11 +53,11 @@ module psb_d_cuda_csga_mat_mod
 !!$ procedure, pass(a) :: allocate_mnnz => psb_d_cuda_csga_allocate_mnnz
 ! Note: we do *not* need the TO methods, because the parent type
 ! methods will work.
-procedure, pass(a) :: cp_from_coo => psb_d_cuda_cp_csga_from_coo
-procedure, pass(a) :: cp_from_fmt => psb_d_cuda_cp_csga_from_fmt
-procedure, pass(a) :: mv_from_coo => psb_d_cuda_mv_csga_from_coo
-procedure, pass(a) :: mv_from_fmt => psb_d_cuda_mv_csga_from_fmt
-procedure, pass(a) :: free => d_cuda_csga_free
+!!$ procedure, pass(a) :: cp_from_coo => psb_d_cuda_cp_csga_from_coo
+!!$ procedure, pass(a) :: cp_from_fmt => psb_d_cuda_cp_csga_from_fmt
+!!$ procedure, pass(a) :: mv_from_coo => psb_d_cuda_mv_csga_from_coo
+!!$ procedure, pass(a) :: mv_from_fmt => psb_d_cuda_mv_csga_from_fmt
+!!$ procedure, pass(a) :: free => d_cuda_csga_free
 procedure, pass(a) :: mold => psb_d_cuda_csga_mold
 !!$ procedure, pass(a) :: is_host => d_cuda_csga_is_host
 !!$ procedure, pass(a) :: is_dev => d_cuda_csga_is_dev
@@ -74,7 +71,7 @@ module psb_d_cuda_csga_mat_mod
 final :: d_cuda_csga_finalize
 end type psb_d_cuda_csga_sparse_mat
-private :: d_cuda_csga_free, d_cuda_csga_get_fmt, d_cuda_csga_sizeof
+private :: d_cuda_csga_get_fmt
 !!$
 !!$ interface
@@ -145,41 +142,41 @@ module psb_d_cuda_csga_mat_mod
 end subroutine psb_d_cuda_csga_from_gpu
 end interface
-interface
-subroutine psb_d_cuda_cp_csga_from_coo(a,b,info)
-import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
-class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-class(psb_d_coo_sparse_mat), intent(in) :: b
-integer(psb_ipk_), intent(out) :: info
-end subroutine psb_d_cuda_cp_csga_from_coo
-end interface
-interface
-subroutine psb_d_cuda_cp_csga_from_fmt(a,b,info)
-import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
-class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-class(psb_d_base_sparse_mat), intent(in) :: b
-integer(psb_ipk_), intent(out) :: info
-end subroutine psb_d_cuda_cp_csga_from_fmt
-end interface
-interface
-subroutine psb_d_cuda_mv_csga_from_coo(a,b,info)
-import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
-class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-class(psb_d_coo_sparse_mat), intent(inout) :: b
-integer(psb_ipk_), intent(out) :: info
-end subroutine psb_d_cuda_mv_csga_from_coo
-end interface
-interface
-subroutine psb_d_cuda_mv_csga_from_fmt(a,b,info)
-import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
-class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-class(psb_d_base_sparse_mat), intent(inout) :: b
-integer(psb_ipk_), intent(out) :: info
-end subroutine psb_d_cuda_mv_csga_from_fmt
-end interface
+!!$ interface
+!!$ subroutine psb_d_cuda_cp_csga_from_coo(a,b,info)
+!!$ import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
+!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$ class(psb_d_coo_sparse_mat), intent(in) :: b
+!!$ integer(psb_ipk_), intent(out) :: info
+!!$ end subroutine psb_d_cuda_cp_csga_from_coo
+!!$ end interface
+!!$
+!!$ interface
+!!$ subroutine psb_d_cuda_cp_csga_from_fmt(a,b,info)
+!!$ import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
+!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$ class(psb_d_base_sparse_mat), intent(in) :: b
+!!$ integer(psb_ipk_), intent(out) :: info
+!!$ end subroutine psb_d_cuda_cp_csga_from_fmt
+!!$ end interface
+!!$
+!!$ interface
+!!$ subroutine psb_d_cuda_mv_csga_from_coo(a,b,info)
+!!$ import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
+!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$ class(psb_d_coo_sparse_mat), intent(inout) :: b
+!!$ integer(psb_ipk_), intent(out) :: info
+!!$ end subroutine psb_d_cuda_mv_csga_from_coo
+!!$ end interface
+!!$
+!!$ interface
+!!$ subroutine psb_d_cuda_mv_csga_from_fmt(a,b,info)
+!!$ import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
+!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$ class(psb_d_base_sparse_mat), intent(inout) :: b
+!!$ integer(psb_ipk_), intent(out) :: info
+!!$ end subroutine psb_d_cuda_mv_csga_from_fmt
+!!$ end interface
 !!$ interface
 !!$ subroutine psb_d_cuda_csga_csmv(alpha,a,x,beta,y,info,trans)
@@ -237,22 +234,6 @@ contains
 ! == ===================================
-function d_cuda_csga_sizeof(a) result(res)
-implicit none
-class(psb_d_cuda_csga_sparse_mat), intent(in) :: a
-integer(psb_epk_) :: res
-if (a%is_dev()) call a%sync()
-res = 8
-res = res + psb_sizeof_dp * size(a%val)
-res = res + psb_sizeof_ip * size(a%irp)
-res = res + psb_sizeof_ip * size(a%ja)
-res = res + psb_sizeof_ip * size(a%rowBlocks)
-! Should we account for the shadow data structure
-! on the GPU device side?
-! res = 2*res
-end function d_cuda_csga_sizeof
 function d_cuda_csga_get_fmt() result(res)
 implicit none
 character(len=5) :: res
@@ -337,18 +318,18 @@ contains
 !!$
 !!$ end subroutine d_cuda_csga_sync
-subroutine d_cuda_csga_free(a)
-implicit none
-integer(psb_ipk_) :: info
-class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-info = CSGADeviceFree(a%deviceAMat)
-call a%psb_d_csr_sparse_mat%free()
-return
-end subroutine d_cuda_csga_free
+!!$ subroutine d_cuda_csga_free(a)
+!!$ implicit none
+!!$ integer(psb_ipk_) :: info
+!!$
+!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$
+!!$ info = CSGADeviceFree(a%deviceMat)
+!!$ call a%psb_d_csr_sparse_mat%free()
+!!$
+!!$ return
+!!$
+!!$ end subroutine d_cuda_csga_free
 subroutine d_cuda_csga_finalize(a)
 implicit none
@@ -356,7 +337,7 @@ contains
 type(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-info = CSGADeviceFree(a%deviceAMat)
+info = CSRGDeviceFree(a%deviceMat)
 return

@@ -49,6 +49,8 @@ module psb_d_cuda_csrg_mat_mod
 !
 !
 type(d_Cmat) :: deviceMat
+integer(psb_ipk_), allocatable :: rowBlocks(:)
+integer(psb_ipk_) :: numBlocks=0
 integer(psb_ipk_) :: devstate = is_host
 contains
@@ -252,9 +254,10 @@ contains
 integer(psb_epk_) :: res
 if (a%is_dev()) call a%sync()
 res = 8
-res = res + psb_sizeof_dp * size(a%val)
-res = res + psb_sizeof_ip * size(a%irp)
-res = res + psb_sizeof_ip * size(a%ja)
+res = res + psb_sizeof_dp * psb_size(a%val)
+res = res + psb_sizeof_ip * psb_size(a%irp)
+res = res + psb_sizeof_ip * psb_size(a%ja)
+res = res + psb_sizeof_ip * psb_size(a%rowBlocks)
 ! Should we account for the shadow data structure
 ! on the GPU device side?
 ! res = 2*res
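The switch from the size intrinsic to psb_size above is not cosmetic: rowBlocks is allocatable and may still be unallocated when sizeof is queried, and size is not defined on an unallocated array, whereas psb_size (judging from its use elsewhere in PSBLAS) presumably returns 0 in that case.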
@@ -353,6 +356,8 @@ contains
 class(psb_d_cuda_csrg_sparse_mat), intent(inout) :: a
 info = CSRGDeviceFree(a%deviceMat)
+if (allocated(a%rowBlocks)) deallocate(a%rowBlocks)
+a%numBlocks=0
 call a%psb_d_csr_sparse_mat%free()
 return

@@ -20,7 +20,7 @@ OBJS=cabs.o camax.o casum.o caxpby.o caxy.o cdot.o cgath.o \
 hell_sspmv.o hell_zspmv.o igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
 saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o sabgdxyz.o\
 zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o zabgdxyz.o \
-sxyzw.o cxyzw.o dxyzw.o zxyzw.o
+sxyzw.o cxyzw.o dxyzw.o zxyzw.o dcsgamv.o
 objs: $(OBJS)
 lib: objs

@@ -0,0 +1,145 @@
+#include "stdio.h"
+#include "cudalang.h"
+#include "cudadebug.h"
+extern "C"
+{
+#include "core.h"
+#include "csga.h"
+}
+#include "debug.h"
+//#define MAX_NNZ_PER_WG 6144
+#define MAX_NNZ_PER_WG 4096
+#define THREAD_BLOCK 1024
+#define MAX_GRID_SIZE 65536
+#define WARP_SIZE 32
+__device__ double warp_reduce(double val){
+  // butterfly reduction with shuffles: lane 0 ends up holding the warp total
+  for(int offset=warpSize/2; offset>0; offset/=2){
+    val += __shfl_down_sync(0xffffffff,val, offset);
+  }
+  return val;
+}
+__global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
+                             int m, int n, int col_multivector,
+                             int* rowBlocks, double* resultData, int baseIndex){
+  __shared__ double vals[MAX_NNZ_PER_WG];
+  __shared__ int cols[MAX_NNZ_PER_WG];
+  int startRow = rowBlocks[blockIdx.x];
+  int stopRow = rowBlocks[blockIdx.x+1];
+  long int numRows = stopRow - startRow;
+  int nnz = irp[stopRow]-irp[startRow];
+  int tid = threadIdx.x; // thread index within the block
+  if (numRows > 1){
+    //CSR-Stream: the block covers several rows; stage their nonzeros in shared memory
+    int localCol;
+    for (int i = tid; i < nnz; i+= blockDim.x){
+      localCol = irp[startRow]+i;
+      vals[i] = as[localCol];
+      //vals[i] *= multivector[ja[localCol]*col_multivector+j];
+      cols[i] = ja[localCol];
+    }
+    int firstCol = irp[startRow];
+    __syncthreads();
+    for (int t = tid; t < numRows*col_multivector; t += blockDim.x){
+      int localRow = startRow + t/col_multivector;
+      int j = t%col_multivector;
+      double temp = 0;
+      for (int i = irp[localRow]-firstCol; i < irp[localRow+1]-firstCol; i++){
+        temp += vals[i]*multivector[cols[i]*col_multivector + j];
+      }
+      resultData[localRow*col_multivector +j] = temp;
+    }
+    __syncthreads();
+  } else {
+    //CSR-Vector: the block holds a single (long) row
+    int warpId = tid / 32; // warp index within the block
+    int lane = tid &(32-1); // thread index within the warp
+    //one warp per column of the multivector
+    double val;
+    int col;
+    double sum[64] = {0}; // assumes col_multivector <= 64
+    if (nnz < MAX_NNZ_PER_WG){
+      int localCol;
+      for (int i = tid; i < nnz; i+= blockDim.x){
+        localCol = irp[startRow]+i;
+        vals[i] = as[localCol];
+        cols[i] = ja[localCol];
+      }
+    }
+    __syncthreads();
+    if (warpId < col_multivector){
+      for (int col_m = warpId; col_m < col_multivector; col_m +=32){
+        for (int i = irp[startRow] + lane; i < irp[startRow+1]; i +=32){
+          if (nnz < MAX_NNZ_PER_WG){
+            val = vals[i-irp[startRow]];
+            col = cols[i-irp[startRow]];
+          } else {
+            val = as[i];
+            col = ja[i];
+          }
+          sum[col_m] += val*multivector[col*col_multivector + col_m];
+        }
+        sum[col_m] = warp_reduce(sum[col_m]);
+        if (lane == 0){
+          resultData[startRow*col_multivector + col_m] = sum[col_m];
+        }
+      }
+    }
+  }
+}
+__host__ int dCSGAMV(spgpuHandle_t handle,
+                     double beta,
+                     double* y,
+                     double alpha,
+                     const double* as,
+                     const int* ja,
+                     const int* irp,
+                     int m,
+                     int n,
+                     int numBlocks,
+                     const int* rowBlocks,
+                     const double *x,
+                     int baseIndex)
+{
+  int maxBForACall = min(handle->maxGridSizeX, numBlocks); // blocks per call, capped by the grid-size limit
+  int blockX = THREAD_BLOCK;
+  int gridX = maxBForACall;
+  int rp, rows, blcks, bp, numb;
+  dim3 blockSize(blockX);
+  dim3 gridSize(gridX);
+  fprintf(stderr," dcsgamv %d %d %d \n",numBlocks,rowBlocks[0],rowBlocks[1]);
+  bp = 0;
+  rp = 0;
+  numb = numBlocks;
+  while (numb > maxBForACall) { // split into multiple calls when there are more row-blocks than the grid allows
+    blcks = maxBForACall;
+    rp = rowBlocks[bp];
+    rows = rowBlocks[bp+blcks]-rp;
+    fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp);
+    bp += blcks;
+    numb -= blcks;
+  }
+  blcks = numb;
+  rp = rowBlocks[bp];
+  rows = rowBlocks[bp+blcks]-rp;
+  fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp);
+  rp += rows;
+  fprintf(stderr," Final rows %d \n",rows);
+  // no kernel launch yet: this first attempt only traces the chunk boundaries
+  return(SPGPU_SUCCESS);
+}
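As committed, dCSGAMV only computes and prints the chunk boundaries: no kernel is launched, and dCSGAmvINNER ignores alpha and beta, consistent with the "To be debugged" commit message. Each chunk would presumably turn into a launch along these lines (a sketch only: the currentStream field and col_multivector = 1 for a single right-hand side are assumptions):

/* hypothetical per-chunk launch: one thread block per row-block,
 * starting at rowBlocks[bp]; y is overwritten, alpha/beta handling TBD */
dim3 grid(blcks), block(THREAD_BLOCK);
dCSGAmvINNER<<<grid, block, 0, handle->currentStream>>>(
    (double *)as, (int *)ja, (int *)irp, (double *)x,
    m, n, 1, (int *)rowBlocks + bp, y, baseIndex);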

@@ -21,7 +21,9 @@
 __device__ void
 CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_4)
 (int i, VALUE_TYPE yVal, int outRow,
-VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch, const int* rS, int rows, const VALUE_TYPE *x, VALUE_TYPE beta, int baseIndex)
+VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* cM,
+const int* rP, int cMPitch, int rPPitch, const int* rS, int rows,
+const VALUE_TYPE *x, VALUE_TYPE beta, int baseIndex)
 {
 VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();

@@ -608,6 +608,7 @@ program pdgenmv
 #ifdef HAVE_CUDA
 type(psb_d_cuda_elg_sparse_mat), target :: aelg
 type(psb_d_cuda_csrg_sparse_mat), target :: acsrg
+type(psb_d_cuda_csga_sparse_mat), target :: acsga
 #if CUDA_SHORT_VERSION <= 10
 type(psb_d_cuda_hybg_sparse_mat), target :: ahybg
 #endif
@@ -719,6 +720,8 @@ program pdgenmv
 agmold => ahdiag
 case('CSRG')
 agmold => acsrg
+case('CSGA')
+agmold => acsga
 case('DNSG')
 agmold => adnsg
 #if CUDA_SHORT_VERSION <= 10
