From 3ba2002e60c91c9dfe78c65997cea00538636076 Mon Sep 17 00:00:00 2001
From: sfilippone <filippone.salvatore@gmail.com>
Date: Fri, 24 May 2024 17:03:46 +0200
Subject: [PATCH] First attempt at running CSGA. To be debugged.

---
 cuda/Makefile                                 |   3 +-
 cuda/d_csga_mod.F90                           |  76 +++++----
 cuda/fcusparse_dat.h                          |   2 +
 cuda/fcusparse_fct.h                          |   7 +-
 cuda/impl/Makefile                            |   6 +-
 cuda/psb_cuda_mod.F90                         |   2 +
 cuda/psb_d_cuda_csga_mat_mod.F90              | 127 +++++++--------
 cuda/psb_d_cuda_csrg_mat_mod.F90              |  11 +-
 cuda/spgpu/kernels/Makefile                   |   2 +-
 cuda/spgpu/kernels/dcsgamv.cu                 | 145 ++++++++++++++++++
 cuda/spgpu/kernels/ell_spmv_base_template.cuh |   4 +-
 test/cudakern/dpdegenmv.F90                   |   3 +
 12 files changed, 267 insertions(+), 121 deletions(-)
 create mode 100644 cuda/spgpu/kernels/dcsgamv.cu

diff --git a/cuda/Makefile b/cuda/Makefile
index 395b6151..6e08eaa1 100755
--- a/cuda/Makefile
+++ b/cuda/Makefile
@@ -120,7 +120,8 @@ psb_cuda_mod.o:  psb_cuda_env_mod.o psb_i_cuda_vect_mod.o\
 	psb_d_cuda_diag_mat_mod.o  psb_d_cuda_csga_mat_mod.o\
 	psb_d_cuda_hdiag_mat_mod.o psb_s_cuda_hdiag_mat_mod.o\
 	psb_s_cuda_dnsg_mat_mod.o psb_d_cuda_dnsg_mat_mod.o \
-	psb_c_cuda_dnsg_mat_mod.o psb_z_cuda_dnsg_mat_mod.o 
+	psb_c_cuda_dnsg_mat_mod.o psb_z_cuda_dnsg_mat_mod.o  \
+        psb_d_cuda_csga_mat_mod.o
 
 iobjs: $(FOBJS)
 	$(MAKE) -C impl objs
diff --git a/cuda/d_csga_mod.F90 b/cuda/d_csga_mod.F90
index f2cfd745..e86288fe 100644
--- a/cuda/d_csga_mod.F90
+++ b/cuda/d_csga_mod.F90
@@ -33,16 +33,13 @@ module d_csga_mod
   integer, parameter :: MAX_NNZ_PER_WG = 4096
   integer, parameter :: MAX_GRID_SIZE  = 65536
   
-  type, bind(c) :: d_CAmat
-    type(c_ptr) :: Mat = c_null_ptr
-  end type d_CAmat
 
   interface CSGADeviceFree
     function d_CSGADeviceFree(Mat) &
          & bind(c,name="d_CSGADeviceFree") result(res)
       use iso_c_binding
-      import  d_CAmat
-      type(d_CAmat)  :: Mat
+      import  d_CMat
+      type(d_CMat)  :: Mat
       integer(c_int) :: res
     end function d_CSGADeviceFree
   end interface
@@ -51,62 +48,61 @@ module d_csga_mod
     function d_CSGADeviceAlloc(Mat,nr,nc,nz) &
          & bind(c,name="d_CSGADeviceAlloc") result(res)
       use iso_c_binding
-      import  d_CAmat
-      type(d_CAmat)          :: Mat
+      import  d_CMat
+      type(d_CMat)          :: Mat
       integer(c_int), value :: nr, nc, nz
       integer(c_int)        :: res
     end function d_CSGADeviceAlloc
   end interface
-  
-  interface CSGADeviceSetMatDiagType
-    function d_CSGADeviceSetMatDiagType(Mat,type) &
-         & bind(c,name="d_CSGADeviceSetMatDiagType") result(res)
-      use iso_c_binding
-      import  d_CAmat
-      type(d_CAmat)          :: Mat
-      integer(c_int),value  :: type
-      integer(c_int)        :: res
-    end function d_CSGADeviceSetMatDiagType
-  end interface
     
-  interface CSGADeviceSetMatFillMode
-    function d_CSGADeviceSetMatFillMode(Mat,type) &
-         & bind(c,name="d_CSGADeviceSetMatFillMode") result(res)
-      use iso_c_binding
-      import  d_CAmat
-      type(d_CAmat)          :: Mat
-      integer(c_int),value  :: type
-      integer(c_int)        :: res
-    end function d_CSGADeviceSetMatFillMode
-  end interface
-  
   interface CSGAHost2Device
-    function d_CSGAHost2Device(Mat,m,n,nz,irp,ja,val,rowBlocks) &
+    function d_CSGAHost2Device(Mat,m,n,nz,irp,ja,val,numBlocks,rowBlocks) &
          &  bind(c,name="d_CSGAHost2Device") result(res)
       use iso_c_binding
-      import  d_CAmat
-      type(d_CAmat)          :: Mat
-      integer(c_int), value :: m,n,nz
+      import  d_CMat
+      type(d_CMat)          :: Mat
+      integer(c_int), value :: m,n,nz,numBlocks
       integer(c_int)        :: irp(*), ja(*), rowBlocks(*)
-      real(c_double)         :: val(*)
+      real(c_double)        :: val(*)
       integer(c_int)        :: res
     end function d_CSGAHost2Device
   end interface
   
   interface CSGADevice2Host
-    function d_CSGADevice2Host(Mat,m,n,nz,irp,ja,val,rowBlocks) &
+    function d_CSGADevice2Host(Mat,m,n,nz,irp,ja,val,numBlocks,rowBlocks) &
          &  bind(c,name="d_CSGADevice2Host") result(res)
       use iso_c_binding
-      import  d_CAmat
-      type(d_CAmat)          :: Mat
+      import  d_CMat
+      type(d_CMat)          :: Mat
       integer(c_int), value :: m,n,nz
+      integer(c_int)        :: numBlocks
       integer(c_int)        :: irp(*), ja(*), rowBlocks(*)
-      real(c_double)         :: val(*)
+      real(c_double)        :: val(*)
       integer(c_int)        :: res
     end function d_CSGADevice2Host
   end interface
-  
 
-  
+  interface CSGAComputeRowBlocks
+    subroutine d_CSGAComputeRowBlocks(m,irp,numBlocks,rowBlocks) &
+         &  bind(c,name="d_CSGAComputeRowBlocks") 
+      use iso_c_binding
+      integer(c_int), value :: m
+      integer(c_int)        :: numBlocks
+      integer(c_int)        :: irp(*), rowBlocks(*)
+    end subroutine d_CSGAComputeRowBlocks
+  end interface
+ 
+  interface spmvCSGADevice
+    function d_spmvCSGADevice(Mat,alpha,x,beta,y) &
+         &  bind(c,name="d_spmvCSGADevice") result(res)
+      use iso_c_binding
+      import  d_Cmat
+      type(d_Cmat)          :: Mat
+      type(c_ptr), value    :: x
+      type(c_ptr), value    :: y
+      real(c_double), value :: alpha,beta
+      integer(c_int)        :: res
+    end function d_spmvCSGADevice
+  end interface
 
 end module d_csga_mod
diff --git a/cuda/fcusparse_dat.h b/cuda/fcusparse_dat.h
index 4b7cd6ce..879d15c1 100644
--- a/cuda/fcusparse_dat.h
+++ b/cuda/fcusparse_dat.h
@@ -51,6 +51,8 @@ typedef struct T_CSRGDeviceMat
   TYPE               *val;
   int                *irp;
   int                *ja;
+  int                *rowBlocks;
+  int                 numBlocks;
 } T_CSRGDeviceMat;
 
 /* Interoperability: type coming from Fortran side to distinguish D/S/C/Z. */
diff --git a/cuda/fcusparse_fct.h b/cuda/fcusparse_fct.h
index 12be21bd..7e757b2e 100644
--- a/cuda/fcusparse_fct.h
+++ b/cuda/fcusparse_fct.h
@@ -96,7 +96,6 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
 #else
   cusparseDnVecDescr_t vecX, vecY;
   size_t bfsz;
-
   if (T_CSRGIsNullMvDescr(cMat)) {
     cMat->spmvDescr = (cusparseSpMatDescr_t  *) malloc(sizeof(cusparseSpMatDescr_t  *));
   }
@@ -246,9 +245,13 @@ int T_CSRGDeviceAlloc(T_Cmat *Matrix,int nr, int nc, int nz)
     return((int) CUSPARSE_STATUS_INVALID_VALUE);
   if ((cMat=(T_CSRGDeviceMat *) malloc(sizeof(T_CSRGDeviceMat)))==NULL)
     return((int) CUSPARSE_STATUS_ALLOC_FAILED);
+
+  cMat->numBlocks = 0;
+  cMat->rowBlocks = NULL;
   cMat->m  = nr;
   cMat->n  = nc;
   cMat->nz = nz;
+  fprintf(stderr,"Setting M %d  N %d \n",cMat->m,cMat->n);
   if (nr1 == 0) nr1 = 1;
   if (nz1 == 0) nz1 = 1;
   if ((rc= allocRemoteBuffer(((void **) &(cMat->irp)), ((nr1+1)*sizeof(int)))) != 0)
@@ -344,6 +347,8 @@ int T_CSRGDeviceFree(T_Cmat *Matrix)
     cMat->svbuffer=NULL;
     cMat->svbsize=0;
 #endif
+    if (cMat->rowBlocks != NULL) freeRemoteBuffer(cMat->rowBlocks);
+    cMat->numBlocks = 0;
     free(cMat);
     Matrix->mat = NULL;
   }
diff --git a/cuda/impl/Makefile b/cuda/impl/Makefile
index 12bf0747..38bff7fd 100755
--- a/cuda/impl/Makefile
+++ b/cuda/impl/Makefile
@@ -286,7 +286,11 @@ psb_s_cuda_hdiag_vect_mv.o \
 psb_s_cuda_dnsg_mat_impl.o \
 psb_d_cuda_dnsg_mat_impl.o \
 psb_c_cuda_dnsg_mat_impl.o \
-psb_z_cuda_dnsg_mat_impl.o 
+psb_z_cuda_dnsg_mat_impl.o \
+psb_d_cuda_csga_to_gpu.o \
+psb_d_cuda_csga_from_gpu.o \
+psb_d_cuda_csga_mold.o \
+psb_d_cuda_csga_vect_mv.o 
 
 
 objs: $(OBJS)
diff --git a/cuda/psb_cuda_mod.F90 b/cuda/psb_cuda_mod.F90
index 81ce3e31..494e1bcd 100644
--- a/cuda/psb_cuda_mod.F90
+++ b/cuda/psb_cuda_mod.F90
@@ -63,6 +63,8 @@ module psb_cuda_mod
   use psb_c_cuda_hlg_mat_mod
   use psb_z_hll_mat_mod
   use psb_z_cuda_hlg_mat_mod
+
+  use psb_d_cuda_csga_mat_mod
   
   use psb_s_cuda_csrg_mat_mod
   use psb_d_cuda_csrg_mat_mod
diff --git a/cuda/psb_d_cuda_csga_mat_mod.F90 b/cuda/psb_d_cuda_csga_mat_mod.F90
index d6edf7f9..f0a7ecdf 100644
--- a/cuda/psb_d_cuda_csga_mat_mod.F90
+++ b/cuda/psb_d_cuda_csga_mat_mod.F90
@@ -41,11 +41,8 @@ module psb_d_cuda_csga_mat_mod
     !
     ! Format for CSR Adaptive. 
     !
-    type(d_CAmat)    :: deviceAMat
-    integer(psb_ipk_), allocatable :: rowBlocks(:)
   contains
     procedure, nopass  :: get_fmt       => d_cuda_csga_get_fmt
-    procedure, pass(a) :: sizeof        => d_cuda_csga_sizeof
     procedure, pass(a) :: vect_mv       => psb_d_cuda_csga_vect_mv
 !!$    procedure, pass(a) :: in_vect_sv    => psb_d_cuda_csga_inner_vect_sv
 !!$    procedure, pass(a) :: csmm          => psb_d_cuda_csga_csmm
@@ -56,11 +53,11 @@ module psb_d_cuda_csga_mat_mod
 !!$    procedure, pass(a) :: allocate_mnnz => psb_d_cuda_csga_allocate_mnnz
     ! Note: we do *not* need the TO methods, because the parent type
     ! methods will work. 
-    procedure, pass(a) :: cp_from_coo   => psb_d_cuda_cp_csga_from_coo
-    procedure, pass(a) :: cp_from_fmt   => psb_d_cuda_cp_csga_from_fmt
-    procedure, pass(a) :: mv_from_coo   => psb_d_cuda_mv_csga_from_coo
-    procedure, pass(a) :: mv_from_fmt   => psb_d_cuda_mv_csga_from_fmt
-    procedure, pass(a) :: free          => d_cuda_csga_free
+!!$    procedure, pass(a) :: cp_from_coo   => psb_d_cuda_cp_csga_from_coo
+!!$    procedure, pass(a) :: cp_from_fmt   => psb_d_cuda_cp_csga_from_fmt
+!!$    procedure, pass(a) :: mv_from_coo   => psb_d_cuda_mv_csga_from_coo
+!!$    procedure, pass(a) :: mv_from_fmt   => psb_d_cuda_mv_csga_from_fmt
+!!$    procedure, pass(a) :: free          => d_cuda_csga_free
     procedure, pass(a) :: mold          => psb_d_cuda_csga_mold
 !!$    procedure, pass(a) :: is_host       => d_cuda_csga_is_host
 !!$    procedure, pass(a) :: is_dev        => d_cuda_csga_is_dev
@@ -74,7 +71,7 @@ module psb_d_cuda_csga_mat_mod
     final              :: d_cuda_csga_finalize
   end type psb_d_cuda_csga_sparse_mat
 
-  private :: d_cuda_csga_free,  d_cuda_csga_get_fmt, d_cuda_csga_sizeof
+  private :: d_cuda_csga_get_fmt
 
 !!$
 !!$  interface 
@@ -145,41 +142,41 @@ module psb_d_cuda_csga_mat_mod
     end subroutine psb_d_cuda_csga_from_gpu
   end interface
 
-  interface 
-    subroutine psb_d_cuda_cp_csga_from_coo(a,b,info) 
-      import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
-      class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-      class(psb_d_coo_sparse_mat), intent(in)     :: b
-      integer(psb_ipk_), intent(out)              :: info
-    end subroutine psb_d_cuda_cp_csga_from_coo
-  end interface
-  
-  interface 
-    subroutine psb_d_cuda_cp_csga_from_fmt(a,b,info) 
-      import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
-      class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-      class(psb_d_base_sparse_mat), intent(in)    :: b
-      integer(psb_ipk_), intent(out)              :: info
-    end subroutine psb_d_cuda_cp_csga_from_fmt
-  end interface
-  
-  interface 
-    subroutine psb_d_cuda_mv_csga_from_coo(a,b,info) 
-      import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
-      class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-      class(psb_d_coo_sparse_mat), intent(inout)  :: b
-      integer(psb_ipk_), intent(out)              :: info
-    end subroutine psb_d_cuda_mv_csga_from_coo
-  end interface
-  
-  interface 
-    subroutine psb_d_cuda_mv_csga_from_fmt(a,b,info) 
-      import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
-      class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-      class(psb_d_base_sparse_mat), intent(inout) :: b
-      integer(psb_ipk_), intent(out)              :: info
-    end subroutine psb_d_cuda_mv_csga_from_fmt
-  end interface
+!!$  interface 
+!!$    subroutine psb_d_cuda_cp_csga_from_coo(a,b,info) 
+!!$      import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
+!!$      class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$      class(psb_d_coo_sparse_mat), intent(in)     :: b
+!!$      integer(psb_ipk_), intent(out)              :: info
+!!$    end subroutine psb_d_cuda_cp_csga_from_coo
+!!$  end interface
+!!$  
+!!$  interface 
+!!$    subroutine psb_d_cuda_cp_csga_from_fmt(a,b,info) 
+!!$      import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
+!!$      class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$      class(psb_d_base_sparse_mat), intent(in)    :: b
+!!$      integer(psb_ipk_), intent(out)              :: info
+!!$    end subroutine psb_d_cuda_cp_csga_from_fmt
+!!$  end interface
+!!$  
+!!$  interface 
+!!$    subroutine psb_d_cuda_mv_csga_from_coo(a,b,info) 
+!!$      import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
+!!$      class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$      class(psb_d_coo_sparse_mat), intent(inout)  :: b
+!!$      integer(psb_ipk_), intent(out)              :: info
+!!$    end subroutine psb_d_cuda_mv_csga_from_coo
+!!$  end interface
+!!$  
+!!$  interface 
+!!$    subroutine psb_d_cuda_mv_csga_from_fmt(a,b,info) 
+!!$      import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
+!!$      class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$      class(psb_d_base_sparse_mat), intent(inout) :: b
+!!$      integer(psb_ipk_), intent(out)              :: info
+!!$    end subroutine psb_d_cuda_mv_csga_from_fmt
+!!$  end interface
   
 !!$  interface 
 !!$    subroutine psb_d_cuda_csga_csmv(alpha,a,x,beta,y,info,trans) 
@@ -237,22 +234,6 @@ contains
   ! == ===================================
 
   
-  function d_cuda_csga_sizeof(a) result(res)
-    implicit none 
-    class(psb_d_cuda_csga_sparse_mat), intent(in) :: a
-    integer(psb_epk_) :: res
-    if (a%is_dev()) call a%sync()
-    res = 8 
-    res = res + psb_sizeof_dp  * size(a%val)
-    res = res + psb_sizeof_ip * size(a%irp)
-    res = res + psb_sizeof_ip * size(a%ja)
-    res = res + psb_sizeof_ip * size(a%rowBlocks)
-    ! Should we account for the shadow data structure
-    ! on the GPU device side? 
-    ! res = 2*res
-      
-  end function d_cuda_csga_sizeof
-
   function d_cuda_csga_get_fmt() result(res)
     implicit none 
     character(len=5) :: res
@@ -337,18 +318,18 @@ contains
 !!$
 !!$  end subroutine d_cuda_csga_sync
 
-  subroutine  d_cuda_csga_free(a) 
-    implicit none 
-    integer(psb_ipk_) :: info
-
-    class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-
-    info = CSGADeviceFree(a%deviceAMat)
-    call a%psb_d_csr_sparse_mat%free()
-    
-    return
-
-  end subroutine d_cuda_csga_free
+!!$  subroutine  d_cuda_csga_free(a) 
+!!$    implicit none 
+!!$    integer(psb_ipk_) :: info
+!!$
+!!$    class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$
+!!$    info = CSGADeviceFree(a%deviceMat)
+!!$    call a%psb_d_csr_sparse_mat%free()
+!!$    
+!!$    return
+!!$
+!!$  end subroutine d_cuda_csga_free
 
   subroutine  d_cuda_csga_finalize(a) 
     implicit none 
@@ -356,7 +337,7 @@ contains
     
     type(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
 
-    info = CSGADeviceFree(a%deviceAMat)
+    info = CSRGDeviceFree(a%deviceMat)
     
     return
 
diff --git a/cuda/psb_d_cuda_csrg_mat_mod.F90 b/cuda/psb_d_cuda_csrg_mat_mod.F90
index 101959bd..d6fadec2 100644
--- a/cuda/psb_d_cuda_csrg_mat_mod.F90
+++ b/cuda/psb_d_cuda_csrg_mat_mod.F90
@@ -49,6 +49,8 @@ module psb_d_cuda_csrg_mat_mod
     ! 
     ! 
     type(d_Cmat)    :: deviceMat
+    integer(psb_ipk_), allocatable :: rowBlocks(:)
+    integer(psb_ipk_) :: numBlocks=0
     integer(psb_ipk_) :: devstate  = is_host
     
   contains
@@ -252,9 +254,10 @@ contains
     integer(psb_epk_) :: res
     if (a%is_dev()) call a%sync()
     res = 8 
-    res = res + psb_sizeof_dp  * size(a%val)
-    res = res + psb_sizeof_ip * size(a%irp)
-    res = res + psb_sizeof_ip * size(a%ja)
+    res = res + psb_sizeof_dp * psb_size(a%val)
+    res = res + psb_sizeof_ip * psb_size(a%irp)
+    res = res + psb_sizeof_ip * psb_size(a%ja)
+    res = res + psb_sizeof_ip * psb_size(a%rowBlocks)
     ! Should we account for the shadow data structure
     ! on the GPU device side? 
     ! res = 2*res
@@ -353,6 +356,8 @@ contains
     class(psb_d_cuda_csrg_sparse_mat), intent(inout) :: a
 
     info = CSRGDeviceFree(a%deviceMat)
+    if (allocated(a%rowBlocks)) deallocate(a%rowBlocks)
+    a%numBlocks=0
     call a%psb_d_csr_sparse_mat%free()
     
     return
diff --git a/cuda/spgpu/kernels/Makefile b/cuda/spgpu/kernels/Makefile
index c4ae38a0..cf8ab939 100644
--- a/cuda/spgpu/kernels/Makefile
+++ b/cuda/spgpu/kernels/Makefile
@@ -20,7 +20,7 @@ OBJS=cabs.o  camax.o  casum.o  caxpby.o  caxy.o  cdot.o  cgath.o \
   hell_sspmv.o hell_zspmv.o igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
   saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o sabgdxyz.o\
   zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o zabgdxyz.o \
-  sxyzw.o cxyzw.o dxyzw.o zxyzw.o
+  sxyzw.o cxyzw.o dxyzw.o zxyzw.o  dcsgamv.o
 
 objs: $(OBJS)
 lib: objs
diff --git a/cuda/spgpu/kernels/dcsgamv.cu b/cuda/spgpu/kernels/dcsgamv.cu
new file mode 100644
index 00000000..5c229ef7
--- /dev/null
+++ b/cuda/spgpu/kernels/dcsgamv.cu
@@ -0,0 +1,145 @@
+#include "stdio.h"
+#include "cudalang.h"
+#include "cudadebug.h"
+extern "C"
+{
+#include "core.h"
+#include "csga.h"
+}
+
+#include "debug.h"
+
+//#define MAX_NNZ_PER_WG 6144
+#define MAX_NNZ_PER_WG 4096
+#define THREAD_BLOCK   1024
+#define MAX_GRID_SIZE 65536
+#define WARP_SIZE 32
+
+
+__device__ double warp_reduce(double val){
+  for(int offset=warpSize/2; offset>0; offset/=2){
+    val += __shfl_down_sync(0xffffffff,val, offset);
+  }
+  return val;
+}
+
+__global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
+			     int m, int n, int col_multivector,
+			     int* rowBlocks, double* resultData, int baseIndex){
+  __shared__ double vals[MAX_NNZ_PER_WG];
+  __shared__ int cols[MAX_NNZ_PER_WG];
+  
+  int startRow = rowBlocks[blockIdx.x];
+  int stopRow = rowBlocks[blockIdx.x+1];
+  long int numRows = stopRow - startRow;
+  int nnz = irp[stopRow]-irp[startRow];
+  int tid = threadIdx.x; // indice del thread nel blocco
+  if (numRows > 1){
+    //CSR-Stream
+    //printf("csr stream\n");
+    
+    int localCol;
+    
+    for (int i = tid; i < nnz; i+= blockDim.x){ 
+      localCol = irp[startRow]+i;
+      vals[i] = as[localCol];
+      //vals[i] *= multivector[ja[localCol]*col_multivector+j];
+      cols[i] = ja[localCol];
+    }
+    int firstCol = irp[startRow];
+    
+    __syncthreads();
+    for (int t = tid; t < numRows*col_multivector; t += blockDim.x){
+      int localRow = startRow + t/col_multivector;
+      int j = t%col_multivector;
+      double temp = 0; 
+      for (int i = irp[localRow]-firstCol; i < irp[localRow+1]-firstCol; i++){
+	temp += vals[i]*multivector[cols[i]*col_multivector + j];
+      }
+      resultData[localRow*col_multivector +j] = temp;
+    }
+    
+    __syncthreads();    
+    
+  } else {
+    //CSR-Vector
+    //printf("csr vector\n");
+    int warpId = tid / 32; // Global warp index
+    int lane = tid &(32-1); // thread index within the warp
+    //one warp per row
+    double val; 
+    int col;
+    double sum[64] = {0};   
+    if (nnz < 4096){
+      int localCol;
+      for (int i = tid; i < nnz; i+= blockDim.x){ 
+	localCol = irp[startRow]+i;
+	vals[i] = as[localCol];
+	cols[i] = ja[localCol];
+      }
+    }
+    __syncthreads();
+    if (warpId < col_multivector){
+      for (int col_m = warpId; col_m < col_multivector; col_m +=32){
+	for (int i = irp[startRow] + lane; i < irp[startRow+1]; i +=32){
+	  if (nnz < 4096){
+	    val = vals[i-irp[startRow]];
+	    col = cols[i-irp[startRow]];
+	  } else {
+	    val = as[i];
+	    col = ja[i];
+	  }
+	  sum[col_m] += val*multivector[col*col_multivector + col_m];     
+	}
+	sum[col_m] = warp_reduce(sum[col_m]);
+	if (lane == 0){
+	  resultData[startRow*col_multivector + col_m] = sum[col_m];   
+	}
+      }
+    }
+  }
+}
+
+
+__host__ int dCSGAMV(spgpuHandle_t handle, 
+	    double beta,
+	    double* y, 
+	    double alpha, 
+	    const double* as, 
+	    const int* ja,
+	    const int* irp,
+	    int m,
+	    int n,
+	    int  numBlocks,
+	    const int* rowBlocks,
+	    const double *x,
+	    int baseIndex)
+{
+  int maxBForACall = max(handle->maxGridSizeX, numBlocks);
+  int blockX = THREAD_BLOCK;
+  int gridX = maxBForACall;
+  int rp,rows, blcks, bp, numb;
+  dim3 blockSize(blockX);
+  dim3 gridSize(gridX);
+
+  fprintf(stderr," dcsgamv  %d  %d \n",numBlocks,rowBlocks[0],rowBlocks[1]);
+
+  bp = 0;
+  rp = 0;
+  numb = numBlocks;  
+  while (numb > maxBForACall) {//managing large vectors
+    blcks = maxBForACall;
+    rp = rowBlocks[bp];
+    rows = rowBlocks[bp+blcks]-rp;
+    fprintf(stderr,"  rp %d  rows %d  bp %d  \n",rp,rows,bp);
+    bp   += blcks;
+    numb -= blcks;
+  }
+  blcks = numb;
+  rp = rowBlocks[bp];
+  rows = rowBlocks[bp+blcks]-rp;
+  fprintf(stderr,"  rp %d  rows %d  bp %d  \n",rp,rows,bp);
+  rp += rows;
+  fprintf(stderr,"  Final  rows %d    \n",rows);
+  return(SPGPU_SUCCESS);
+}
diff --git a/cuda/spgpu/kernels/ell_spmv_base_template.cuh b/cuda/spgpu/kernels/ell_spmv_base_template.cuh
index fa39d8a6..964b9cd9 100644
--- a/cuda/spgpu/kernels/ell_spmv_base_template.cuh
+++ b/cuda/spgpu/kernels/ell_spmv_base_template.cuh
@@ -21,7 +21,9 @@
 __device__ void
 CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_4)
 (int i, VALUE_TYPE yVal, int outRow,
-	VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch, const int* rS, int rows, const VALUE_TYPE *x, VALUE_TYPE beta, int baseIndex)
+ VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* cM,
+ const int* rP, int cMPitch, int rPPitch, const int* rS, int rows,
+ const VALUE_TYPE *x, VALUE_TYPE beta, int baseIndex)
 {
 	VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
 
diff --git a/test/cudakern/dpdegenmv.F90 b/test/cudakern/dpdegenmv.F90
index 85059e81..7f01051c 100644
--- a/test/cudakern/dpdegenmv.F90
+++ b/test/cudakern/dpdegenmv.F90
@@ -608,6 +608,7 @@ program pdgenmv
 #ifdef HAVE_CUDA
   type(psb_d_cuda_elg_sparse_mat), target   :: aelg
   type(psb_d_cuda_csrg_sparse_mat), target  :: acsrg
+  type(psb_d_cuda_csga_sparse_mat), target  :: acsga
 #if CUDA_SHORT_VERSION <= 10
   type(psb_d_cuda_hybg_sparse_mat), target  :: ahybg
 #endif
@@ -719,6 +720,8 @@ program pdgenmv
     agmold => ahdiag
   case('CSRG')
     agmold => acsrg
+  case('CSGA')
+    agmold => acsga
   case('DNSG')
     agmold => adnsg
 #if CUDA_SHORT_VERSION <= 10