ELG SpMM (not compiling)

11 months ago · 6b8199f84b
parent 9daa04c3dc
commit 6b8199f84b
23 changed files with 1076 additions and 31 deletions
--- a/cuda/elldev.c
+++ b/cuda/elldev.c
@ -148,6 +148,9 @@ int FallocEllDevice(void** deviceMat,unsigned int rows, unsigned int maxRowSize,
  return(i);
 }

+//
+// Single Precision Float
+//
 void sspmdmm_gpu(float *z,int s, int vPitch, float *y, float alpha, float* cM, int* rP, int* rS, 
 		 int avgRowSize, int maxRowSize, int rows, int pitch, float *x, float beta, int firstIndex)
 {
@ -168,7 +171,7 @@ void sspmdmm_gpu(float *z,int s, int vPitch, float *y, float alpha, float* cM, i
      x += vPitch;		
    }
 }
-//new
+
 int spmvEllDeviceFloat(void *deviceMat, float alpha, void* deviceX, 
 		       float beta, void* deviceY)
 { int i=SPGPU_SUCCESS;
@ -191,7 +194,31 @@ int spmvEllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
  return(i);
 }

+/*
+ * spmmEllDeviceFloat: single precision product of an ELL device matrix
+ * with a multivector.
+ *
+ * Forwards to spgpuSellspmm, passing the storage of y both as the output
+ * and as the y input operand, so y is updated in place. The number of
+ * columns handed to the kernel wrapper is y->count_.
+ *
+ *   deviceMat  points to a struct EllDevice
+ *   deviceX    points to the struct MultiVectDevice holding x
+ *   deviceY    points to the struct MultiVectDevice holding y (overwritten)
+ *
+ * Nothing is validated here. The checks that used to sit in a disabled
+ * VERBOSE block were meant to be: x and y hold the same number of vectors,
+ * x->size_ >= devMat->columns and y->size_ >= devMat->rows.
+ *
+ * Always returns SPGPU_SUCCESS.
+ */
+int spmmEllDeviceFloat(void *deviceMat, float alpha, void* deviceX, 
+			float beta, void* deviceY)
+{
+  spgpuHandle_t spHandle = psb_cudaGetHandle();
+  struct EllDevice *ell = (struct EllDevice *) deviceMat;
+  struct MultiVectDevice *xIn = (struct MultiVectDevice *) deviceX;
+  struct MultiVectDevice *yOut = (struct MultiVectDevice *) deviceY;
+  float *xData = (float *) xIn->v_;
+  float *yData = (float *) yOut->v_;
+
+  /* Output and y input are the same buffer: in-place update of y. */
+  spgpuSellspmm(spHandle, yOut->count_,
+                yData, yOut->pitch_,
+                yData, yOut->pitch_,
+                alpha,
+                (float *) ell->cM, ell->rP,
+                ell->cMPitch, ell->rPPitch,
+                ell->rS, NULL,
+                ell->avgRowSize, ell->maxRowSize, ell->rows,
+                xData, xIn->pitch_,
+                beta, ell->baseIndex);
+
+  return SPGPU_SUCCESS;
+}

+//
+// Double Precision
+//
 void
 dspmdmm_gpu (double *z,int s, int vPitch, double *y, double alpha, double* cM, int* rP,
 	     int* rS, int avgRowSize, int maxRowSize, int rows, int pitch, 
@ -237,6 +264,31 @@ int spmvEllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
  return SPGPU_SUCCESS;
 }

+/*
+ * spmmEllDeviceDouble: double precision product of an ELL device matrix
+ * with a multivector.
+ *
+ * Forwards to spgpuDellspmm, passing the storage of y both as the output
+ * and as the y input operand, so y is updated in place. The number of
+ * columns handed to the kernel wrapper is y->count_.
+ *
+ *   deviceMat  points to a struct EllDevice
+ *   deviceX    points to the struct MultiVectDevice holding x
+ *   deviceY    points to the struct MultiVectDevice holding y (overwritten)
+ *
+ * Nothing is validated here. The checks that used to sit in a disabled
+ * VERBOSE block were meant to be: x and y hold the same number of vectors,
+ * x->size_ >= devMat->columns and y->size_ >= devMat->rows.
+ *
+ * Always returns SPGPU_SUCCESS.
+ */
+int spmmEllDeviceDouble(void *deviceMat, double alpha, void* deviceX, 
+			double beta, void* deviceY)
+{
+  spgpuHandle_t spHandle = psb_cudaGetHandle();
+  struct EllDevice *ell = (struct EllDevice *) deviceMat;
+  struct MultiVectDevice *xIn = (struct MultiVectDevice *) deviceX;
+  struct MultiVectDevice *yOut = (struct MultiVectDevice *) deviceY;
+  double *xData = (double *) xIn->v_;
+  double *yData = (double *) yOut->v_;
+
+  /* Output and y input are the same buffer: in-place update of y. */
+  spgpuDellspmm(spHandle, yOut->count_,
+                yData, yOut->pitch_,
+                yData, yOut->pitch_,
+                alpha,
+                (double *) ell->cM, ell->rP,
+                ell->cMPitch, ell->rPPitch,
+                ell->rS, NULL,
+                ell->avgRowSize, ell->maxRowSize, ell->rows,
+                xData, xIn->pitch_,
+                beta, ell->baseIndex);
+
+  return SPGPU_SUCCESS;
+}
+
+//
+// Single Precision Float Complex
+//
 void
 cspmdmm_gpu (cuFloatComplex *z, int s, int vPitch, cuFloatComplex *y,  
 	     cuFloatComplex alpha, cuFloatComplex* cM,
@ -276,6 +328,31 @@ int spmvEllDeviceFloatComplex(void *deviceMat, float complex alpha, void* device
  return SPGPU_SUCCESS;
 }

+/*
+ * spmmEllDeviceFloatComplex: single precision complex product of an ELL
+ * device matrix with a multivector.
+ *
+ * Forwards to spgpuCellspmm, passing the storage of y both as the output
+ * and as the y input operand, so y is updated in place. The number of
+ * columns handed to the kernel wrapper is y->count_.
+ *
+ *   deviceMat  points to a struct EllDevice
+ *   deviceX    points to the struct MultiVectDevice holding x
+ *   deviceY    points to the struct MultiVectDevice holding y (overwritten)
+ *
+ * Nothing is validated here. The checks that used to sit in a disabled
+ * VERBOSE block were meant to be: x and y hold the same number of vectors,
+ * x->size_ >= devMat->columns and y->size_ >= devMat->rows.
+ *
+ * NOTE(review): spmvEllDeviceFloatComplex in this file receives its scalars
+ * as `float complex`, whereas this routine receives cuFloatComplex by value.
+ * Confirm this matches what the Fortran bind(c) interface passes.
+ *
+ * Always returns SPGPU_SUCCESS.
+ */
+int spmmEllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX, 
+			cuFloatComplex beta, void* deviceY)
+{
+  spgpuHandle_t spHandle = psb_cudaGetHandle();
+  struct EllDevice *ell = (struct EllDevice *) deviceMat;
+  struct MultiVectDevice *xIn = (struct MultiVectDevice *) deviceX;
+  struct MultiVectDevice *yOut = (struct MultiVectDevice *) deviceY;
+  cuFloatComplex *xData = (cuFloatComplex *) xIn->v_;
+  cuFloatComplex *yData = (cuFloatComplex *) yOut->v_;
+
+  /* Output and y input are the same buffer: in-place update of y. */
+  spgpuCellspmm(spHandle, yOut->count_,
+                yData, yOut->pitch_,
+                yData, yOut->pitch_,
+                alpha,
+                (cuFloatComplex *) ell->cM, ell->rP,
+                ell->cMPitch, ell->rPPitch,
+                ell->rS, NULL,
+                ell->avgRowSize, ell->maxRowSize, ell->rows,
+                xData, xIn->pitch_,
+                beta, ell->baseIndex);
+
+  return SPGPU_SUCCESS;
+}
+
+//
+// Double Precision Complex
+//
 void
 zspmdmm_gpu (cuDoubleComplex *z, int s, int vPitch, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* cM,
 	     int* rP, int* rS, int avgRowSize, int maxRowSize, int rows, int pitch,
@ -314,6 +391,28 @@ int spmvEllDeviceDoubleComplex(void *deviceMat, double complex alpha, void* devi
  return SPGPU_SUCCESS;
 }

+/*
+ * spmmEllDeviceDoubleComplex: double precision complex product of an ELL
+ * device matrix with a multivector.
+ *
+ * Forwards to spgpuZellspmm, passing the storage of y both as the output
+ * and as the y input operand, so y is updated in place. The number of
+ * columns handed to the kernel wrapper is y->count_.
+ *
+ *   deviceMat  points to a struct EllDevice
+ *   deviceX    points to the struct MultiVectDevice holding x
+ *   deviceY    points to the struct MultiVectDevice holding y (overwritten)
+ *
+ * Nothing is validated here. The checks that used to sit in a disabled
+ * VERBOSE block were meant to be: x and y hold the same number of vectors,
+ * x->size_ >= devMat->columns and y->size_ >= devMat->rows.
+ *
+ * NOTE(review): spmvEllDeviceDoubleComplex in this file receives its scalars
+ * as `double complex`, whereas this routine receives cuDoubleComplex by
+ * value. Confirm this matches what the Fortran bind(c) interface passes.
+ *
+ * Always returns SPGPU_SUCCESS.
+ */
+int spmmEllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX, 
+			cuDoubleComplex beta, void* deviceY)
+{
+  spgpuHandle_t spHandle = psb_cudaGetHandle();
+  struct EllDevice *ell = (struct EllDevice *) deviceMat;
+  struct MultiVectDevice *xIn = (struct MultiVectDevice *) deviceX;
+  struct MultiVectDevice *yOut = (struct MultiVectDevice *) deviceY;
+  cuDoubleComplex *xData = (cuDoubleComplex *) xIn->v_;
+  cuDoubleComplex *yData = (cuDoubleComplex *) yOut->v_;
+
+  /* Output and y input are the same buffer: in-place update of y. */
+  spgpuZellspmm(spHandle, yOut->count_,
+                yData, yOut->pitch_,
+                yData, yOut->pitch_,
+                alpha,
+                (cuDoubleComplex *) ell->cM, ell->rP,
+                ell->cMPitch, ell->rPPitch,
+                ell->rS, NULL,
+                ell->avgRowSize, ell->maxRowSize, ell->rows,
+                xData, xIn->pitch_,
+                beta, ell->baseIndex);
+
+  return SPGPU_SUCCESS;
+}
+
 int writeEllDeviceFloat(void* deviceMat, float* val, int* ja, int ldj, int* irn, int *idiag)
 { int i;
  struct EllDevice *devMat = (struct EllDevice *) deviceMat;
--- a/cuda/elldev.h
+++ b/cuda/elldev.h
@ -113,16 +113,6 @@ int readEllDeviceDouble(void* deviceMat, double* val, int* ja, int ldj, int* irn
 int readEllDeviceFloatComplex(void* deviceMat, float complex* val, int* ja, int ldj, int* irn, int *idiag);
 int readEllDeviceDoubleComplex(void* deviceMat, double complex* val, int* ja, int ldj, int* irn, int *idiag);

-int spmvEllDeviceFloat(void *deviceMat, float alpha, void* deviceX, 
-		       float beta, void* deviceY);
-int spmvEllDeviceDouble(void *deviceMat, double alpha, void* deviceX, 
-			double beta, void* deviceY);
-int spmvEllDeviceFloatComplex(void *deviceMat, float complex alpha, void* deviceX,
-			      float complex beta, void* deviceY);
-int spmvEllDeviceDoubleComplex(void *deviceMat, double complex alpha, void* deviceX,
-			       double complex beta, void* deviceY);
-
-

 int psiCopyCooToElgFloat(int nr, int nc, int nza, int hacksz, int ldv, int nzm, int *irn,
 			  int *idisp, int *ja, float *val, void *deviceMat);
--- a/cuda/elldev_mod.F90
+++ b/cuda/elldev_mod.F90
@ -318,4 +318,35 @@ module elldev_mod
    end function spmvEllDeviceDoubleComplex
  end interface

+  !
+  ! Generic binding to the C entry points spmmEllDevice{Float,Double,
+  ! FloatComplex,DoubleComplex}. Every specific takes the device matrix and
+  ! the x and y operands as C pointers passed by value, the two scalars
+  ! alpha and beta by value, and returns a C int status.
+  !
+  interface spmmEllDevice
+    ! Single precision real
+    function spmmEllDeviceFloat(deviceMat,alpha,x,beta,y) &
+         & result(res) bind(c,name='spmmEllDeviceFloat')
+      use iso_c_binding, only : c_int, c_ptr, c_float
+      type(c_ptr), value   :: deviceMat
+      real(c_float), value :: alpha
+      type(c_ptr), value   :: x
+      real(c_float), value :: beta
+      type(c_ptr), value   :: y
+      integer(c_int)       :: res
+    end function spmmEllDeviceFloat
+    ! Double precision real
+    function spmmEllDeviceDouble(deviceMat,alpha,x,beta,y) &
+         & result(res) bind(c,name='spmmEllDeviceDouble')
+      use iso_c_binding, only : c_int, c_ptr, c_double
+      type(c_ptr), value    :: deviceMat
+      real(c_double), value :: alpha
+      type(c_ptr), value    :: x
+      real(c_double), value :: beta
+      type(c_ptr), value    :: y
+      integer(c_int)        :: res
+    end function spmmEllDeviceDouble
+    ! Single precision complex
+    function spmmEllDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
+         & result(res) bind(c,name='spmmEllDeviceFloatComplex')
+      use iso_c_binding, only : c_int, c_ptr, c_float_complex
+      type(c_ptr), value               :: deviceMat
+      complex(c_float_complex), value  :: alpha
+      type(c_ptr), value               :: x
+      complex(c_float_complex), value  :: beta
+      type(c_ptr), value               :: y
+      integer(c_int)                   :: res
+    end function spmmEllDeviceFloatComplex
+    ! Double precision complex
+    function spmmEllDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
+         & result(res) bind(c,name='spmmEllDeviceDoubleComplex')
+      use iso_c_binding, only : c_int, c_ptr, c_double_complex
+      type(c_ptr), value                :: deviceMat
+      complex(c_double_complex), value  :: alpha
+      type(c_ptr), value                :: x
+      complex(c_double_complex), value  :: beta
+      type(c_ptr), value                :: y
+      integer(c_int)                    :: res
+    end function spmmEllDeviceDoubleComplex
+  end interface
+
 end module elldev_mod
--- a/cuda/hlldev.c
+++ b/cuda/hlldev.c
@ -264,7 +264,7 @@ int spmvHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* devic
  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
-  spgpuShellspmv (handle, (cuFloatComplex *)y->v_, (cuFloatComplex *)y->v_, alpha,
+  spgpuChellspmv (handle, (cuFloatComplex *)y->v_, (cuFloatComplex *)y->v_, alpha,
          (cuFloatComplex *)devMat->cM, devMat->rP,
          devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
 		  devMat->avgNzr, devMat->rows, (cuFloatComplex *)x->v_, beta, devMat->baseIndex);
@ -285,7 +285,7 @@ int spmmHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* devic
  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
-  spgpuShellspmm(handle, y->count_, (cuFloatComplex *)y->v_, y->pitch_,
+  spgpuChellspmm(handle, y->count_, (cuFloatComplex *)y->v_, y->pitch_,
          (cuFloatComplex*)y->v_, y->pitch_, alpha, (cuFloatComplex*)devMat->cM,
          devMat->rP, devMat->hackSize, devMat->hackOffs,
          devMat->rS, NULL, devMat->rows, (cuFloatComplex*)x->v_,
@ -310,7 +310,7 @@ int spmvHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* dev
  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
-  spgpuShellspmv (handle, (cuDoubleComplex *)y->v_, (cuDoubleComplex *)y->v_, alpha,
+  spgpuZhellspmv (handle, (cuDoubleComplex *)y->v_, (cuDoubleComplex *)y->v_, alpha,
          (cuDoubleComplex *)devMat->cM, devMat->rP,
          devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
 		  devMat->avgNzr, devMat->rows, (cuDoubleComplex *)x->v_, beta, devMat->baseIndex);
@ -331,7 +331,7 @@ int spmmHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* dev
  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
-  spgpuShellspmm(handle, y->count_, (cuDoubleComplex *)y->v_, y->pitch_,
+  spgpuZhellspmm(handle, y->count_, (cuDoubleComplex *)y->v_, y->pitch_,
          (cuDoubleComplex*)y->v_, y->pitch_, alpha, (cuDoubleComplex*)devMat->cM,
          devMat->rP, devMat->hackSize, devMat->hackOffs,
          devMat->rS, NULL, devMat->rows, (cuDoubleComplex*)x->v_,
--- a/cuda/impl/psb_c_cuda_elg_csmm.F90
+++ b/cuda/impl/psb_c_cuda_elg_csmm.F90
@ -98,16 +98,16 @@ subroutine psb_c_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
    if (info == 0) &
         & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_float)
    if (info == 0) &
-         & info = writeMultiVecDevice(gpX,x,nxy)
+         & info = writeMultiVecDevice(gpX,x,size(x,1))
    if (info == 0) &
         & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_float)
    if (info == 0) &
-         & info = writeMultiVecDevice(gpY,y,nxy)
+         & info = writeMultiVecDevice(gpY,y,size(y,1))

    if (info == 0)  &
-         & info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
+         & info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
    if (info == 0) &
-         & info = readMultiVecDevice(gpY,y,nxy)
+         & info = readMultiVecDevice(gpY,y,size(y,1))
    if (info /= 0) goto 9999
    call freeMultiVecDevice(gpX)
    call freeMultiVecDevice(gpY)
--- a/cuda/impl/psb_c_cuda_elg_vect_mv.F90
+++ b/cuda/impl/psb_c_cuda_elg_vect_mv.F90
@ -119,3 +119,94 @@ subroutine psb_c_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans)
  return

 end subroutine psb_c_cuda_elg_vect_mv
+
+!
+! psb_c_cuda_elg_multivect_mv
+!
+! Applies the ELG matrix a to x and accumulates the result into y, scaled
+! by alpha and beta.
+!
+!  * trans = 'T' or 'C': operands are brought to the host and the product
+!    is delegated to the parent psb_c_ell_sparse_mat%spmm; y is then marked
+!    as host-resident.
+!  * otherwise, when both x and y are psb_c_vect_cuda: matrix and vectors
+!    are synchronised to the device and spmmEllDevice is called; y is then
+!    marked as device-resident.
+!  * otherwise: the contents of x and y are extracted with get_vect, the
+!    array version of a%spmm is used, and y is rebuilt from the result.
+!
+! Arguments
+!   alpha, beta  scalars of the update
+!   a            the ELG matrix, must be in the assembled state
+!   x            input operand
+!   y            input/output operand
+!   info         psb_success_ on success, an error code otherwise
+!   trans        optional, 'N' (default), 'T' or 'C', case insensitive
+!
+subroutine psb_c_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
+
+  use psb_base_mod
+  use elldev_mod
+  use psb_vectordev_mod
+  use psb_c_cuda_elg_mat_mod, psb_protect_name => psb_c_cuda_elg_multivect_mv
+  use psb_c_cuda_vect_mod
+  implicit none
+  class(psb_c_cuda_elg_sparse_mat), intent(in) :: a
+  complex(psb_spk_), intent(in)       :: alpha, beta
+  ! NOTE(review): the interface for this routine in psb_c_cuda_elg_mat_mod
+  ! declares x and y as class(psb_c_base_multivect_type), while here (and
+  ! in the select type below, which tests psb_c_vect_cuda) the single-vector
+  ! types are used with rank-1 work arrays. The two must be reconciled.
+  class(psb_c_base_vect_type), intent(inout) :: x
+  class(psb_c_base_vect_type), intent(inout) :: y
+  integer(psb_ipk_), intent(out)             :: info
+  character, optional, intent(in)  :: trans
+  complex(psb_spk_), allocatable      :: rx(:), ry(:)
+  logical           :: tra
+  character         :: trans_
+  Integer(Psb_ipk_) :: err_act
+  ! Assumed length: a fixed len=20 cut the 23-character name short in the
+  ! entries pushed on the error stack.
+  character(len=*), parameter :: name='c_cuda_elg_multivect_mv'
+
+  call psb_erractionsave(err_act)
+  info = psb_success_
+
+  if (present(trans)) then
+    trans_ = trans
+  else
+    trans_ = 'N'
+  end if
+
+  if (.not.a%is_asb()) then
+    info = psb_err_invalid_mat_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+
+  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
+  if (tra) then
+    ! Transposed product: handled by the parent ELL implementation on host
+    if (a%is_dev()) call a%sync()
+    if (.not.x%is_host()) call x%sync()
+    if (beta /= czero) then
+      ! y is only read when beta is nonzero
+      if (.not.y%is_host()) call y%sync()
+    end if
+    call a%psb_c_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans)
+    call y%set_host()
+  else
+    if (a%is_host()) call a%sync()
+    select type (xx => x)
+    type is (psb_c_vect_cuda)
+      select type(yy => y)
+      type is (psb_c_vect_cuda)
+        ! Both operands are CUDA vectors: run the product on the device
+        if (a%is_host()) call a%sync()
+        if (xx%is_host()) call xx%sync()
+        if (beta /= czero) then
+          if (yy%is_host()) call yy%sync()
+        end if
+        info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
+             & beta,yy%deviceVect)
+        if (info /= 0) then
+          call psb_errpush(psb_err_from_subroutine_ai_,name,&
+               & a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/))
+          info = psb_err_from_subroutine_ai_
+          goto 9999
+        end if
+        call yy%set_dev()
+      class default
+        ! y is not a CUDA vector: fall back to the array interface
+        if (a%is_dev()) call a%sync()
+        rx = xx%get_vect()
+        ry = y%get_vect()
+        call a%spmm(alpha,rx,beta,ry,info)
+        call y%bld(ry)
+      end select
+    class default
+      ! x is not a CUDA vector: fall back to the array interface
+      if (a%is_dev()) call a%sync()
+      rx = x%get_vect()
+      ry = y%get_vect()
+      call a%spmm(alpha,rx,beta,ry,info)
+      call y%bld(ry)
+    end select
+
+  end if
+  if (info /= 0) goto 9999
+  call psb_erractionrestore(err_act)
+  return
+
+9999 call psb_error_handler(err_act)
+
+  return
+
+end subroutine psb_c_cuda_elg_multivect_mv
--- a/cuda/impl/psb_d_cuda_elg_csmm.F90
+++ b/cuda/impl/psb_d_cuda_elg_csmm.F90
@ -105,7 +105,7 @@ subroutine psb_d_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
         & info = writeMultiVecDevice(gpY,y,size(y,1))

    if (info == 0)  &
-         & info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
+         & info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
    if (info == 0) &
         & info = readMultiVecDevice(gpY,y,size(y,1))
    if (info /= 0) goto 9999
--- a/cuda/impl/psb_d_cuda_elg_vect_mv.F90
+++ b/cuda/impl/psb_d_cuda_elg_vect_mv.F90
@ -176,7 +176,7 @@ subroutine psb_d_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
          if (beta /= dzero) then 
            if (yy%is_host()) call yy%sync()
          end if
-          info = spmvEllDevice(a%deviceMat,alpha,xx%deviceVect,&
+          info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
               & beta,yy%deviceVect)
          if (info /= 0) then 
            call psb_errpush(psb_err_from_subroutine_ai_,name,&
--- a/cuda/impl/psb_s_cuda_elg_csmm.F90
+++ b/cuda/impl/psb_s_cuda_elg_csmm.F90
@ -98,16 +98,16 @@ subroutine psb_s_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
    if (info == 0) &
         & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_float)
    if (info == 0) &
-         & info = writeMultiVecDevice(gpX,x,nxy)
+         & info = writeMultiVecDevice(gpX,x,size(x,1))
    if (info == 0) &
         & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_float)
    if (info == 0) &
-         & info = writeMultiVecDevice(gpY,y,nxy)
+         & info = writeMultiVecDevice(gpY,y,size(y,1))

    if (info == 0)  &
-         & info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
+         & info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
    if (info == 0) &
-         & info = readMultiVecDevice(gpY,y,nxy)
+         & info = readMultiVecDevice(gpY,y,size(y,1))
    if (info /= 0) goto 9999
    call freeMultiVecDevice(gpX)
    call freeMultiVecDevice(gpY)
--- a/cuda/impl/psb_s_cuda_elg_vect_mv.F90
+++ b/cuda/impl/psb_s_cuda_elg_vect_mv.F90
@ -119,3 +119,94 @@ subroutine psb_s_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans)
  return

 end subroutine psb_s_cuda_elg_vect_mv
+
+!
+! psb_s_cuda_elg_multivect_mv
+!
+! Applies the ELG matrix a to x and accumulates the result into y, scaled
+! by alpha and beta.
+!
+!  * trans = 'T' or 'C': operands are brought to the host and the product
+!    is delegated to the parent psb_s_ell_sparse_mat%spmm; y is then marked
+!    as host-resident.
+!  * otherwise, when both x and y are psb_s_vect_cuda: matrix and vectors
+!    are synchronised to the device and spmmEllDevice is called; y is then
+!    marked as device-resident.
+!  * otherwise: the contents of x and y are extracted with get_vect, the
+!    array version of a%spmm is used, and y is rebuilt from the result.
+!
+! Arguments
+!   alpha, beta  scalars of the update
+!   a            the ELG matrix, must be in the assembled state
+!   x            input operand
+!   y            input/output operand
+!   info         psb_success_ on success, an error code otherwise
+!   trans        optional, 'N' (default), 'T' or 'C', case insensitive
+!
+subroutine psb_s_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
+
+  use psb_base_mod
+  use elldev_mod
+  use psb_vectordev_mod
+  use psb_s_cuda_elg_mat_mod, psb_protect_name => psb_s_cuda_elg_multivect_mv
+  use psb_s_cuda_vect_mod
+  implicit none
+  class(psb_s_cuda_elg_sparse_mat), intent(in) :: a
+  real(psb_spk_), intent(in)       :: alpha, beta
+  ! NOTE(review): the interface for this routine in psb_s_cuda_elg_mat_mod
+  ! declares x and y as class(psb_s_base_multivect_type), while here (and
+  ! in the select type below, which tests psb_s_vect_cuda) the single-vector
+  ! types are used with rank-1 work arrays. The two must be reconciled.
+  class(psb_s_base_vect_type), intent(inout) :: x
+  class(psb_s_base_vect_type), intent(inout) :: y
+  integer(psb_ipk_), intent(out)             :: info
+  character, optional, intent(in)  :: trans
+  real(psb_spk_), allocatable      :: rx(:), ry(:)
+  logical           :: tra
+  character         :: trans_
+  Integer(Psb_ipk_) :: err_act
+  ! Assumed length: a fixed len=20 cut the 23-character name short in the
+  ! entries pushed on the error stack.
+  character(len=*), parameter :: name='s_cuda_elg_multivect_mv'
+
+  call psb_erractionsave(err_act)
+  info = psb_success_
+
+  if (present(trans)) then
+    trans_ = trans
+  else
+    trans_ = 'N'
+  end if
+
+  if (.not.a%is_asb()) then
+    info = psb_err_invalid_mat_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+
+  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
+  if (tra) then
+    ! Transposed product: handled by the parent ELL implementation on host
+    if (a%is_dev()) call a%sync()
+    if (.not.x%is_host()) call x%sync()
+    if (beta /= szero) then
+      ! y is only read when beta is nonzero
+      if (.not.y%is_host()) call y%sync()
+    end if
+    call a%psb_s_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans)
+    call y%set_host()
+  else
+    if (a%is_host()) call a%sync()
+    select type (xx => x)
+    type is (psb_s_vect_cuda)
+      select type(yy => y)
+      type is (psb_s_vect_cuda)
+        ! Both operands are CUDA vectors: run the product on the device
+        if (a%is_host()) call a%sync()
+        if (xx%is_host()) call xx%sync()
+        if (beta /= szero) then
+          if (yy%is_host()) call yy%sync()
+        end if
+        info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
+             & beta,yy%deviceVect)
+        if (info /= 0) then
+          call psb_errpush(psb_err_from_subroutine_ai_,name,&
+               & a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/))
+          info = psb_err_from_subroutine_ai_
+          goto 9999
+        end if
+        call yy%set_dev()
+      class default
+        ! y is not a CUDA vector: fall back to the array interface
+        if (a%is_dev()) call a%sync()
+        rx = xx%get_vect()
+        ry = y%get_vect()
+        call a%spmm(alpha,rx,beta,ry,info)
+        call y%bld(ry)
+      end select
+    class default
+      ! x is not a CUDA vector: fall back to the array interface
+      if (a%is_dev()) call a%sync()
+      rx = x%get_vect()
+      ry = y%get_vect()
+      call a%spmm(alpha,rx,beta,ry,info)
+      call y%bld(ry)
+    end select
+
+  end if
+  if (info /= 0) goto 9999
+  call psb_erractionrestore(err_act)
+  return
+
+9999 call psb_error_handler(err_act)
+
+  return
+
+end subroutine psb_s_cuda_elg_multivect_mv
--- a/cuda/impl/psb_z_cuda_elg_csmm.F90
+++ b/cuda/impl/psb_z_cuda_elg_csmm.F90
@ -98,16 +98,16 @@ subroutine psb_z_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
    if (info == 0) &
         & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_double)
    if (info == 0) &
-         & info = writeMultiVecDevice(gpX,x,nxy)
+         & info = writeMultiVecDevice(gpX,x,size(x,1))
    if (info == 0) &
         & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_double)
    if (info == 0) &
-         & info = writeMultiVecDevice(gpY,y,nxy)
+         & info = writeMultiVecDevice(gpY,y,size(y,1))

    if (info == 0)  &
-         & info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
+         & info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
    if (info == 0) &
-         & info = readMultiVecDevice(gpY,y,nxy)
+         & info = readMultiVecDevice(gpY,y,size(y,1))
    if (info /= 0) goto 9999
    call freeMultiVecDevice(gpX)
    call freeMultiVecDevice(gpY)
--- a/cuda/impl/psb_z_cuda_elg_vect_mv.F90
+++ b/cuda/impl/psb_z_cuda_elg_vect_mv.F90
@ -119,3 +119,94 @@ subroutine psb_z_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans)
  return

 end subroutine psb_z_cuda_elg_vect_mv
+
+!
+! psb_z_cuda_elg_multivect_mv
+!
+! Applies the ELG matrix a to x and accumulates the result into y, scaled
+! by alpha and beta.
+!
+!  * trans = 'T' or 'C': operands are brought to the host and the product
+!    is delegated to the parent psb_z_ell_sparse_mat%spmm; y is then marked
+!    as host-resident.
+!  * otherwise, when both x and y are psb_z_vect_cuda: matrix and vectors
+!    are synchronised to the device and spmmEllDevice is called; y is then
+!    marked as device-resident.
+!  * otherwise: the contents of x and y are extracted with get_vect, the
+!    array version of a%spmm is used, and y is rebuilt from the result.
+!
+! Arguments
+!   alpha, beta  scalars of the update
+!   a            the ELG matrix, must be in the assembled state
+!   x            input operand
+!   y            input/output operand
+!   info         psb_success_ on success, an error code otherwise
+!   trans        optional, 'N' (default), 'T' or 'C', case insensitive
+!
+subroutine psb_z_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
+
+  use psb_base_mod
+  use elldev_mod
+  use psb_vectordev_mod
+  use psb_z_cuda_elg_mat_mod, psb_protect_name => psb_z_cuda_elg_multivect_mv
+  use psb_z_cuda_vect_mod
+  implicit none
+  class(psb_z_cuda_elg_sparse_mat), intent(in) :: a
+  complex(psb_dpk_), intent(in)       :: alpha, beta
+  ! NOTE(review): the interface for this routine in psb_z_cuda_elg_mat_mod
+  ! declares x and y as class(psb_z_base_multivect_type), while here (and
+  ! in the select type below, which tests psb_z_vect_cuda) the single-vector
+  ! types are used with rank-1 work arrays. The two must be reconciled.
+  class(psb_z_base_vect_type), intent(inout) :: x
+  class(psb_z_base_vect_type), intent(inout) :: y
+  integer(psb_ipk_), intent(out)             :: info
+  character, optional, intent(in)  :: trans
+  complex(psb_dpk_), allocatable      :: rx(:), ry(:)
+  logical           :: tra
+  character         :: trans_
+  Integer(Psb_ipk_) :: err_act
+  ! Assumed length: a fixed len=20 cut the 23-character name short in the
+  ! entries pushed on the error stack.
+  character(len=*), parameter :: name='z_cuda_elg_multivect_mv'
+
+  call psb_erractionsave(err_act)
+  info = psb_success_
+
+  if (present(trans)) then
+    trans_ = trans
+  else
+    trans_ = 'N'
+  end if
+
+  if (.not.a%is_asb()) then
+    info = psb_err_invalid_mat_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+
+  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
+  if (tra) then
+    ! Transposed product: handled by the parent ELL implementation on host
+    if (a%is_dev()) call a%sync()
+    if (.not.x%is_host()) call x%sync()
+    if (beta /= zzero) then
+      ! y is only read when beta is nonzero
+      if (.not.y%is_host()) call y%sync()
+    end if
+    call a%psb_z_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans)
+    call y%set_host()
+  else
+    if (a%is_host()) call a%sync()
+    select type (xx => x)
+    type is (psb_z_vect_cuda)
+      select type(yy => y)
+      type is (psb_z_vect_cuda)
+        ! Both operands are CUDA vectors: run the product on the device
+        if (a%is_host()) call a%sync()
+        if (xx%is_host()) call xx%sync()
+        if (beta /= zzero) then
+          if (yy%is_host()) call yy%sync()
+        end if
+        info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
+             & beta,yy%deviceVect)
+        if (info /= 0) then
+          call psb_errpush(psb_err_from_subroutine_ai_,name,&
+               & a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/))
+          info = psb_err_from_subroutine_ai_
+          goto 9999
+        end if
+        call yy%set_dev()
+      class default
+        ! y is not a CUDA vector: fall back to the array interface
+        if (a%is_dev()) call a%sync()
+        rx = xx%get_vect()
+        ry = y%get_vect()
+        call a%spmm(alpha,rx,beta,ry,info)
+        call y%bld(ry)
+      end select
+    class default
+      ! x is not a CUDA vector: fall back to the array interface
+      if (a%is_dev()) call a%sync()
+      rx = x%get_vect()
+      ry = y%get_vect()
+      call a%spmm(alpha,rx,beta,ry,info)
+      call y%bld(ry)
+    end select
+
+  end if
+  if (info /= 0) goto 9999
+  call psb_erractionrestore(err_act)
+  return
+
+9999 call psb_error_handler(err_act)
+
+  return
+
+end subroutine psb_z_cuda_elg_multivect_mv
--- a/cuda/psb_c_cuda_elg_mat_mod.F90
+++ b/cuda/psb_c_cuda_elg_mat_mod.F90
@ -56,6 +56,7 @@ module psb_c_cuda_elg_mat_mod
    procedure, nopass  :: get_fmt       => c_cuda_elg_get_fmt
    procedure, pass(a) :: sizeof        => c_cuda_elg_sizeof
    procedure, pass(a) :: vect_mv       => psb_c_cuda_elg_vect_mv
+    procedure, pass(a) :: multivect_mv  => psb_c_cuda_elg_multivect_mv
    procedure, pass(a) :: csmm          => psb_c_cuda_elg_csmm
    procedure, pass(a) :: csmv          => psb_c_cuda_elg_csmv
    procedure, pass(a) :: in_vect_sv    => psb_c_cuda_elg_inner_vect_sv
@ -101,6 +102,15 @@ module psb_c_cuda_elg_mat_mod
      integer(psb_ipk_), intent(out)             :: info
      character, optional, intent(in)  :: trans
    end subroutine psb_c_cuda_elg_vect_mv
+    ! Explicit interface of the multivector product bound to multivect_mv:
+    ! a is the ELG matrix, x the input and y the input/output multivector,
+    ! alpha and beta the scalars, trans the optional transpose selector.
+    subroutine psb_c_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
+      import :: psb_c_cuda_elg_sparse_mat, psb_c_base_multivect_type, &
+           & psb_spk_, psb_ipk_
+      complex(psb_spk_), intent(in)                   :: alpha, beta
+      class(psb_c_cuda_elg_sparse_mat), intent(in)    :: a
+      class(psb_c_base_multivect_type), intent(inout) :: x, y
+      integer(psb_ipk_), intent(out)                  :: info
+      character, optional, intent(in)                 :: trans
+    end subroutine psb_c_cuda_elg_multivect_mv
  end interface

  interface 
--- a/cuda/psb_s_cuda_elg_mat_mod.F90
+++ b/cuda/psb_s_cuda_elg_mat_mod.F90
@ -56,6 +56,7 @@ module psb_s_cuda_elg_mat_mod
    procedure, nopass  :: get_fmt       => s_cuda_elg_get_fmt
    procedure, pass(a) :: sizeof        => s_cuda_elg_sizeof
    procedure, pass(a) :: vect_mv       => psb_s_cuda_elg_vect_mv
+    procedure, pass(a) :: multivect_mv  => psb_s_cuda_elg_multivect_mv
    procedure, pass(a) :: csmm          => psb_s_cuda_elg_csmm
    procedure, pass(a) :: csmv          => psb_s_cuda_elg_csmv
    procedure, pass(a) :: in_vect_sv    => psb_s_cuda_elg_inner_vect_sv
@ -101,6 +102,15 @@ module psb_s_cuda_elg_mat_mod
      integer(psb_ipk_), intent(out)             :: info
      character, optional, intent(in)  :: trans
    end subroutine psb_s_cuda_elg_vect_mv
+    ! Explicit interface of the multivector product bound to multivect_mv:
+    ! a is the ELG matrix, x the input and y the input/output multivector,
+    ! alpha and beta the scalars, trans the optional transpose selector.
+    subroutine psb_s_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
+      import :: psb_s_cuda_elg_sparse_mat, psb_s_base_multivect_type, &
+           & psb_spk_, psb_ipk_
+      real(psb_spk_), intent(in)                      :: alpha, beta
+      class(psb_s_cuda_elg_sparse_mat), intent(in)    :: a
+      class(psb_s_base_multivect_type), intent(inout) :: x, y
+      integer(psb_ipk_), intent(out)                  :: info
+      character, optional, intent(in)                 :: trans
+    end subroutine psb_s_cuda_elg_multivect_mv
  end interface

  interface 
--- a/cuda/psb_z_cuda_elg_mat_mod.F90
+++ b/cuda/psb_z_cuda_elg_mat_mod.F90
@ -56,6 +56,7 @@ module psb_z_cuda_elg_mat_mod
    procedure, nopass  :: get_fmt       => z_cuda_elg_get_fmt
    procedure, pass(a) :: sizeof        => z_cuda_elg_sizeof
    procedure, pass(a) :: vect_mv       => psb_z_cuda_elg_vect_mv
+    procedure, pass(a) :: multivect_mv  => psb_z_cuda_elg_multivect_mv
    procedure, pass(a) :: csmm          => psb_z_cuda_elg_csmm
    procedure, pass(a) :: csmv          => psb_z_cuda_elg_csmv
    procedure, pass(a) :: in_vect_sv    => psb_z_cuda_elg_inner_vect_sv
@ -101,6 +102,15 @@ module psb_z_cuda_elg_mat_mod
      integer(psb_ipk_), intent(out)             :: info
      character, optional, intent(in)  :: trans
    end subroutine psb_z_cuda_elg_vect_mv
+    ! Explicit interface of the multivector product bound to multivect_mv:
+    ! a is the ELG matrix, x the input and y the input/output multivector,
+    ! alpha and beta the scalars, trans the optional transpose selector.
+    subroutine psb_z_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
+      import :: psb_z_cuda_elg_sparse_mat, psb_z_base_multivect_type, &
+           & psb_dpk_, psb_ipk_
+      complex(psb_dpk_), intent(in)                   :: alpha, beta
+      class(psb_z_cuda_elg_sparse_mat), intent(in)    :: a
+      class(psb_z_base_multivect_type), intent(inout) :: x, y
+      integer(psb_ipk_), intent(out)                  :: info
+      character, optional, intent(in)                 :: trans
+    end subroutine psb_z_cuda_elg_multivect_mv
  end interface

  interface 
--- a/cuda/spgpu/ell.h
+++ b/cuda/spgpu/ell.h
@ -70,6 +70,51 @@ void spgpuSellspmv (spgpuHandle_t handle,
 	float beta,
 	int baseIndex);

+/**
+ * \fn void spgpuSellspmm (spgpuHandle_t handle, int count, __device float *z, int zpitch, const __device float *y, int ypitch, float alpha, const __device float* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device float *x, int xpitch, float beta, int baseIndex)
+ * Computes single precision z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
+ * \param handle The spgpu handle used to call this routine
+ * \param count The number of columns to process
+ * \param z The output of the routine. z may be y itself, but not y + k (i.e. an area overlapping y that starts from a different base index).
+ * \param zpitch The pitch of the output z
+ * \param y The y input
+ * \param ypitch The pitch of the y input
+ * \param alpha The alpha scalar
+ * \param cM The ELL non zero values allocation pointer
+ * \param rP The ELL column indices allocation pointer
+ * \param cMPitch The pitch (in number of elements) of the allocation containing the matrix non zero values
+ * \param rPPitch The pitch (in number of elements) of the allocation containing the matrix non zero column indices
+ * \param rS The array containing the row sizes (in non zero elements)
+ * \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the ELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the ELL format).
+ * \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
+ * \param maxNnzPerRow Maximum number of non zeroes per row.
+ * \param rows The rows count
+ * \param x The x input
+ * \param xpitch The pitch of the x input
+ * \param beta The beta scalar
+ * \param baseIndex The ELL format base index used (i.e. 0 for C, 1 for Fortran).
+ */
+void spgpuSellspmm(spgpuHandle_t handle,
+	int count,
+	__device float *z,
+	int zpitch,
+	const __device float *y,
+	int ypitch,
+	float alpha,
+	const __device float* cM,
+	const __device int* rP,
+	int cMPitch,
+	int rPPitch,
+	const __device int* rS,
+	const __device int* rIdx,
+	int avgNnzPerRow,
+	int maxNnzPerRow,
+	int rows,
+	const __device float *x,
+	int xpitch,
+	float beta,
+	int baseIndex);
+
 /** 
 * \fn void spgpuDellspmv (spgpuHandle_t handle,__device double *z,const __device double *y, double alpha, const __device double* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device double *x, double beta,int baseIndex)
 * Computes double precision z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
@ -107,6 +152,50 @@ void spgpuDellspmv (spgpuHandle_t handle,
 	double beta,
 	int baseIndex);

+/** 
+* \fn void spgpuDellspmm (spgpuHandle_t handle,int count,__device double *z,int zpitch,const __device double *y,int ypitch,double alpha, const __device double* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device double *x,int xpitch,double beta,int baseIndex)
+ * Computes double precision z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
+ * \param handle The spgpu handle used to call this routine
+ * \param count The cols count
+ * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
+ * \param zpitch The pitch of the output vector
+ * \param y The y input vector
+ * \param ypitch The pitch of the y input vector
+ * \param alpha The alpha scalar
+ * \param cM The ELL non zero values allocation pointer
+ * \param rP The ELL column indices allocation pointer
+ * \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
+ * \param rPPitch  the pitch (in number of elements) of the allocation containing the matrix non zero column indices
+ * \param rS the array containing the row sizes (in non zero elements)
+ * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
+ * \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
+ * \param maxNnzPerRow Maximum number of non zeroes per row.
+ * \param rows the rows count
+ * \param x the x vector
+ * \param xpitch The pitch of the x input vector
+ * \param beta the beta scalar
+ * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
+ */
+void spgpuDellspmm(spgpuHandle_t handle,
+	int count,
+	__device double *z,
+	int zpitch,
+	const __device double *y,
+	int ypitch,
+	double alpha, 
+	const __device double* cM, 
+	const __device int* rP,
+    int cMPitch,
+    int rPPitch,
+	const __device int* rS,
+	const __device int* rIdx, 
+    int avgNnzPerRow,
+    int maxNnzPerRow,
+	int rows, 
+	const __device double *x,
+	int xpitch,
+	double beta,
+	int baseIndex);

 /** 
 * \fn void spgpuCellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex)
@ -145,6 +234,51 @@ void spgpuCellspmv (spgpuHandle_t handle,
 	cuFloatComplex beta,
 	int baseIndex);

+/** 
+* \fn void spgpuCellspmm (spgpuHandle_t handle,int count,__device cuFloatComplex *z,int zpitch,const __device cuFloatComplex *y,int ypitch,cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device cuFloatComplex *x,int xpitch,cuFloatComplex beta,int baseIndex)
+ * Computes single precision complex z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
+ * \param handle The spgpu handle used to call this routine
+ * \param count The cols count
+ * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
+ * \param zpitch The pitch of the output vector
+ * \param y The y input vector
+ * \param ypitch The pitch of the y input vector
+ * \param alpha The alpha scalar
+ * \param cM The ELL non zero values allocation pointer
+ * \param rP The ELL column indices allocation pointer
+ * \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
+ * \param rPPitch  the pitch (in number of elements) of the allocation containing the matrix non zero column indices
+ * \param rS the array containing the row sizes (in non zero elements)
+ * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
+ * \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
+ * \param maxNnzPerRow Maximum number of non zeroes per row.
+ * \param rows the rows count
+ * \param x the x vector
+ * \param xpitch The pitch of the x input vector
+ * \param beta the beta scalar
+ * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
+ */
+void spgpuCellspmm(spgpuHandle_t handle,
+	int count,
+	__device cuFloatComplex *z,
+	int zpitch,
+	const __device cuFloatComplex *y,
+	int ypitch,
+	cuFloatComplex alpha, 
+	const __device cuFloatComplex* cM, 
+	const __device int* rP,
+    int cMPitch,
+    int rPPitch,
+	const __device int* rS,
+	const __device int* rIdx, 
+    int avgNnzPerRow,
+    int maxNnzPerRow,
+	int rows, 
+	const __device cuFloatComplex *x,
+	int xpitch,
+	cuFloatComplex beta,
+	int baseIndex);
+
 /** 
 * \fn void spgpuZellspmv (spgpuHandle_t handle,__device cuDoubleComplex *z,const __device cuDoubleComplex *y, cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device cuDoubleComplex *x, cuDoubleComplex beta, int baseIndex)
 * Computes double precision complex z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
@ -182,6 +316,50 @@ void spgpuZellspmv (spgpuHandle_t handle,
 	cuDoubleComplex beta,
 	int baseIndex);
 	
+/** 
+* \fn void spgpuZellspmm (spgpuHandle_t handle,int count,__device cuDoubleComplex *z,int zpitch,const __device cuDoubleComplex *y,int ypitch,cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device cuDoubleComplex *x,int xpitch,cuDoubleComplex beta,int baseIndex)
+ * Computes double precision complex z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
+ * \param handle The spgpu handle used to call this routine
+ * \param count The cols count
+ * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
+ * \param zpitch The pitch of the output vector
+ * \param y The y input vector
+ * \param ypitch The pitch of the y input vector
+ * \param alpha The alpha scalar
+ * \param cM The ELL non zero values allocation pointer
+ * \param rP The ELL column indices allocation pointer
+ * \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
+ * \param rPPitch  the pitch (in number of elements) of the allocation containing the matrix non zero column indices
+ * \param rS the array containing the row sizes (in non zero elements)
+ * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
+ * \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
+ * \param maxNnzPerRow Maximum number of non zeroes per row.
+ * \param rows the rows count
+ * \param x the x vector
+ * \param xpitch The pitch of the x input vector
+ * \param beta the beta scalar
+ * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
+ */
+void spgpuZellspmm(spgpuHandle_t handle,
+	int count,
+	__device cuDoubleComplex *z,
+	int zpitch,
+	const __device cuDoubleComplex *y,
+	int ypitch,
+	cuDoubleComplex alpha, 
+	const __device cuDoubleComplex* cM, 
+	const __device int* rP,
+    int cMPitch,
+    int rPPitch,
+	const __device int* rS,
+	const __device int* rIdx, 
+    int avgNnzPerRow,
+    int maxNnzPerRow,
+	int rows, 
+	const __device cuDoubleComplex *x,
+	int xpitch,
+	cuDoubleComplex beta,
+	int baseIndex);
 	
 /** 
 * \fn void spgpuSellcsput (spgpuHandle_t handle, float alpha, __device float *cM, __device const int* rP, int cMPitch, int rPPitch, __device const int* rS, int nnz, __device int *aI, __device int *aJ, __device float *aVal, int baseIndex)
--- a/cuda/spgpu/kernels/Makefile
+++ b/cuda/spgpu/kernels/Makefile
@ -14,8 +14,9 @@ OBJS=cabs.o  camax.o  casum.o  caxpby.o  caxy.o  cdot.o  cgath.o \
  cnrm2.o cscal.o cscat.o csetscal.o cabgdxyz.o\
  dabs.o damax.o dasum.o daxpby.o daxy.o ddot.o dgath.o dabgdxyz.o\
  dia_cspmv.o dia_dspmv.o dia_sspmv.o dia_zspmv.o dnrm2.o \
-  dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o \
-  ell_dcsput.o ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \
+  dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o ell_dcsput.o \
+  ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \
+  ell_cspmm.o ell_dspmm.o ell_sspmm.o ell_zspmm.o \
  hdia_cspmv.o hdia_dspmv.o hdia_sspmv.o hdia_zspmv.o hell_cspmv.o hell_dspmv.o \
  hell_sspmv.o hell_zspmv.o hell_cspmm.o hell_dspmm.o hell_sspmm.o hell_zspmm.o \
  igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
--- a/cuda/spgpu/kernels/ell_cspmm.cu
+++ b/cuda/spgpu/kernels/ell_cspmm.cu
@ -0,0 +1,35 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2014
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "cudadebug.h"
+#include "cudalang.h"
+#include "cuComplex.h"
+
+extern "C"
+{
+#include "core.h"
+#include "ell.h"
+  int getGPUSharedMemPerBlock();
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+}
+
+#include "debug.h"
+
+#define VALUE_TYPE cuFloatComplex
+#define TYPE_SYMBOL C
+#define TEX_FETCH_TYPE cuFloatComplex
+#include "ell_spmm_base.cuh"
--- a/cuda/spgpu/kernels/ell_dspmm.cu
+++ b/cuda/spgpu/kernels/ell_dspmm.cu
@ -0,0 +1,35 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2014
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "cudadebug.h"
+#include "cudalang.h"
+#include <stdio.h>
+
+extern "C"
+{
+#include "core.h"
+// ell.h (not hell.h) carries the spgpuDellspmm prototype; it must be seen
+// inside extern "C" so that the definition generated by ell_spmm_base.cuh
+// gets C linkage, as in the ell_cspmm.cu / ell_zspmm.cu siblings.
+#include "ell.h"
+  int getGPUSharedMemPerBlock();
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+}
+
+#include "debug.h"
+
+#define VALUE_TYPE double
+#define TYPE_SYMBOL D
+#define TEX_FETCH_TYPE int2
+#include "ell_spmm_base.cuh"
--- a/cuda/spgpu/kernels/ell_spmm_base.cuh
+++ b/cuda/spgpu/kernels/ell_spmm_base.cuh
@ -0,0 +1,200 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2015
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+
+#define PRE_CONCAT(A, B) A ## B
+#define CONCAT(A, B) PRE_CONCAT(A, B)
+
+#undef GEN_SPGPU_ELL_NAME
+#undef X_TEX
+#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
+
+// Single precision (real and complex) helpers. The kernel template selects
+// them by token pasting: zero_<VALUE_TYPE>, <VALUE_TYPE>_isNotZero,
+// <VALUE_TYPE>_fma/_add/_mul and readValue_<VALUE_TYPE>.
+__device__ __host__ static float zero_float() { return 0.0f; }
+__device__ __host__ static cuFloatComplex zero_cuFloatComplex() { return make_cuFloatComplex(0.0f, 0.0f); }
+__device__ __host__ static bool float_isNotZero(float x) { return x != 0.0f; }
+// Needed when VALUE_TYPE is cuFloatComplex: the kernel tests beta through
+// <VALUE_TYPE>_isNotZero. A complex value is non-zero if either part is.
+__device__ __host__ static bool cuFloatComplex_isNotZero(cuFloatComplex x) { return x.x != 0.0f || x.y != 0.0f; }
+
+// Real arithmetic goes through the PREC_FADD / PREC_FMUL macros, so the
+// "fma" below is a multiply followed by an add as defined by those macros.
+__device__ static float float_fma(float a, float b, float c) { return PREC_FADD(PREC_FMUL (a, b), c); }
+__device__ static float float_add(float a, float b) { return PREC_FADD (a, b); }
+__device__ static float float_mul(float a, float b) { return PREC_FMUL (a, b); }
+
+// Complex arithmetic is delegated to the cuComplex.h routines.
+__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c) { return cuCfmaf(a, b, c); } 
+__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b) { return cuCaddf(a, b); }
+__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b) { return cuCmulf(a, b); }
+
+// Identity conversions from the fetched representation to the value type.
+__device__ static float readValue_float(float fetch) { return fetch; }
+__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch) { return fetch; }
+
+// Double precision (real and complex) helpers, compiled for the host pass
+// or for compute capability >= 1.3.
+#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)
+__device__ __host__ static double zero_double() { return 0.0; }
+__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex() { return make_cuDoubleComplex(0.0, 0.0); }
+__device__ __host__ static bool double_isNotZero(double x) { return x != 0.0; }
+// Needed when VALUE_TYPE is cuDoubleComplex: the kernel tests beta through
+// <VALUE_TYPE>_isNotZero. A complex value is non-zero if either part is.
+__device__ __host__ static bool cuDoubleComplex_isNotZero(cuDoubleComplex x) { return x.x != 0.0 || x.y != 0.0; }
+
+// Real arithmetic goes through the PREC_DADD / PREC_DMUL macros, so the
+// "fma" below is a multiply followed by an add as defined by those macros.
+__device__ static double double_fma(double a, double b, double c) { return PREC_DADD(PREC_DMUL (a, b), c); }
+__device__ static double double_add(double a, double b) { return PREC_DADD (a, b); }
+__device__ static double double_mul(double a, double b) { return PREC_DMUL (a, b); }
+
+// Complex arithmetic is delegated to the cuComplex.h routines.
+__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c) { return cuCfma(a, b, c); }
+__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a, b); }
+__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a, b); }
+
+// Rebuild doubles from the int2 / int4 words of a fetch: (y, x) are the
+// (high, low) words of the first double, (w, z) those of the second.
+__device__ static double readValue_double(int2 fetch) { return __hiloint2double (fetch.y, fetch.x); }
+__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch) 
+{
+	cuDoubleComplex c;
+	c.x = __hiloint2double (fetch.y, fetch.x);
+	c.y = __hiloint2double (fetch.w, fetch.z);
+	return c;
+}
+#endif
+
+#if 0
+// Texture cache management
+texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
+
+#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
+#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
+
+__device__ static VALUE_TYPE 
+fetchTex (int pointer)
+{
+	TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
+	return CONCAT(readValue_,VALUE_TYPE) (fetch);
+}
+#endif
+#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_vanilla) // name used while the template below is expanded
+#define GEN_SPGPU_ELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),ellspmm_vanilla) // stable alias used by the host driver
+#include "ell_spmm_base_template.cuh" // instantiates the plain ("vanilla") kernel and its launcher
+#if 0 // prefetch / texture-cache variants are disabled: only the vanilla variant is built
+#undef GEN_SPGPU_ELL_NAME
+#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_prefetch)
+#define GEN_SPGPU_ELL_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),ellspmm_prefetch)
+#undef USE_PREFETCHING
+#define USE_PREFETCHING
+#include "ell_spmm_base_template.cuh"
+#define ENABLE_CACHE
+#undef GEN_SPGPU_ELL_NAME
+#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache_prefetch)
+#define GEN_SPGPU_ELL_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache_prefetch)
+#include "ell_spmm_base_template.cuh"
+#undef GEN_SPGPU_ELL_NAME
+#undef USE_PREFETCHING
+#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache)
+#define GEN_SPGPU_ELL_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache)
+#include "ell_spmm_base_template.cuh"
+#endif
+#undef GEN_SPGPU_ELL_NAME
+#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm) // from here on: the public entry point name spgpu<x>ellspmm
+
+/*
+ * Host entry point spgpu<T>ellspmm: z = alpha*A*x + beta*y with A in ELL
+ * format and x, y, z holding `count` columns stored xPitch / yPitch / zPitch
+ * elements apart.
+ *
+ * The work is tiled in two directions:
+ *   - rows:    at most maxNForACall rows per launch, so the grid size stays
+ *              within the limit read from the handle;
+ *   - columns: at most MMBSZ columns per launch, because the kernel keeps
+ *              MMBSZ accumulators per thread.
+ * When moving to the next row tile, y, z, cM, rP and rS are advanced by the
+ * number of rows just processed; x is not, since it is indexed by the matrix
+ * column indices.
+ *
+ * rIdx, avgNnzPerRow and maxNnzPerRow are forwarded unchanged to the launcher.
+ * Errors are reported through cudaCheckError after all launches are queued.
+ */
+void
+GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
+(spgpuHandle_t handle,
+	int count,
+	VALUE_TYPE* z,
+	int zPitch,
+	const VALUE_TYPE *y,
+	int yPitch,
+	VALUE_TYPE alpha, 
+	const VALUE_TYPE* cM, 
+	const int* rP,
+    int cMPitch,
+    int rPPitch,
+	const __device int* rS,
+	const __device int* rIdx, 
+    int avgNnzPerRow,
+    int maxNnzPerRow,
+	int rows, 
+	const VALUE_TYPE *x,
+	int xPitch,
+	VALUE_TYPE beta,
+	int baseIndex)
+{
+  // NOTE(review): THREAD_BLOCK*maxGridSizeX is evaluated in int and may
+  // overflow for large grid limits; the max() keeps the original expression.
+  int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
+
+  while (rows > 0) {
+    // Rows handled by this tile: a full tile, or whatever is left at the end.
+    // The kernel must be given exactly this number so that it never touches
+    // rows past the end of the matrix.
+    int rowsNow = (rows > maxNForACall) ? maxNForACall : rows;
+
+    const VALUE_TYPE *px = x;
+    const VALUE_TYPE *py = y;
+    VALUE_TYPE *pz = z;
+    int cnt = count;
+
+    while (cnt > 0) {
+      // Columns handled by this launch (at most MMBSZ).
+      int colsNow = (cnt > MMBSZ) ? MMBSZ : cnt;
+
+      CONCAT(_,GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, colsNow, pz, zPitch,
+                                                         py, yPitch,
+                                                         alpha, cM, rP,
+                                                         cMPitch, rPPitch,
+                                                         rS, rIdx, avgNnzPerRow,
+                                                         maxNnzPerRow, rowsNow,
+                                                         px, xPitch, beta, baseIndex);
+      px += xPitch*colsNow;
+      py += yPitch*colsNow;
+      pz += zPitch*colsNow;
+      cnt -= colsNow;
+    }
+
+    y = y + rowsNow;
+    z = z + rowsNow;
+    cM = cM + rowsNow;
+    rP = rP + rowsNow;
+    rS = rS + rowsNow;
+    rows -= rowsNow;
+  }
+
+  cudaCheckError("CUDA error on ell_spmm");
+}
--- a/cuda/spgpu/kernels/ell_spmm_base_template.cuh
+++ b/cuda/spgpu/kernels/ell_spmm_base_template.cuh
@ -0,0 +1,104 @@
+/* 
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2015
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#define IDX2
+#define THREAD_BLOCK 128
+#define MMBSZ 8
+
+/*
+ * ELL sparse matrix times dense multivector kernel:
+ *   z(:,k) = alpha*A*x(:,k) + beta*y(:,k),   k = 0 .. count-1
+ *
+ * Launch layout: 1D grid of 1D blocks of THREAD_BLOCK threads; the thread
+ * with index threadIdx.x + blockIdx.x*THREAD_BLOCK handles one matrix row.
+ * Shared memory: only the statically sized temp[MMBSZ][THREAD_BLOCK] array
+ * declared below; no dynamic shared memory is used.
+ * Precondition: count <= MMBSZ (there are MMBSZ accumulators per thread).
+ *
+ * The j-th stored entry of row i is read from cM[i + j*cMPitch] (value) and
+ * rP[i + j*rPPitch] (column index, in base `baseIndex`); rS[i] is the number
+ * of stored entries of row i. Column k of x, y, z starts k*xPitch, k*yPitch,
+ * k*zPitch elements after the base pointer.
+ */
+__global__ void
+CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
+  (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
+   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
+   int cMPitch, int rPPitch, const int* rS, int rows,
+   const VALUE_TYPE *x, int xPitch, 
+   VALUE_TYPE beta, int baseIndex)
+{
+  // One accumulator per (column k, thread). Each thread only touches its own
+  // temp[k][threadIdx.x] slots, so no barrier between threads is required.
+  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
+
+  int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
+
+  if (i < rows) {
+    // Position on the first stored entry of row i; successive entries of the
+    // same row are then one pitch apart.
+    rP += i;
+    cM += i;
+
+    int rowSize = rS[i];
+    for (int k=0; k<count; k++) {
+      temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
+    }
+
+    for (int j = 0; j < rowSize; j++) {
+      // Column index of the current entry, converted to 0-based.
+      int pointer = rP[0] - baseIndex;
+      rP += rPPitch;
+
+      VALUE_TYPE value = cM[0];
+      cM += cMPitch;
+
+      // Reuse the matrix entry for every column of x.
+      const VALUE_TYPE *px = x;
+      for (int k=0; k<count; k++) {
+        VALUE_TYPE fetch = px[pointer];
+        temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
+        px += xPitch;
+      }
+    }
+
+    // Since z and y are accessed with the same offset by the same thread,
+    // and the write to z follows the y read, y and z can share the same base address (in-place computing).
+    const VALUE_TYPE *py = y;
+    VALUE_TYPE *pz = z;
+    if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
+      for (int k=0; k<count; k++) {
+        VALUE_TYPE yVal = py[i];
+        pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]));
+        py += yPitch;
+        pz += zPitch;
+      }
+    } else {
+      // beta == 0: y is never read.
+      for (int k=0; k<count; k++) {
+        pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]);
+        pz += zPitch;
+      }
+    }
+  }
+}
+
+/*
+ * Launches the ELL SpMM kernel on handle->currentStream with one thread per
+ * row, in 1D blocks of THREAD_BLOCK threads.
+ *
+ * The caller must pass count <= MMBSZ and a `rows` value small enough for the
+ * resulting grid size. rIdx, avgNnzPerRow and maxNnzPerRow are accepted for
+ * interface compatibility but are not passed to the kernel.
+ * The launch is asynchronous; no error check is performed here.
+ */
+void
+CONCAT(_,GEN_SPGPU_ELL_NAME(TYPE_SYMBOL))
+  (spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
+   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch,
+   const int* rS,  const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows,
+   const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
+{
+  // Nothing to compute; also avoids launching a grid with zero blocks,
+  // which is an invalid launch configuration.
+  if (rows <= 0 || count <= 0)
+    return;
+
+  dim3 block (THREAD_BLOCK, 1);
+  dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
+  // Should we generalize the code to 1/2/4/8 threads per row?
+  // And maybe adjust THREAD_BLOCK size? 
+
+  // The kernel keeps its accumulators in a statically sized __shared__ array,
+  // so no dynamic shared memory is requested here: asking for it as well
+  // would only double the per-block shared memory footprint.
+  CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn) 
+    <<< grid, block, 0, handle->currentStream >>> (count, z, zPitch, y, yPitch,
+                                                   alpha, cM, rP, cMPitch, rPPitch, rS, rows,
+                                                   x, xPitch, beta, baseIndex);
+}
--- a/cuda/spgpu/kernels/ell_sspmm.cu
+++ b/cuda/spgpu/kernels/ell_sspmm.cu
@ -0,0 +1,34 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2014
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "cudadebug.h"
+#include "cudalang.h"
+
+extern "C"
+{
+#include "core.h"
+// ell.h (not hell.h) carries the spgpuSellspmm prototype; it must be seen
+// inside extern "C" so that the definition generated by ell_spmm_base.cuh
+// gets C linkage, as in the ell_cspmm.cu / ell_zspmm.cu siblings.
+#include "ell.h"
+  int getGPUSharedMemPerBlock();
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+}
+
+#include "debug.h"
+
+#define VALUE_TYPE float
+#define TYPE_SYMBOL S
+#define TEX_FETCH_TYPE float
+#include "ell_spmm_base.cuh"
--- a/cuda/spgpu/kernels/ell_zspmm.cu
+++ b/cuda/spgpu/kernels/ell_zspmm.cu
@ -0,0 +1,35 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2014
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "cudadebug.h"
+#include "cudalang.h"
+#include "cuComplex.h"
+
+extern "C"
+{
+#include "core.h"
+#include "ell.h"
+  int getGPUSharedMemPerBlock();
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+}
+
+#include "debug.h"
+
+#define VALUE_TYPE cuDoubleComplex
+#define TYPE_SYMBOL Z
+#define TEX_FETCH_TYPE int4
+#include "ell_spmm_base.cuh"