From 9daa04c3dcb0594cde09156e12619481401769e8 Mon Sep 17 00:00:00 2001
From: gabrielequatrana <gabriele.quatrana@outlook.com>
Date: Thu, 18 Apr 2024 22:35:27 +0200
Subject: [PATCH] Updated HLG SpMM (s,d,c,z)

---
 base/modules/serial/psb_c_base_mat_mod.F90    |   1 +
 base/modules/serial/psb_s_base_mat_mod.F90    |   1 +
 base/modules/serial/psb_z_base_mat_mod.F90    |   1 +
 cuda/hlldev.c                                 | 164 +++++---
 cuda/hlldev_mod.F90                           |  36 ++
 cuda/impl/psb_c_cuda_hlg_csmm.F90             |   8 +-
 cuda/impl/psb_c_cuda_hlg_vect_mv.F90          |  89 +++++
 cuda/impl/psb_d_cuda_hlg_csmm.F90             |   2 +-
 cuda/impl/psb_d_cuda_hlg_vect_mv.F90          |   2 +-
 cuda/impl/psb_s_cuda_hlg_csmm.F90             |   8 +-
 cuda/impl/psb_s_cuda_hlg_vect_mv.F90          |  89 +++++
 cuda/impl/psb_z_cuda_hlg_csmm.F90             |   8 +-
 cuda/impl/psb_z_cuda_hlg_vect_mv.F90          |  89 +++++
 cuda/psb_c_cuda_hlg_mat_mod.F90               |  10 +
 cuda/psb_s_cuda_hlg_mat_mod.F90               |  10 +
 cuda/psb_z_cuda_hlg_mat_mod.F90               |  10 +
 cuda/spgpu/hell.h                             | 181 ++++++++-
 cuda/spgpu/kernels/Makefile                   |   3 +-
 cuda/spgpu/kernels/hell_cspmm.cu              |  35 ++
 cuda/spgpu/kernels/hell_dspmm.cu              |  35 ++
 cuda/spgpu/kernels/hell_dspmv.cu              | 371 ------------------
 cuda/spgpu/kernels/hell_spmm_base.cuh         | 275 +++++++++++++
 .../spgpu/kernels/hell_spmm_base_template.cuh | 197 ++++++++++
 cuda/spgpu/kernels/hell_sspmm.cu              |  34 ++
 cuda/spgpu/kernels/hell_zspmm.cu              |  35 ++
 25 files changed, 1237 insertions(+), 457 deletions(-)
 create mode 100644 cuda/spgpu/kernels/hell_cspmm.cu
 create mode 100644 cuda/spgpu/kernels/hell_dspmm.cu
 create mode 100644 cuda/spgpu/kernels/hell_spmm_base.cuh
 create mode 100644 cuda/spgpu/kernels/hell_spmm_base_template.cuh
 create mode 100644 cuda/spgpu/kernels/hell_sspmm.cu
 create mode 100644 cuda/spgpu/kernels/hell_zspmm.cu

diff --git a/base/modules/serial/psb_c_base_mat_mod.F90 b/base/modules/serial/psb_c_base_mat_mod.F90
index 33982e3a..52fd8111 100644
--- a/base/modules/serial/psb_c_base_mat_mod.F90
+++ b/base/modules/serial/psb_c_base_mat_mod.F90
@@ -35,6 +35,7 @@ module psb_c_base_mat_mod
 
   use psb_base_mat_mod
   use psb_c_base_vect_mod
+  use psb_c_base_multivect_mod
 
 
   !> \namespace  psb_base_mod  \class  psb_c_base_sparse_mat
diff --git a/base/modules/serial/psb_s_base_mat_mod.F90 b/base/modules/serial/psb_s_base_mat_mod.F90
index 92bda7d8..29b1c2dc 100644
--- a/base/modules/serial/psb_s_base_mat_mod.F90
+++ b/base/modules/serial/psb_s_base_mat_mod.F90
@@ -35,6 +35,7 @@ module psb_s_base_mat_mod
 
   use psb_base_mat_mod
   use psb_s_base_vect_mod
+  use psb_s_base_multivect_mod
 
 
   !> \namespace  psb_base_mod  \class  psb_s_base_sparse_mat
diff --git a/base/modules/serial/psb_z_base_mat_mod.F90 b/base/modules/serial/psb_z_base_mat_mod.F90
index 3e8196f4..0cee142e 100644
--- a/base/modules/serial/psb_z_base_mat_mod.F90
+++ b/base/modules/serial/psb_z_base_mat_mod.F90
@@ -35,6 +35,7 @@ module psb_z_base_mat_mod
 
   use psb_base_mat_mod
   use psb_z_base_vect_mod
+  use psb_z_base_multivect_mod
 
 
   !> \namespace  psb_base_mod  \class  psb_z_base_sparse_mat
diff --git a/cuda/hlldev.c b/cuda/hlldev.c
index eb78c16c..a5862a29 100644
--- a/cuda/hlldev.c
+++ b/cuda/hlldev.c
@@ -156,7 +156,9 @@ int FallocHllDevice(void** deviceMat,unsigned int hksize, unsigned int rows,  un
   return(i);
 }
 
-
+//
+// Single Precision Float
+//
 int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX, 
 			float beta, void* deviceY)
 {
@@ -170,108 +172,170 @@ int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
   /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
   /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
-  /*dspmdmm_gpu ((double *)z->v_, y->count_, y->pitch_, (double *)y->v_, alpha, (double *)devMat->cM, 
-	       devMat->rP, devMat->rS, devMat->rows, devMat->pitch, (double *)x->v_, beta,
-	       devMat->baseIndex);*/
-
-  spgpuShellspmv (handle, (float *)y->v_, (float *)y->v_, alpha, (float *)devMat->cM, 
-		  devMat->rP,devMat->hackSize,devMat->hackOffs, devMat->rS, NULL,
+  spgpuShellspmv (handle, (float *)y->v_, (float *)y->v_, alpha,
+          (float *)devMat->cM, devMat->rP,
+          devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
 		  devMat->avgNzr, devMat->rows, (float *)x->v_, beta, devMat->baseIndex);
 
   return SPGPU_SUCCESS;
 }
 
-void
-dspmdmmhll_gpu (double *z, int count, int zPitch, double alpha, double* cM, int* rP,
-		int* rS, int hackSize, int* hackOffs, int avgNnzPerRow, int rows,
-		double *x, int xPitch, double beta, int firstIndex)
+int spmmHllDeviceFloat(void *deviceMat, float alpha, void* deviceX, 
+			float beta, void* deviceY)
 {
-  int i=0;
+  HllDevice *devMat = (HllDevice *) deviceMat;
+  struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
+  struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
   spgpuHandle_t handle=psb_cudaGetHandle();
-#if defined(NEW_MM)
-  spgpuDhellspmm(handle, count, (double*) z, zPitch, (double*)z, zPitch,
-		  alpha, (double*) cM, rP,hackSize, hackOffs, rS, NULL,
-		  rows, (double*)x, xPitch, beta, firstIndex);
-#else
-  
-  for (i=0; i<count; i++) {
-    spgpuDhellspmv (handle, (double*) z, (double*)z, alpha,
-		    (double*) cM, rP,
-		    hackSize, hackOffs, rS, NULL,
-		    avgNnzPerRow, rows, (double*)x, beta, firstIndex);
-    z += zPitch;
-    x += xPitch;		
-  }
+
+#ifdef VERBOSE
+  /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
+  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
+  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
+  spgpuShellspmm(handle, y->count_, (float *)y->v_, y->pitch_,
+          (float*)y->v_, y->pitch_, alpha, (float*)devMat->cM,
+          devMat->rP, devMat->hackSize, devMat->hackOffs,
+          devMat->rS, NULL, devMat->rows, (float*)x->v_,
+          x->pitch_, beta, devMat->baseIndex);
+
+  return SPGPU_SUCCESS;
 }
 
-//new
+//
+// Double Precision
+//
 int spmvHllDeviceDouble(void *deviceMat, double alpha, void* deviceX, 
 		       double beta, void* deviceY)
 {
   HllDevice *devMat = (HllDevice *) deviceMat;
   struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
   struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
+  spgpuHandle_t handle=psb_cudaGetHandle();
 
 #ifdef VERBOSE
   /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
   /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
   /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
-  dspmdmmhll_gpu ((double *)y->v_, y->count_, y->pitch_,
-		  alpha, (double *)devMat->cM, 
-		  devMat->rP, devMat->rS, devMat->hackSize, devMat->hackOffs,
-		  devMat->avgNzr, devMat->rows,
-		  (double *)x->v_, x->pitch_, beta, devMat->baseIndex);
+  spgpuDhellspmv (handle, (double *)y->v_, (double *)y->v_, alpha,
+		  (double *)devMat->cM, devMat->rP,
+		  devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
+		  devMat->avgNzr, devMat->rows, (double *)x->v_, beta, devMat->baseIndex);
 
   return SPGPU_SUCCESS;
 }
 
-int spmvHllDeviceFloatComplex(void *deviceMat, float complex alpha, void* deviceX, 
-		       float complex beta, void* deviceY)
+int spmmHllDeviceDouble(void *deviceMat, double alpha, void* deviceX, 
+		       double beta, void* deviceY)
 {
   HllDevice *devMat = (HllDevice *) deviceMat;
   struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
   struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
   spgpuHandle_t handle=psb_cudaGetHandle();
 
-  cuFloatComplex a = make_cuFloatComplex(crealf(alpha),cimagf(alpha));
-  cuFloatComplex b = make_cuFloatComplex(crealf(beta),cimagf(beta));
 #ifdef VERBOSE
   /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
   /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
   /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
-  /*dspmdmm_gpu ((double *)z->v_, y->count_, y->pitch_, (double *)y->v_, alpha, (double *)devMat->cM, 
-	       devMat->rP, devMat->rS, devMat->rows, devMat->pitch, (double *)x->v_, beta,
-	       devMat->baseIndex);*/
+  spgpuDhellspmm(handle, y->count_, (double *)y->v_, y->pitch_,
+          (double *)y->v_, y->pitch_, alpha, (double *)devMat->cM,
+          devMat->rP, devMat->hackSize, devMat->hackOffs,
+          devMat->rS, NULL, devMat->rows, (double *)x->v_,
+          x->pitch_, beta, devMat->baseIndex);
 
-  spgpuChellspmv (handle, (cuFloatComplex *)y->v_, (cuFloatComplex *)y->v_, a, (cuFloatComplex *)devMat->cM, 
-		  devMat->rP,devMat->hackSize,devMat->hackOffs, devMat->rS, NULL,
-		  devMat->avgNzr, devMat->rows, (cuFloatComplex *)x->v_, b, devMat->baseIndex);
-  
   return SPGPU_SUCCESS;
 }
 
-int spmvHllDeviceDoubleComplex(void *deviceMat, double complex alpha, void* deviceX, 
-		       double complex beta, void* deviceY)
+//
+// Single Precision Float Complex
+//
+int spmvHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX, 
+			cuFloatComplex beta, void* deviceY)
 {
   HllDevice *devMat = (HllDevice *) deviceMat;
   struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
   struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
   spgpuHandle_t handle=psb_cudaGetHandle();
 
-  cuDoubleComplex a = make_cuDoubleComplex(creal(alpha),cimag(alpha));
-  cuDoubleComplex b = make_cuDoubleComplex(creal(beta),cimag(beta));
 #ifdef VERBOSE
   /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
   /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
   /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
+  spgpuShellspmv (handle, (cuFloatComplex *)y->v_, (cuFloatComplex *)y->v_, alpha,
+          (cuFloatComplex *)devMat->cM, devMat->rP,
+          devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
+		  devMat->avgNzr, devMat->rows, (cuFloatComplex *)x->v_, beta, devMat->baseIndex);
+
+  return SPGPU_SUCCESS;
+}
+
+int spmmHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX, 
+			cuFloatComplex beta, void* deviceY)
+{
+  HllDevice *devMat = (HllDevice *) deviceMat;
+  struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
+  struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
+  spgpuHandle_t handle=psb_cudaGetHandle();
 
-  spgpuZhellspmv (handle, (cuDoubleComplex *)y->v_, (cuDoubleComplex *)y->v_, a, (cuDoubleComplex *)devMat->cM, 
-		  devMat->rP,devMat->hackSize,devMat->hackOffs, devMat->rS, NULL,
-		  devMat->avgNzr,devMat->rows, (cuDoubleComplex *)x->v_, b, devMat->baseIndex);
+#ifdef VERBOSE
+  /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
+  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
+  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
+#endif
+  spgpuShellspmm(handle, y->count_, (cuFloatComplex *)y->v_, y->pitch_,
+          (cuFloatComplex*)y->v_, y->pitch_, alpha, (cuFloatComplex*)devMat->cM,
+          devMat->rP, devMat->hackSize, devMat->hackOffs,
+          devMat->rS, NULL, devMat->rows, (cuFloatComplex*)x->v_,
+          x->pitch_, beta, devMat->baseIndex);
+
+  return SPGPU_SUCCESS;
+}
+
+//
+// Double Precision Complex
+//
+int spmvHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX, 
+			cuDoubleComplex beta, void* deviceY)
+{
+  HllDevice *devMat = (HllDevice *) deviceMat;
+  struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
+  struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
+  spgpuHandle_t handle=psb_cudaGetHandle();
+
+#ifdef VERBOSE
+  /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
+  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
+  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
+#endif
+  spgpuShellspmv (handle, (cuDoubleComplex *)y->v_, (cuDoubleComplex *)y->v_, alpha,
+          (cuDoubleComplex *)devMat->cM, devMat->rP,
+          devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
+		  devMat->avgNzr, devMat->rows, (cuDoubleComplex *)x->v_, beta, devMat->baseIndex);
+
+  return SPGPU_SUCCESS;
+}
+
+int spmmHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX, 
+			cuDoubleComplex beta, void* deviceY)
+{
+  HllDevice *devMat = (HllDevice *) deviceMat;
+  struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
+  struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
+  spgpuHandle_t handle=psb_cudaGetHandle();
+
+#ifdef VERBOSE
+  /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
+  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
+  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
+#endif
+  spgpuShellspmm(handle, y->count_, (cuDoubleComplex *)y->v_, y->pitch_,
+          (cuDoubleComplex*)y->v_, y->pitch_, alpha, (cuDoubleComplex*)devMat->cM,
+          devMat->rP, devMat->hackSize, devMat->hackOffs,
+          devMat->rS, NULL, devMat->rows, (cuDoubleComplex*)x->v_,
+          x->pitch_, beta, devMat->baseIndex);
 
   return SPGPU_SUCCESS;
 }
diff --git a/cuda/hlldev_mod.F90 b/cuda/hlldev_mod.F90
index 90b8e13c..edac8956 100644
--- a/cuda/hlldev_mod.F90
+++ b/cuda/hlldev_mod.F90
@@ -265,4 +265,40 @@ module hlldev_mod
 
   end interface
 
+interface spmmHllDevice
+
+  function spmmHllDeviceFloat(deviceMat,alpha,x,beta,y) &
+       & result(res) bind(c,name='spmmHllDeviceFloat')
+    use iso_c_binding
+    integer(c_int)		:: res
+    type(c_ptr), value	:: deviceMat, x, y 
+    real(c_float),value     	:: alpha, beta
+  end function spmmHllDeviceFloat
+
+  function spmmHllDeviceDouble(deviceMat,alpha,x,beta,y) &
+       & result(res) bind(c,name='spmmHllDeviceDouble')
+    use iso_c_binding
+    integer(c_int)		:: res
+    type(c_ptr), value	:: deviceMat, x, y 
+    real(c_double),value     	:: alpha,  beta
+  end function spmmHllDeviceDouble
+
+  function spmmHllDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
+       & result(res) bind(c,name='spmmHllDeviceFloatComplex')
+    use iso_c_binding
+    integer(c_int)		:: res
+    type(c_ptr), value	:: deviceMat, x, y 
+    complex(c_float_complex),value :: alpha,  beta
+  end function spmmHllDeviceFloatComplex
+
+  function spmmHllDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
+       & result(res) bind(c,name='spmmHllDeviceDoubleComplex')
+    use iso_c_binding
+    integer(c_int)		:: res
+    type(c_ptr), value	:: deviceMat, x, y 
+    complex(c_double_complex),value :: alpha,  beta
+  end function spmmHllDeviceDoubleComplex
+
+end interface
+
 end module hlldev_mod
diff --git a/cuda/impl/psb_c_cuda_hlg_csmm.F90 b/cuda/impl/psb_c_cuda_hlg_csmm.F90
index 88aa53a8..f3d3dd29 100644
--- a/cuda/impl/psb_c_cuda_hlg_csmm.F90
+++ b/cuda/impl/psb_c_cuda_hlg_csmm.F90
@@ -98,16 +98,16 @@ subroutine psb_c_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
     if (info == 0) &
          & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_float)
     if (info == 0) &
-         & info = writeMultiVecDevice(gpX,x,nxy)
+         & info = writeMultiVecDevice(gpX,x,size(x,1))
     if (info == 0) &
          & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_float)
     if (info == 0) &
-         & info = writeMultiVecDevice(gpY,y,nxy)
+         & info = writeMultiVecDevice(gpY,y,size(y,1))
 
     if (info == 0)  &
-         & info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
+         & info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
     if (info == 0) &
-         & info = readMultiVecDevice(gpY,y,nxy)
+         & info = readMultiVecDevice(gpY,y,size(y,1))
     if (info /= 0) goto 9999
     call freeMultiVecDevice(gpX)
     call freeMultiVecDevice(gpY)
diff --git a/cuda/impl/psb_c_cuda_hlg_vect_mv.F90 b/cuda/impl/psb_c_cuda_hlg_vect_mv.F90
index 3789ef17..6ddbe0fd 100644
--- a/cuda/impl/psb_c_cuda_hlg_vect_mv.F90
+++ b/cuda/impl/psb_c_cuda_hlg_vect_mv.F90
@@ -117,3 +117,92 @@ subroutine psb_c_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans)
   return
 
 end subroutine psb_c_cuda_hlg_vect_mv
+
+subroutine psb_c_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) 
+  
+  use psb_base_mod
+  use hlldev_mod
+  use psb_vectordev_mod
+  use psb_c_cuda_hlg_mat_mod, psb_protect_name => psb_c_cuda_hlg_multivect_mv
+  use psb_c_cuda_vect_mod
+  implicit none 
+  class(psb_c_cuda_hlg_sparse_mat), intent(in) :: a
+  complex(psb_spk_), intent(in)       :: alpha, beta
+  class(psb_c_base_vect_type), intent(inout) :: x
+  class(psb_c_base_vect_type), intent(inout) :: y
+  integer(psb_ipk_), intent(out)             :: info
+  character, optional, intent(in)  :: trans
+  complex(psb_spk_), allocatable      :: rx(:), ry(:)
+  logical           :: tra
+  character         :: trans_
+  Integer(Psb_ipk_) :: err_act
+  character(len=20) :: name='c_cuda_hlg_multivect_mv'
+
+  call psb_erractionsave(err_act)
+  info = psb_success_
+
+  if (present(trans)) then
+    trans_ = trans
+  else
+    trans_ = 'N'
+  end if
+
+  if (.not.a%is_asb()) then 
+    info = psb_err_invalid_mat_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
+  if (tra) then 
+    if (.not.x%is_host()) call x%sync()
+    if (beta /= czero) then 
+      if (.not.y%is_host()) call y%sync()
+    end if
+    if (a%is_dev()) call a%sync()    
+    call a%psb_c_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans) 
+    call y%set_host()
+  else
+    if (a%is_host()) call a%sync()    
+    select type (xx => x) 
+    type is (psb_c_vect_cuda)
+      select type(yy => y) 
+      type is (psb_c_vect_cuda)
+        if (xx%is_host()) call xx%sync()
+        if (beta /= dzero) then 
+          if (yy%is_host()) call yy%sync()
+        end if
+        info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
+             & beta,yy%deviceVect)
+        if (info /= 0) then 
+          call psb_errpush(psb_err_from_subroutine_ai_,name,&
+               & a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/))
+          info = psb_err_from_subroutine_ai_
+          goto 9999
+        end if
+        call yy%set_dev()
+      class default
+        rx = xx%get_vect()
+        ry = y%get_vect()
+        if (a%is_dev()) call a%sync()
+        call a%spmm(alpha,rx,beta,ry,info)
+        call y%bld(ry)
+      end select
+    class default
+      rx = x%get_vect()
+      ry = y%get_vect()
+      if (a%is_dev()) call a%sync()
+      call a%spmm(alpha,rx,beta,ry,info)
+      call y%bld(ry)
+    end select
+
+  end if
+  if (info /= 0) goto 9999
+  call psb_erractionrestore(err_act)
+  return
+
+9999 call psb_error_handler(err_act)
+
+  return
+
+end subroutine psb_c_cuda_hlg_multivect_mv
\ No newline at end of file
diff --git a/cuda/impl/psb_d_cuda_hlg_csmm.F90 b/cuda/impl/psb_d_cuda_hlg_csmm.F90
index 78a1820a..862cfe41 100644
--- a/cuda/impl/psb_d_cuda_hlg_csmm.F90
+++ b/cuda/impl/psb_d_cuda_hlg_csmm.F90
@@ -105,7 +105,7 @@ subroutine psb_d_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
          & info = writeMultiVecDevice(gpY,y,size(y,1))
 
     if (info == 0)  &
-         & info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
+         & info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
     if (info == 0) &
          & info = readMultiVecDevice(gpY,y,size(y,1))
     if (info /= 0) goto 9999
diff --git a/cuda/impl/psb_d_cuda_hlg_vect_mv.F90 b/cuda/impl/psb_d_cuda_hlg_vect_mv.F90
index 6e32fc71..2c87f8f7 100644
--- a/cuda/impl/psb_d_cuda_hlg_vect_mv.F90
+++ b/cuda/impl/psb_d_cuda_hlg_vect_mv.F90
@@ -172,7 +172,7 @@ subroutine psb_d_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
           if (beta /= dzero) then 
             if (yy%is_host()) call yy%sync()
           end if
-          info = spmvhllDevice(a%deviceMat,alpha,xx%deviceVect,&
+          info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
                & beta,yy%deviceVect)
           if (info /= 0) then 
             call psb_errpush(psb_err_from_subroutine_ai_,name,&
diff --git a/cuda/impl/psb_s_cuda_hlg_csmm.F90 b/cuda/impl/psb_s_cuda_hlg_csmm.F90
index 0dc28c7f..60443a8b 100644
--- a/cuda/impl/psb_s_cuda_hlg_csmm.F90
+++ b/cuda/impl/psb_s_cuda_hlg_csmm.F90
@@ -98,16 +98,16 @@ subroutine psb_s_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
     if (info == 0) &
          & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_float)
     if (info == 0) &
-         & info = writeMultiVecDevice(gpX,x,nxy)
+         & info = writeMultiVecDevice(gpX,x,size(x,1))
     if (info == 0) &
          & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_float)
     if (info == 0) &
-         & info = writeMultiVecDevice(gpY,y,nxy)
+         & info = writeMultiVecDevice(gpY,y,size(y,1))
 
     if (info == 0)  &
-         & info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
+         & info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
     if (info == 0) &
-         & info = readMultiVecDevice(gpY,y,nxy)
+         & info = readMultiVecDevice(gpY,y,size(y,1))
     if (info /= 0) goto 9999
     call freeMultiVecDevice(gpX)
     call freeMultiVecDevice(gpY)
diff --git a/cuda/impl/psb_s_cuda_hlg_vect_mv.F90 b/cuda/impl/psb_s_cuda_hlg_vect_mv.F90
index 94696949..3c4974c2 100644
--- a/cuda/impl/psb_s_cuda_hlg_vect_mv.F90
+++ b/cuda/impl/psb_s_cuda_hlg_vect_mv.F90
@@ -117,3 +117,92 @@ subroutine psb_s_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans)
   return
 
 end subroutine psb_s_cuda_hlg_vect_mv
+
+subroutine psb_s_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) 
+  
+  use psb_base_mod
+  use hlldev_mod
+  use psb_vectordev_mod
+  use psb_s_cuda_hlg_mat_mod, psb_protect_name => psb_s_cuda_hlg_multivect_mv
+  use psb_s_cuda_vect_mod
+  implicit none 
+  class(psb_s_cuda_hlg_sparse_mat), intent(in) :: a
+  real(psb_spk_), intent(in)       :: alpha, beta
+  class(psb_s_base_vect_type), intent(inout) :: x
+  class(psb_s_base_vect_type), intent(inout) :: y
+  integer(psb_ipk_), intent(out)             :: info
+  character, optional, intent(in)  :: trans
+  real(psb_spk_), allocatable      :: rx(:), ry(:)
+  logical           :: tra
+  character         :: trans_
+  Integer(Psb_ipk_) :: err_act
+  character(len=20) :: name='s_cuda_hlg_multivect_mv'
+
+  call psb_erractionsave(err_act)
+  info = psb_success_
+
+  if (present(trans)) then
+    trans_ = trans
+  else
+    trans_ = 'N'
+  end if
+
+  if (.not.a%is_asb()) then 
+    info = psb_err_invalid_mat_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
+  if (tra) then 
+    if (.not.x%is_host()) call x%sync()
+    if (beta /= szero) then 
+      if (.not.y%is_host()) call y%sync()
+    end if
+    if (a%is_dev()) call a%sync()    
+    call a%psb_s_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans) 
+    call y%set_host()
+  else
+    if (a%is_host()) call a%sync()    
+    select type (xx => x) 
+    type is (psb_s_vect_cuda)
+      select type(yy => y) 
+      type is (psb_s_vect_cuda)
+        if (xx%is_host()) call xx%sync()
+        if (beta /= dzero) then 
+          if (yy%is_host()) call yy%sync()
+        end if
+        info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
+             & beta,yy%deviceVect)
+        if (info /= 0) then 
+          call psb_errpush(psb_err_from_subroutine_ai_,name,&
+               & a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/))
+          info = psb_err_from_subroutine_ai_
+          goto 9999
+        end if
+        call yy%set_dev()
+      class default
+        rx = xx%get_vect()
+        ry = y%get_vect()
+        if (a%is_dev()) call a%sync()
+        call a%spmm(alpha,rx,beta,ry,info)
+        call y%bld(ry)
+      end select
+    class default
+      rx = x%get_vect()
+      ry = y%get_vect()
+      if (a%is_dev()) call a%sync()
+      call a%spmm(alpha,rx,beta,ry,info)
+      call y%bld(ry)
+    end select
+
+  end if
+  if (info /= 0) goto 9999
+  call psb_erractionrestore(err_act)
+  return
+
+9999 call psb_error_handler(err_act)
+
+  return
+
+end subroutine psb_s_cuda_hlg_multivect_mv
diff --git a/cuda/impl/psb_z_cuda_hlg_csmm.F90 b/cuda/impl/psb_z_cuda_hlg_csmm.F90
index 8eb30ef9..b8ec358d 100644
--- a/cuda/impl/psb_z_cuda_hlg_csmm.F90
+++ b/cuda/impl/psb_z_cuda_hlg_csmm.F90
@@ -98,16 +98,16 @@ subroutine psb_z_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
     if (info == 0) &
          & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_double)
     if (info == 0) &
-         & info = writeMultiVecDevice(gpX,x,nxy)
+         & info = writeMultiVecDevice(gpX,x,size(x,1))
     if (info == 0) &
          & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_double)
     if (info == 0) &
-         & info = writeMultiVecDevice(gpY,y,nxy)
+         & info = writeMultiVecDevice(gpY,y,size(y,1))
 
     if (info == 0)  &
-         & info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
+         & info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
     if (info == 0) &
-         & info = readMultiVecDevice(gpY,y,nxy)
+         & info = readMultiVecDevice(gpY,y,size(y,1))
     if (info /= 0) goto 9999
     call freeMultiVecDevice(gpX)
     call freeMultiVecDevice(gpY)
diff --git a/cuda/impl/psb_z_cuda_hlg_vect_mv.F90 b/cuda/impl/psb_z_cuda_hlg_vect_mv.F90
index e2e93b85..d09326bd 100644
--- a/cuda/impl/psb_z_cuda_hlg_vect_mv.F90
+++ b/cuda/impl/psb_z_cuda_hlg_vect_mv.F90
@@ -117,3 +117,92 @@ subroutine psb_z_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans)
   return
 
 end subroutine psb_z_cuda_hlg_vect_mv
+
+subroutine psb_z_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) 
+  
+    use psb_base_mod
+    use hlldev_mod
+    use psb_vectordev_mod
+    use psb_z_cuda_hlg_mat_mod, psb_protect_name => psb_z_cuda_hlg_multivect_mv
+    use psb_z_cuda_vect_mod
+    implicit none 
+    class(psb_z_cuda_hlg_sparse_mat), intent(in) :: a
+    complex(psb_dpk_), intent(in)       :: alpha, beta
+    class(psb_z_base_vect_type), intent(inout) :: x
+    class(psb_z_base_vect_type), intent(inout) :: y
+    integer(psb_ipk_), intent(out)             :: info
+    character, optional, intent(in)  :: trans
+    complex(psb_dpk_), allocatable      :: rx(:), ry(:)
+    logical           :: tra
+    character         :: trans_
+    Integer(Psb_ipk_) :: err_act
+    character(len=20) :: name='z_cuda_hlg_multivect_mv'
+  
+    call psb_erractionsave(err_act)
+    info = psb_success_
+  
+    if (present(trans)) then
+      trans_ = trans
+    else
+      trans_ = 'N'
+    end if
+  
+    if (.not.a%is_asb()) then 
+      info = psb_err_invalid_mat_state_
+      call psb_errpush(info,name)
+      goto 9999
+    endif
+  
+    tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
+    if (tra) then 
+      if (.not.x%is_host()) call x%sync()
+      if (beta /= zzero) then 
+        if (.not.y%is_host()) call y%sync()
+      end if
+      if (a%is_dev()) call a%sync()    
+      call a%psb_z_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans) 
+      call y%set_host()
+    else
+      if (a%is_host()) call a%sync()    
+      select type (xx => x) 
+      type is (psb_z_vect_cuda)
+        select type(yy => y) 
+        type is (psb_z_vect_cuda)
+          if (xx%is_host()) call xx%sync()
+          if (beta /= dzero) then 
+            if (yy%is_host()) call yy%sync()
+          end if
+          info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
+               & beta,yy%deviceVect)
+          if (info /= 0) then 
+            call psb_errpush(psb_err_from_subroutine_ai_,name,&
+                 & a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/))
+            info = psb_err_from_subroutine_ai_
+            goto 9999
+          end if
+          call yy%set_dev()
+        class default
+          rx = xx%get_vect()
+          ry = y%get_vect()
+          if (a%is_dev()) call a%sync()
+          call a%spmm(alpha,rx,beta,ry,info)
+          call y%bld(ry)
+        end select
+      class default
+        rx = x%get_vect()
+        ry = y%get_vect()
+        if (a%is_dev()) call a%sync()
+        call a%spmm(alpha,rx,beta,ry,info)
+        call y%bld(ry)
+      end select
+  
+    end if
+    if (info /= 0) goto 9999
+    call psb_erractionrestore(err_act)
+    return
+  
+  9999 call psb_error_handler(err_act)
+  
+    return
+  
+  end subroutine psb_z_cuda_hlg_multivect_mv
diff --git a/cuda/psb_c_cuda_hlg_mat_mod.F90 b/cuda/psb_c_cuda_hlg_mat_mod.F90
index e98f2474..4ef5b019 100644
--- a/cuda/psb_c_cuda_hlg_mat_mod.F90
+++ b/cuda/psb_c_cuda_hlg_mat_mod.F90
@@ -55,6 +55,7 @@ module psb_c_cuda_hlg_mat_mod
   contains
     procedure, nopass  :: get_fmt       => c_cuda_hlg_get_fmt
     procedure, pass(a) :: sizeof        => c_cuda_hlg_sizeof
+    procedure, pass(a) :: multivect_mv  => psb_c_cuda_hlg_multivect_mv
     procedure, pass(a) :: vect_mv       => psb_c_cuda_hlg_vect_mv
     procedure, pass(a) :: csmm          => psb_c_cuda_hlg_csmm
     procedure, pass(a) :: csmv          => psb_c_cuda_hlg_csmv
@@ -97,6 +98,15 @@ module psb_c_cuda_hlg_mat_mod
       integer(psb_ipk_), intent(out)             :: info
       character, optional, intent(in)            :: trans
     end subroutine psb_c_cuda_hlg_vect_mv
+    subroutine psb_c_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) 
+      import :: psb_c_cuda_hlg_sparse_mat, psb_spk_, psb_c_base_multivect_type, psb_ipk_
+      class(psb_c_cuda_hlg_sparse_mat), intent(in) :: a
+      complex(psb_spk_), intent(in)       :: alpha, beta
+      class(psb_c_base_multivect_type), intent(inout) :: x
+      class(psb_c_base_multivect_type), intent(inout) :: y
+      integer(psb_ipk_), intent(out)             :: info
+      character, optional, intent(in)  :: trans
+    end subroutine psb_c_cuda_hlg_multivect_mv
   end interface
 
   interface 
diff --git a/cuda/psb_s_cuda_hlg_mat_mod.F90 b/cuda/psb_s_cuda_hlg_mat_mod.F90
index 4f7b4b6f..a7099ea4 100644
--- a/cuda/psb_s_cuda_hlg_mat_mod.F90
+++ b/cuda/psb_s_cuda_hlg_mat_mod.F90
@@ -56,6 +56,7 @@ module psb_s_cuda_hlg_mat_mod
     procedure, nopass  :: get_fmt       => s_cuda_hlg_get_fmt
     procedure, pass(a) :: sizeof        => s_cuda_hlg_sizeof
     procedure, pass(a) :: vect_mv       => psb_s_cuda_hlg_vect_mv
+    procedure, pass(a) :: multivect_mv  => psb_s_cuda_hlg_multivect_mv
     procedure, pass(a) :: csmm          => psb_s_cuda_hlg_csmm
     procedure, pass(a) :: csmv          => psb_s_cuda_hlg_csmv
     procedure, pass(a) :: in_vect_sv    => psb_s_cuda_hlg_inner_vect_sv
@@ -97,6 +98,15 @@ module psb_s_cuda_hlg_mat_mod
       integer(psb_ipk_), intent(out)             :: info
       character, optional, intent(in)            :: trans
     end subroutine psb_s_cuda_hlg_vect_mv
+    subroutine psb_s_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) 
+      import :: psb_s_cuda_hlg_sparse_mat, psb_spk_, psb_s_base_multivect_type, psb_ipk_
+      class(psb_s_cuda_hlg_sparse_mat), intent(in) :: a
+      real(psb_spk_), intent(in)       :: alpha, beta
+      class(psb_s_base_multivect_type), intent(inout) :: x
+      class(psb_s_base_multivect_type), intent(inout) :: y
+      integer(psb_ipk_), intent(out)             :: info
+      character, optional, intent(in)  :: trans
+    end subroutine psb_s_cuda_hlg_multivect_mv
   end interface
 
   interface 
diff --git a/cuda/psb_z_cuda_hlg_mat_mod.F90 b/cuda/psb_z_cuda_hlg_mat_mod.F90
index 2acf43f1..61f30374 100644
--- a/cuda/psb_z_cuda_hlg_mat_mod.F90
+++ b/cuda/psb_z_cuda_hlg_mat_mod.F90
@@ -56,6 +56,7 @@ module psb_z_cuda_hlg_mat_mod
     procedure, nopass  :: get_fmt       => z_cuda_hlg_get_fmt
     procedure, pass(a) :: sizeof        => z_cuda_hlg_sizeof
     procedure, pass(a) :: vect_mv       => psb_z_cuda_hlg_vect_mv
+    procedure, pass(a) :: multivect_mv  => psb_z_cuda_hlg_multivect_mv
     procedure, pass(a) :: csmm          => psb_z_cuda_hlg_csmm
     procedure, pass(a) :: csmv          => psb_z_cuda_hlg_csmv
     procedure, pass(a) :: in_vect_sv    => psb_z_cuda_hlg_inner_vect_sv
@@ -97,6 +98,15 @@ module psb_z_cuda_hlg_mat_mod
       integer(psb_ipk_), intent(out)             :: info
       character, optional, intent(in)            :: trans
     end subroutine psb_z_cuda_hlg_vect_mv
+    subroutine psb_z_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans) 
+      import :: psb_z_cuda_hlg_sparse_mat, psb_dpk_, psb_z_base_multivect_type, psb_ipk_
+      class(psb_z_cuda_hlg_sparse_mat), intent(in) :: a
+      complex(psb_dpk_), intent(in)       :: alpha, beta
+      class(psb_z_base_multivect_type), intent(inout) :: x
+      class(psb_z_base_multivect_type), intent(inout) :: y
+      integer(psb_ipk_), intent(out)             :: info
+      character, optional, intent(in)  :: trans
+    end subroutine psb_z_cuda_hlg_multivect_mv
   end interface
 
   interface 
diff --git a/cuda/spgpu/hell.h b/cuda/spgpu/hell.h
index e62f9ae0..2c339b80 100644
--- a/cuda/spgpu/hell.h
+++ b/cuda/spgpu/hell.h
@@ -68,7 +68,46 @@ void spgpuShellspmv (spgpuHandle_t handle,
 	float beta,
 	int baseIndex);
 
-
+/** 
+* \fn void spgpuShellspmm (spgpuHandle_t handle,int count,__device float *z,int zpitch,const __device float *y,int ypitch,float alpha, const __device float* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device float *x,int xpitch,float beta,int baseIndex)
+ * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU.
+ * \param handle The spgpu handle used to call this routine
+ * \param count The cols count
+ * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
+ * \param zpitch The pitch of the output vector
+ * \param y The y input vector
+ * \param ypitch The pitch of the y input vector
+ * \param alpha The alpha scalar
+ * \param cM The HELL non zero values allocation pointer
+ * \param rP The HELL column indices allocation pointer
+ * \param hackSize The constant size of every hack (must be a multiple of 32).
+ * \param hackOffsets the array of base index offset for every hack of HELL non zero values allocation and HELL indices allocation.
+ * \param rS the array containing the row sized (in non zero elements)
+ * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
+ * \param rows the rows count
+ * \param x the x vector
+ * \param xpitch The pitch of the x input vector
+ * \param beta the beta scalar
+ * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
+ */
+void spgpuShellspmm(spgpuHandle_t handle,
+	int count,
+	__device float *z,
+	int zpitch,
+	const __device float *y,
+	int ypitch,
+	float alpha, 
+	const __device float* cM, 
+	const __device int* rP,
+	int hackSize,
+	const __device int* hackOffsets, 
+	const __device int* rS,
+	const __device int* rIdx, 
+	int rows, 
+	const __device float *x,
+	int xpitch,
+	float beta,
+	int baseIndex);
 
 /** 
 * \fn void spgpuDhellspmv (spgpuHandle_t handle,__device double *z,const __device double *y, double alpha, const __device double* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int avgNnzPerRow, int rows, const __device double *x, double beta,int baseIndex)
@@ -104,26 +143,47 @@ void spgpuDhellspmv (spgpuHandle_t handle,
 	const __device double *x, 
 	double beta,
 	int baseIndex);
-#if defined(NEW_MM)
+
+/** 
+* \fn void spgpuShellspmm (spgpuHandle_t handle,int count,__device double *z,int zpitch,const __device double *y,int ypitch,double alpha, const __device double* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device double *x,int xpitch,double beta,int baseIndex)
+ * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU.
+ * \param handle The spgpu handle used to call this routine
+ * \param count The cols count
+ * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
+ * \param zpitch The pitch of the output vector
+ * \param y The y input vector
+ * \param ypitch The pitch of the y input vector
+ * \param alpha The alpha scalar
+ * \param cM The HELL non zero values allocation pointer
+ * \param rP The HELL column indices allocation pointer
+ * \param hackSize The constant size of every hack (must be a multiple of 32).
+ * \param hackOffsets the array of base index offset for every hack of HELL non zero values allocation and HELL indices allocation.
+ * \param rS the array containing the row sized (in non zero elements)
+ * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
+ * \param rows the rows count
+ * \param x the x vector
+ * \param xpitch The pitch of the x input vector
+ * \param beta the beta scalar
+ * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
+ */
 void spgpuDhellspmm(spgpuHandle_t handle,
-		     int count,
-		     __device double *z,
-		     int zpitch,
-		     const __device double *y,
-		     int ypitch,
-		     double alpha, 
-		     const __device double* cM, 
-		     const __device int* rP,
-		     int hackSize,
-		     const __device int* hackOffsets, 
-		     const __device int* rS,
-		     const __device int* rIdx, 
-		     int rows, 
-		     const __device double *x,
-		     int xpitch,
-		     double beta,
-		     int baseIndex);
-#endif
+	int count,
+	__device double *z,
+	int zpitch,
+	const __device double *y,
+	int ypitch,
+	double alpha, 
+	const __device double* cM, 
+	const __device int* rP,
+	int hackSize,
+	const __device int* hackOffsets, 
+	const __device int* rS,
+	const __device int* rIdx, 
+	int rows, 
+	const __device double *x,
+	int xpitch,
+	double beta,
+	int baseIndex);
 
 /** 
 * \fn void spgpuChellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex)
@@ -160,7 +220,46 @@ void spgpuChellspmv (spgpuHandle_t handle,
 	cuFloatComplex beta,
 	int baseIndex);
 
-
+/** 
+* \fn void spgpuShellspmm (spgpuHandle_t handle,int count,__device cuFloatComplex *z,int zpitch,const __device cuFloatComplex *y,int ypitch,cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device cuFloatComplex *x,int xpitch,cuFloatComplex beta,int baseIndex)
+ * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU.
+ * \param handle The spgpu handle used to call this routine
+ * \param count The cols count
+ * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
+ * \param zpitch The pitch of the output vector
+ * \param y The y input vector
+ * \param ypitch The pitch of the y input vector
+ * \param alpha The alpha scalar
+ * \param cM The HELL non zero values allocation pointer
+ * \param rP The HELL column indices allocation pointer
+ * \param hackSize The constant size of every hack (must be a multiple of 32).
+ * \param hackOffsets the array of base index offset for every hack of HELL non zero values allocation and HELL indices allocation.
+ * \param rS the array containing the row sized (in non zero elements)
+ * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
+ * \param rows the rows count
+ * \param x the x vector
+ * \param xpitch The pitch of the x input vector
+ * \param beta the beta scalar
+ * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
+ */
+void spgpuChellspmm(spgpuHandle_t handle,
+	int count,
+	__device cuFloatComplex *z,
+	int zpitch,
+	const __device cuFloatComplex *y,
+	int ypitch,
+	cuFloatComplex alpha, 
+	const __device cuFloatComplex* cM, 
+	const __device int* rP,
+	int hackSize,
+	const __device int* hackOffsets, 
+	const __device int* rS,
+	const __device int* rIdx, 
+	int rows, 
+	const __device cuFloatComplex *x,
+	int xpitch,
+	cuFloatComplex beta,
+	int baseIndex);
 
 /** 
 * \fn void spgpuZhellspmv (spgpuHandle_t handle,__device cuDoubleComplex *z,const __device cuDoubleComplex *y, cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuDoubleComplex *x, cuDoubleComplex beta, int baseIndex)
@@ -197,6 +296,46 @@ void spgpuZhellspmv (spgpuHandle_t handle,
 	cuDoubleComplex beta,
 	int baseIndex);
 
+/** 
+* \fn void spgpuShellspmm (spgpuHandle_t handle,int count,__device cuDoubleComplex *z,int zpitch,const __device cuDoubleComplex *y,int ypitch,cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device cuDoubleComplex *x,int xpitch,cuDoubleComplex beta,int baseIndex)
+ * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU.
+ * \param handle The spgpu handle used to call this routine
+ * \param count The cols count
+ * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
+ * \param zpitch The pitch of the output vector
+ * \param y The y input vector
+ * \param ypitch The pitch of the y input vector
+ * \param alpha The alpha scalar
+ * \param cM The HELL non zero values allocation pointer
+ * \param rP The HELL column indices allocation pointer
+ * \param hackSize The constant size of every hack (must be a multiple of 32).
+ * \param hackOffsets the array of base index offset for every hack of HELL non zero values allocation and HELL indices allocation.
+ * \param rS the array containing the row sized (in non zero elements)
+ * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
+ * \param rows the rows count
+ * \param x the x vector
+ * \param xpitch The pitch of the x input vector
+ * \param beta the beta scalar
+ * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
+ */
+void spgpuZhellspmm(spgpuHandle_t handle,
+	int count,
+	__device cuDoubleComplex *z,
+	int zpitch,
+	const __device cuDoubleComplex *y,
+	int ypitch,
+	cuDoubleComplex alpha, 
+	const __device cuDoubleComplex* cM, 
+	const __device int* rP,
+	int hackSize,
+	const __device int* hackOffsets, 
+	const __device int* rS,
+	const __device int* rIdx, 
+	int rows, 
+	const __device cuDoubleComplex *x,
+	int xpitch,
+	cuDoubleComplex beta,
+	int baseIndex);
 
 /** @}*/
 
diff --git a/cuda/spgpu/kernels/Makefile b/cuda/spgpu/kernels/Makefile
index c4ae38a0..8a71263f 100644
--- a/cuda/spgpu/kernels/Makefile
+++ b/cuda/spgpu/kernels/Makefile
@@ -17,7 +17,8 @@ OBJS=cabs.o  camax.o  casum.o  caxpby.o  caxy.o  cdot.o  cgath.o \
   dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o \
   ell_dcsput.o ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \
   hdia_cspmv.o hdia_dspmv.o hdia_sspmv.o hdia_zspmv.o hell_cspmv.o hell_dspmv.o \
-  hell_sspmv.o hell_zspmv.o igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
+  hell_sspmv.o hell_zspmv.o hell_cspmm.o hell_dspmm.o hell_sspmm.o hell_zspmm.o \
+  igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
   saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o sabgdxyz.o\
   zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o zabgdxyz.o \
   sxyzw.o cxyzw.o dxyzw.o zxyzw.o
diff --git a/cuda/spgpu/kernels/hell_cspmm.cu b/cuda/spgpu/kernels/hell_cspmm.cu
new file mode 100644
index 00000000..dea4c60b
--- /dev/null
+++ b/cuda/spgpu/kernels/hell_cspmm.cu
@@ -0,0 +1,35 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2014
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "cudadebug.h"
+#include "cudalang.h"
+#include "cuComplex.h"
+
+extern "C"
+{
+#include "core.h"
+#include "hell.h"
+  int getGPUSharedMemPerBlock();
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+}
+
+#include "debug.h"
+
+#define VALUE_TYPE cuFloatComplex
+#define TYPE_SYMBOL C
+#define TEX_FETCH_TYPE cuFloatComplex
+#include "hell_spmm_base.cuh"
diff --git a/cuda/spgpu/kernels/hell_dspmm.cu b/cuda/spgpu/kernels/hell_dspmm.cu
new file mode 100644
index 00000000..e6b263b6
--- /dev/null
+++ b/cuda/spgpu/kernels/hell_dspmm.cu
@@ -0,0 +1,35 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2014
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "cudadebug.h"
+#include "cudalang.h"
+#include <stdio.h>
+
+extern "C"
+{
+#include "core.h"
+#include "hell.h"
+  int getGPUSharedMemPerBlock();
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+}
+
+#include "debug.h"
+
+#define VALUE_TYPE double
+#define TYPE_SYMBOL D
+#define TEX_FETCH_TYPE int2
+#include "hell_spmm_base.cuh"
diff --git a/cuda/spgpu/kernels/hell_dspmv.cu b/cuda/spgpu/kernels/hell_dspmv.cu
index 509131d4..4e120869 100644
--- a/cuda/spgpu/kernels/hell_dspmv.cu
+++ b/cuda/spgpu/kernels/hell_dspmv.cu
@@ -32,374 +32,3 @@ extern "C"
 #define TYPE_SYMBOL D
 #define TEX_FETCH_TYPE int2
 #include "hell_spmv_base.cuh"
-
-
-
-#if 0
-
-#define MMBSZ 8
-
-#undef GEN_SPGPU_HELL_NAME
-#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm)
-#undef GEN_SPGPU_HELL_NAME_VANILLA
-#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
-
-
-__global__ void
-CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
-  (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
-   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
-   int hackSize, const int* hackOffsets, const int* rS, int rows,
-   const VALUE_TYPE *x, int xPitch, 
-   VALUE_TYPE beta, int baseIndex)
-{
-  VALUE_TYPE *pz,*px,*py;
-  VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
-  VALUE_TYPE yVal; 
-  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
-
-  int *rrP;
-  VALUE_TYPE *rcM;
-  
-  unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
-  unsigned int gridSize =  gridDim.x * blockDim.x;
-  
-  while (i < rows) {
-    int j; 
-    int hackId = i / hackSize;
-    int hackLaneId = i % hackSize;
-    
-    int hackOffset;
-    unsigned int laneId = threadIdx.x % 32;
-    if (laneId == 0)
-      hackOffset = hackOffsets[hackId];
-    //__syncthreads();
-    hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;		
-    
-    rrP = (int *) rP + hackOffset; 
-    rcM = (VALUE_TYPE *) cM + hackOffset; 
-    
-    int rowSize = rS[i];
-    for (int k=0; k<count; k++) {
-      temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
-    }      
-    
-    for (int j = 0; j < rowSize; j++) {
-      int pointer;
-      VALUE_TYPE value;
-      VALUE_TYPE fetch;
-      
-      pointer = rrP[0] - baseIndex;
-      rrP += hackSize;
-      
-      value = rcM[0];
-      rcM += hackSize;
-
-      px = (VALUE_TYPE *) x;
-      for (int k=0; k<count; k++) {
-	fetch = px[pointer]; 
-	temp[k][threadIdx.x] =
-	  CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
-	px = px + xPitch;
-      }
-    }
-    // Since z and y are accessed with the same offset by the same thread,
-    // and the write to z follows the y read, y and z can share the same base address (in-place computing).
-    py = (VALUE_TYPE *) y;
-    pz = z;
-    if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
-      for (int k=0; k<count; k++) {
-	yVal = py[i];
-	pz[i] = CONCAT(VALUE_TYPE, _fma)(beta,
-					 yVal, CONCAT(VALUE_TYPE, _mul) (alpha,  temp[k][threadIdx.x]));
-	py += yPitch;
-	pz += zPitch;
-      }
-    } else {
-      for (int k=0; k<count; k++) {
-	pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha,  temp[k][threadIdx.x]);
-	pz += zPitch;
-      }
-    }
-    
-    i += gridSize;
-  }
-}
-
-
-void
-GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
-     (spgpuHandle_t handle,
-        int count, 
-        VALUE_TYPE* z,
-        int zPitch,
-	const VALUE_TYPE *y,
-        int  yPitch,
-	VALUE_TYPE alpha, 
-	const VALUE_TYPE* cM, 
-	const int* rP, 
-	int hackSize,
-	const __device int* hackOffsets, 
-	const __device int* rS,
-	const __device int* rIdx, 
-	int rows, 
-	const VALUE_TYPE *x,
-        int xPitch,
-	VALUE_TYPE beta, 
-      int baseIndex)
-{
-  VALUE_TYPE *px,*py, *pz;
-  int cnt, c1;
-
-  dim3 block (THREAD_BLOCK, 1);
-  //  dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
-  // Should we generalize the code to 1/2/4/8 threads per row?
-  // And maybe adjust THREAD_BLOCK size? 
-  int shrMemSize,maxShmemSz;
-  int numMp=getGPUMultiProcessors();
-  int maxThMp=getGPUMaxThreadsPerMP();
-  int nmblksMp=maxThMp/THREAD_BLOCK;
-  int nmblk=nmblksMp*numMp;
-  dim3 grid (nmblk);
-
-  maxShmemSz=getGPUSharedMemPerBlock();
-  shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
-  if (shrMemSize > maxShmemSz) {
-    fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz);
-    return;
-  }
-  cnt = count;
-  px = (VALUE_TYPE *) x;
-  py = (VALUE_TYPE *) y;
-  pz = (VALUE_TYPE *) z;	  
-  while (cnt > 2*MMBSZ) {
-    CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) 
-      <<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch,
-							      alpha, cM, rP, hackSize, hackOffsets,
-							      rS, rows, px, xPitch, beta, baseIndex);
-    px += xPitch*MMBSZ;
-    py += yPitch*MMBSZ;
-    pz += zPitch*MMBSZ;
-    cnt -= MMBSZ;
-  }
-  if (cnt > MMBSZ) {
-    c1 = cnt/2;
-    CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) 
-      <<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch,
-							      alpha, cM, rP, hackSize, hackOffsets,
-							      rS, rows, px, xPitch, beta, baseIndex);
-    cnt -= c1;
-  }
-  if (cnt > MMBSZ) {
-    fprintf(stderr,"Invalid residual count %d\n",cnt);
-  } else if (cnt > 0){
-    CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) 
-      <<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch,
-							      alpha, cM, rP, hackSize, hackOffsets,
-							      rS, rows, px, xPitch, beta, baseIndex);
-  }
-  cudaCheckError("CUDA error on hell_spmm");
-}
-
-#elif defined(NEW_MM)
-
-#define MMBSZ 8
-
-#undef GEN_SPGPU_HELL_NAME
-#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm)
-#undef GEN_SPGPU_HELL_NAME_VANILLA
-#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
-
-
-__global__ void
-CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
-  (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
-   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
-   int hackSize, const int* hackOffsets, const int* rS, int rows,
-   const VALUE_TYPE *x, int xPitch, 
-   VALUE_TYPE beta, int baseIndex)
-{
-  VALUE_TYPE *pz,*px,*py;
-  VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
-  VALUE_TYPE yVal; 
-  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
-        
-  int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
-    
-  if (i < rows) {
-    int j; 
-    int hackId = i / hackSize;
-    int hackLaneId = i % hackSize;
-    
-    int hackOffset;
-    unsigned int laneId = threadIdx.x % 32;
-    if (laneId == 0)
-      hackOffset = hackOffsets[hackId];
-    //__syncthreads();
-    hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;		
-    
-    rP += hackOffset; 
-    cM += hackOffset; 
-    
-    int rowSize = rS[i];
-    for (int k=0; k<count; k++) {
-      temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
-    }      
-    
-    for (int j = 0; j < rowSize; j++) {
-      int pointer;
-      VALUE_TYPE value;
-      VALUE_TYPE fetch;
-      
-      pointer = rP[0] - baseIndex;
-      rP += hackSize;
-      
-      value = cM[0];
-      cM += hackSize;
-
-      px = (VALUE_TYPE *) x;
-      for (int k=0; k<count; k++) {
-	fetch = px[pointer]; 
-	temp[k][threadIdx.x] =
-	  CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
-	px = px + xPitch;
-      }
-    }
-    // Since z and y are accessed with the same offset by the same thread,
-    // and the write to z follows the y read, y and z can share the same base address (in-place computing).
-    py = (VALUE_TYPE *) y;
-    pz = z;
-    if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
-      for (int k=0; k<count; k++) {
-	yVal = py[i];
-	pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha,  temp[k][threadIdx.x]));
-	py += yPitch;
-	pz += zPitch;
-      }
-    else
-      for (int k=0; k<count; k++) {
-	pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha,  temp[k][threadIdx.x]);
-	pz += zPitch;
-      }
-  }
-}
-
-
-void
-CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL))
-  (spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
-   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int hackSize, const int* hackOffsets,
-   const int* rS,  const __device int* rIdx, int rows,
-   const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
-{
-  dim3 block (THREAD_BLOCK, 1);
-  dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
-  // Should we generalize the code to 1/2/4/8 threads per row?
-  // And maybe adjust THREAD_BLOCK size? 
-  int shrMemSize,maxShmemSz;
-  maxShmemSz=getGPUSharedMemPerBlock();
-  shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
-  CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) 
-    <<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch,y, yPitch,
-							    alpha, cM, rP, hackSize, hackOffsets, rS, rows,
-							    x, xPitch, beta, baseIndex);
-}
-
-void
-GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
-     (spgpuHandle_t handle,
-        int count, 
-        VALUE_TYPE* z,
-        int zPitch,
-	const VALUE_TYPE *y,
-        int  yPitch,
-	VALUE_TYPE alpha, 
-	const VALUE_TYPE* cM, 
-	const int* rP, 
-	int hackSize,
-	const __device int* hackOffsets, 
-	const __device int* rS,
-	const __device int* rIdx, 
-	int rows, 
-	const VALUE_TYPE *x,
-        int xPitch,
-	VALUE_TYPE beta, 
-      int baseIndex)
-{
-  VALUE_TYPE *px,*py, *pz;
-  int cnt;
-  int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
-  
-  // maxNForACall should be a multiple of hackSize
-  maxNForACall = (maxNForACall/hackSize)*hackSize;
-  int maxShmemSz;
-  maxShmemSz=getGPUSharedMemPerBlock();
-  //fprintf(stderr,"MaxSHmemSz  %d \n",maxShmemSz);
-  while (rows > maxNForACall) {//managing large vectors
-    cnt = count;
-    px = (VALUE_TYPE *) x;
-    py = (VALUE_TYPE *) y;
-    pz = (VALUE_TYPE *) z;	  
-    while (cnt > MMBSZ) {
-      //fprintf(stderr,"counts %d %d %d :  pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);    
-      CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
-							  py, yPitch,
-							  alpha, cM, rP,
-							  hackSize, hackOffsets,
-							  rS, rIdx,
-							  maxNForACall,
-							  px, xPitch, beta, baseIndex);
-      px += xPitch*MMBSZ;
-      py += yPitch*MMBSZ;
-      pz += zPitch*MMBSZ;
-      cnt -= MMBSZ;
-    }
-    if (cnt >0) {
-      CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
-							  py, yPitch,
-							  alpha, cM, rP,
-							  hackSize, hackOffsets,
-							  rS, rIdx,
-							  maxNForACall,
-							  px, xPitch, beta, baseIndex);
-    }
-
-    y = y + maxNForACall;
-    z = z + maxNForACall;
-    hackOffsets = hackOffsets + maxNForACall/hackSize;
-    rS = rS + maxNForACall;
-    
-    rows -= maxNForACall;
-  }
-  cnt = count;
-  px = (VALUE_TYPE *) x;
-  py = (VALUE_TYPE *) y;
-  pz = (VALUE_TYPE *) z;	  
-  while (cnt > MMBSZ) {
-    //fprintf(stderr,"counts %d %d %d :  pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);    
-    CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch,
-							alpha, cM, rP, hackSize, hackOffsets,
-							rS, rIdx, rows,
-							px, xPitch, beta, baseIndex);
-    px += xPitch*MMBSZ;
-    py += yPitch*MMBSZ;
-    pz += zPitch*MMBSZ;
-    cnt -= MMBSZ;
-  }
-  if (cnt >0) {
-    CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
-							py, yPitch,
-							alpha, cM, rP,
-							hackSize, hackOffsets,
-							rS, rIdx,
-							rows,
-							px, xPitch, beta, baseIndex);
-  }
-  
-  
-  cudaCheckError("CUDA error on hell_spmm");
-}
-
-
-#endif
-
diff --git a/cuda/spgpu/kernels/hell_spmm_base.cuh b/cuda/spgpu/kernels/hell_spmm_base.cuh
new file mode 100644
index 00000000..1aaf6b85
--- /dev/null
+++ b/cuda/spgpu/kernels/hell_spmm_base.cuh
@@ -0,0 +1,275 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ *  
+ * Copyright (C) 2010 - 2015
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define PRE_CONCAT(A, B) A ## B
+#define CONCAT(A, B) PRE_CONCAT(A, B)
+
+#undef GEN_SPGPU_HELL_NAME
+#undef X_TEX
+#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
+
+__device__ __host__ static float zero_float() { return 0.0f; }
+__device__ __host__ static cuFloatComplex zero_cuFloatComplex() { return make_cuFloatComplex(0.0, 0.0); }
+__device__ __host__ static bool float_isNotZero(float x) { return x != 0.0f; }
+
+__device__ static float float_fma(float a, float b, float c) { return PREC_FADD(PREC_FMUL (a, b), c); }
+__device__ static float float_add(float a, float b) { return PREC_FADD (a, b); }
+__device__ static float float_mul(float a, float b) { return PREC_FMUL (a, b); }
+
+__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c) { return cuCfmaf(a, b, c); } 
+__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b) { return cuCaddf(a, b); }
+__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b) { return cuCmulf(a, b); }
+
+__device__ static float readValue_float(float fetch) { return fetch; }
+__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch) { return fetch; }
+
+// host or c.c >= 1.3 
+#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)
+__device__ __host__ static double zero_double() { return 0.0; }
+__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex() { return make_cuDoubleComplex(0.0, 0.0); }
+__device__ __host__ static bool double_isNotZero(double x) { return x != 0.0; }
+
+__device__ static double double_fma(double a, double b, double c) { return PREC_DADD(PREC_DMUL (a, b), c); }
+__device__ static double double_add(double a, double b) { return PREC_DADD (a, b); }
+__device__ static double double_mul(double a, double b) { return PREC_DMUL (a, b); }
+
+__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c) { return cuCfma(a, b, c); }
+__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a, b); }
+__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a, b); }
+
+__device__ static double readValue_double(int2 fetch) { return __hiloint2double (fetch.y, fetch.x); }
+__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch) 
+{
+	cuDoubleComplex c;
+	c.x = __hiloint2double (fetch.y, fetch.x);
+	c.y = __hiloint2double (fetch.w, fetch.z);
+	return c;
+}
+#endif
+#if 0 
+// Texture cache management
+texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
+
+#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
+#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
+
+__device__ static VALUE_TYPE 
+fetchTex (int pointer)
+{
+	TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
+	return CONCAT(readValue_,VALUE_TYPE) (fetch);
+}
+#endif
+#if __CUDA_ARCH__ < 300
+extern __shared__ int dynShrMem[]; 
+#endif
+
+#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
+#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
+#include "hell_spmm_base_template.cuh"
+#undef GEN_SPGPU_HELL_NAME
+#if 0
+#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch)
+#define GEN_SPGPU_HELL_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch)
+#undef USE_PREFETCHING
+#define USE_PREFETCHING
+#include "hell_spmm_base_template.cuh"
+#define ENABLE_CACHE
+#undef GEN_SPGPU_HELL_NAME
+#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch)
+#define GEN_SPGPU_HELL_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch)
+#include "hell_spmm_base_template.cuh"
+#undef GEN_SPGPU_HELL_NAME
+#undef USE_PREFETCHING
+#endif
+#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache)
+#define GEN_SPGPU_HELL_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache)
+#include "hell_spmm_base_template.cuh"
+#undef GEN_SPGPU_HELL_NAME
+#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm)
+
+#if 0
+
+void
+GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
+(spgpuHandle_t handle,
+    int count, 
+    VALUE_TYPE* z,
+    int zPitch,
+	const VALUE_TYPE *y,
+    int  yPitch,
+	VALUE_TYPE alpha, 
+	const VALUE_TYPE* cM, 
+	const int* rP, 
+	int hackSize,
+	const __device int* hackOffsets, 
+	const __device int* rS,
+	const __device int* rIdx, 
+	int rows, 
+	const VALUE_TYPE *x,
+    int xPitch,
+	VALUE_TYPE beta, 
+    int baseIndex)
+{
+  VALUE_TYPE *px,*py, *pz;
+  int cnt, c1;
+
+  dim3 block (THREAD_BLOCK, 1);
+  //  dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
+  // Should we generalize the code to 1/2/4/8 threads per row?
+  // And maybe adjust THREAD_BLOCK size? 
+  int shrMemSize,maxShmemSz;
+  int numMp=getGPUMultiProcessors();
+  int maxThMp=getGPUMaxThreadsPerMP();
+  int nmblksMp=maxThMp/THREAD_BLOCK;
+  int nmblk=nmblksMp*numMp;
+  dim3 grid (nmblk);
+
+  maxShmemSz=getGPUSharedMemPerBlock();
+  shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
+  if (shrMemSize > maxShmemSz) {
+    fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz);
+    return;
+  }
+  cnt = count;
+  px = (VALUE_TYPE *) x;
+  py = (VALUE_TYPE *) y;
+  pz = (VALUE_TYPE *) z;	  
+  while (cnt > 2*MMBSZ) {
+    CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) 
+      <<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch,
+							      alpha, cM, rP, hackSize, hackOffsets,
+							      rS, rows, px, xPitch, beta, baseIndex);
+    px += xPitch*MMBSZ;
+    py += yPitch*MMBSZ;
+    pz += zPitch*MMBSZ;
+    cnt -= MMBSZ;
+  }
+  if (cnt > MMBSZ) {
+    c1 = cnt/2;
+    CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) 
+      <<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch,
+							      alpha, cM, rP, hackSize, hackOffsets,
+							      rS, rows, px, xPitch, beta, baseIndex);
+    cnt -= c1;
+  }
+  if (cnt > MMBSZ) {
+    fprintf(stderr,"Invalid residual count %d\n",cnt);
+  } else if (cnt > 0){
+    CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) 
+      <<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch,
+							      alpha, cM, rP, hackSize, hackOffsets,
+							      rS, rows, px, xPitch, beta, baseIndex);
+  }
+  cudaCheckError("CUDA error on hell_spmm");
+}
+
+#endif
+
+void
+GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
+(spgpuHandle_t handle,
+    int count, 
+    VALUE_TYPE* z,
+    int zPitch,
+	const VALUE_TYPE *y,
+    int  yPitch,
+	VALUE_TYPE alpha, 
+	const VALUE_TYPE* cM, 
+	const int* rP, 
+	int hackSize,
+	const __device int* hackOffsets, 
+	const __device int* rS,
+	const __device int* rIdx, 
+	int rows, 
+	const VALUE_TYPE *x,
+    int xPitch,
+	VALUE_TYPE beta, 
+    int baseIndex)
+{
+  VALUE_TYPE *px,*py, *pz;
+  int cnt;
+  int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
+  
+  // maxNForACall should be a multiple of hackSize
+  maxNForACall = (maxNForACall/hackSize)*hackSize;
+  int maxShmemSz;
+  maxShmemSz=getGPUSharedMemPerBlock();
+  //fprintf(stderr,"MaxSHmemSz  %d \n",maxShmemSz);
+  while (rows > maxNForACall) {//managing large vectors
+    cnt = count;
+    px = (VALUE_TYPE *) x;
+    py = (VALUE_TYPE *) y;
+    pz = (VALUE_TYPE *) z;	  
+    while (cnt > MMBSZ) {
+      //fprintf(stderr,"counts %d %d %d :  pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);    
+      CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
+							  py, yPitch,
+							  alpha, cM, rP,
+							  hackSize, hackOffsets,
+							  rS, rIdx,
+							  maxNForACall,
+							  px, xPitch, beta, baseIndex);
+      px += xPitch*MMBSZ;
+      py += yPitch*MMBSZ;
+      pz += zPitch*MMBSZ;
+      cnt -= MMBSZ;
+    }
+    if (cnt >0) {
+      CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
+							  py, yPitch,
+							  alpha, cM, rP,
+							  hackSize, hackOffsets,
+							  rS, rIdx,
+							  maxNForACall,
+							  px, xPitch, beta, baseIndex);
+    }
+
+    y = y + maxNForACall;
+    z = z + maxNForACall;
+    hackOffsets = hackOffsets + maxNForACall/hackSize;
+    rS = rS + maxNForACall;
+    
+    rows -= maxNForACall;
+  }
+  cnt = count;
+  px = (VALUE_TYPE *) x;
+  py = (VALUE_TYPE *) y;
+  pz = (VALUE_TYPE *) z;	  
+  while (cnt > MMBSZ) {
+    //fprintf(stderr,"counts %d %d %d :  pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);    
+    CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch,
+							alpha, cM, rP, hackSize, hackOffsets,
+							rS, rIdx, rows,
+							px, xPitch, beta, baseIndex);
+    px += xPitch*MMBSZ;
+    py += yPitch*MMBSZ;
+    pz += zPitch*MMBSZ;
+    cnt -= MMBSZ;
+  }
+  if (cnt >0) {
+    CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
+							py, yPitch,
+							alpha, cM, rP,
+							hackSize, hackOffsets,
+							rS, rIdx,
+							rows,
+							px, xPitch, beta, baseIndex);
+  }
+  
+  
+  cudaCheckError("CUDA error on hell_spmm");
+}
diff --git a/cuda/spgpu/kernels/hell_spmm_base_template.cuh b/cuda/spgpu/kernels/hell_spmm_base_template.cuh
new file mode 100644
index 00000000..1d085875
--- /dev/null
+++ b/cuda/spgpu/kernels/hell_spmm_base_template.cuh
@@ -0,0 +1,197 @@
+/* 
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2015
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#define IDX2
+#define THREAD_BLOCK 128
+#define MMBSZ 8
+
+#if 0
+
+__global__ void
+CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
+  (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
+   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
+   int hackSize, const int* hackOffsets, const int* rS, int rows,
+   const VALUE_TYPE *x, int xPitch, 
+   VALUE_TYPE beta, int baseIndex)
+{
+  VALUE_TYPE *pz,*px,*py;
+  VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
+  VALUE_TYPE yVal; 
+  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
+
+  int *rrP;
+  VALUE_TYPE *rcM;
+  
+  unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
+  unsigned int gridSize =  gridDim.x * blockDim.x;
+  
+  while (i < rows) {
+    int j; 
+    int hackId = i / hackSize;
+    int hackLaneId = i % hackSize;
+    
+    int hackOffset;
+    unsigned int laneId = threadIdx.x % 32;
+    if (laneId == 0)
+      hackOffset = hackOffsets[hackId];
+    //__syncthreads();
+    hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;		
+    
+    rrP = (int *) rP + hackOffset; 
+    rcM = (VALUE_TYPE *) cM + hackOffset; 
+    
+    int rowSize = rS[i];
+    for (int k=0; k<count; k++) {
+      temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
+    }      
+    
+    for (int j = 0; j < rowSize; j++) {
+      int pointer;
+      VALUE_TYPE value;
+      VALUE_TYPE fetch;
+      
+      pointer = rrP[0] - baseIndex;
+      rrP += hackSize;
+      
+      value = rcM[0];
+      rcM += hackSize;
+
+      px = (VALUE_TYPE *) x;
+      for (int k=0; k<count; k++) {
+	fetch = px[pointer]; 
+	temp[k][threadIdx.x] =
+	  CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
+	px = px + xPitch;
+      }
+    }
+    // Since z and y are accessed with the same offset by the same thread,
+    // and the write to z follows the y read, y and z can share the same base address (in-place computing).
+    py = (VALUE_TYPE *) y;
+    pz = z;
+    if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
+      for (int k=0; k<count; k++) {
+	yVal = py[i];
+	pz[i] = CONCAT(VALUE_TYPE, _fma)(beta,
+					 yVal, CONCAT(VALUE_TYPE, _mul) (alpha,  temp[k][threadIdx.x]));
+	py += yPitch;
+	pz += zPitch;
+      }
+    } else {
+      for (int k=0; k<count; k++) {
+	pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha,  temp[k][threadIdx.x]);
+	pz += zPitch;
+      }
+    }
+    
+    i += gridSize;
+  }
+}
+
+#endif
+
+__global__ void
+CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
+  (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
+   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
+   int hackSize, const int* hackOffsets, const int* rS, int rows,
+   const VALUE_TYPE *x, int xPitch, 
+   VALUE_TYPE beta, int baseIndex)
+{
+  VALUE_TYPE *pz,*px,*py;
+  VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
+  VALUE_TYPE yVal; 
+  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
+        
+  int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
+    
+  if (i < rows) {
+    int j; 
+    int hackId = i / hackSize;
+    int hackLaneId = i % hackSize;
+    
+    int hackOffset;
+    unsigned int laneId = threadIdx.x % 32;
+    if (laneId == 0)
+      hackOffset = hackOffsets[hackId];
+    //__syncthreads();
+    hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;		
+    
+    rP += hackOffset; 
+    cM += hackOffset; 
+    
+    int rowSize = rS[i];
+    for (int k=0; k<count; k++) {
+      temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
+    }      
+    
+    for (int j = 0; j < rowSize; j++) {
+      int pointer;
+      VALUE_TYPE value;
+      VALUE_TYPE fetch;
+      
+      pointer = rP[0] - baseIndex;
+      rP += hackSize;
+      
+      value = cM[0];
+      cM += hackSize;
+
+      px = (VALUE_TYPE *) x;
+      for (int k=0; k<count; k++) {
+	fetch = px[pointer]; 
+	temp[k][threadIdx.x] =
+	  CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
+	px = px + xPitch;
+      }
+    }
+    // Since z and y are accessed with the same offset by the same thread,
+    // and the write to z follows the y read, y and z can share the same base address (in-place computing).
+    py = (VALUE_TYPE *) y;
+    pz = z;
+    if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
+      for (int k=0; k<count; k++) {
+	yVal = py[i];
+	pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha,  temp[k][threadIdx.x]));
+	py += yPitch;
+	pz += zPitch;
+      }
+    else
+      for (int k=0; k<count; k++) {
+	pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha,  temp[k][threadIdx.x]);
+	pz += zPitch;
+      }
+  }
+}
+
+
+void
+CONCAT(_,GEN_SPGPU_HELL_NAME(TYPE_SYMBOL))
+  (spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
+   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int hackSize, const int* hackOffsets,
+   const int* rS,  const __device int* rIdx, int rows,
+   const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
+{
+  dim3 block (THREAD_BLOCK, 1);
+  dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
+  // Should we generalize the code to 1/2/4/8 threads per row?
+  // And maybe adjust THREAD_BLOCK size? 
+  int shrMemSize,maxShmemSz;
+  maxShmemSz=getGPUSharedMemPerBlock();
+  shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
+  CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) 
+    <<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch,y, yPitch,
+							    alpha, cM, rP, hackSize, hackOffsets, rS, rows,
+							    x, xPitch, beta, baseIndex);
+}
diff --git a/cuda/spgpu/kernels/hell_sspmm.cu b/cuda/spgpu/kernels/hell_sspmm.cu
new file mode 100644
index 00000000..7f50c035
--- /dev/null
+++ b/cuda/spgpu/kernels/hell_sspmm.cu
@@ -0,0 +1,34 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2014
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "cudadebug.h"
+#include "cudalang.h"
+
+extern "C"
+{
+#include "core.h"
+#include "hell.h"
+  int getGPUSharedMemPerBlock();
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+}
+
+#include "debug.h"
+
+#define VALUE_TYPE float
+#define TYPE_SYMBOL S
+#define TEX_FETCH_TYPE float
+#include "hell_spmm_base.cuh"
diff --git a/cuda/spgpu/kernels/hell_zspmm.cu b/cuda/spgpu/kernels/hell_zspmm.cu
new file mode 100644
index 00000000..13a12046
--- /dev/null
+++ b/cuda/spgpu/kernels/hell_zspmm.cu
@@ -0,0 +1,35 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2014
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "cudadebug.h"
+#include "cudalang.h"
+#include "cuComplex.h"
+
+extern "C"
+{
+#include "core.h"
+#include "hell.h"
+  int getGPUSharedMemPerBlock();
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+}
+
+#include "debug.h"
+
+#define VALUE_TYPE cuDoubleComplex
+#define TYPE_SYMBOL Z
+#define TEX_FETCH_TYPE int4
+#include "hell_spmm_base.cuh"