SpMM HDIAG working

9 months ago · 4ff0f112a9
parent bdd04a6911
commit 4ff0f112a9
20 changed files with 905 additions and 156 deletions
--- a/base/modules/serial/psb_c_base_mat_mod.F90
+++ b/base/modules/serial/psb_c_base_mat_mod.F90
@ -104,33 +104,34 @@ module psb_c_base_mat_mod
    !
    ! Computational methods: defined here but not implemented.
    !
-    procedure, pass(a) :: vect_mv     => psb_c_base_vect_mv
-    procedure, pass(a) :: csmv        => psb_c_base_csmv
-    procedure, pass(a) :: csmm        => psb_c_base_csmm
-    generic, public    :: spmm        => csmm, csmv, vect_mv
-    procedure, pass(a) :: in_vect_sv  => psb_c_base_inner_vect_sv
-    procedure, pass(a) :: inner_cssv  => psb_c_base_inner_cssv
-    procedure, pass(a) :: inner_cssm  => psb_c_base_inner_cssm
-    generic, public    :: inner_spsm  => inner_cssm, inner_cssv, in_vect_sv
-    procedure, pass(a) :: vect_cssv   => psb_c_base_vect_cssv
-    procedure, pass(a) :: cssv        => psb_c_base_cssv
-    procedure, pass(a) :: cssm        => psb_c_base_cssm
-    generic, public    :: spsm        => cssm, cssv, vect_cssv
-    procedure, pass(a) :: scals       => psb_c_base_scals
-    procedure, pass(a) :: scalv       => psb_c_base_scal
-    generic, public    :: scal        => scals, scalv
-    procedure, pass(a) :: maxval      => psb_c_base_maxval
-    procedure, pass(a) :: spnmi       => psb_c_base_csnmi
-    procedure, pass(a) :: spnm1       => psb_c_base_csnm1
-    procedure, pass(a) :: rowsum      => psb_c_base_rowsum
-    procedure, pass(a) :: arwsum      => psb_c_base_arwsum
-    procedure, pass(a) :: colsum      => psb_c_base_colsum
-    procedure, pass(a) :: aclsum      => psb_c_base_aclsum
-    procedure, pass(a) :: scalpid     => psb_c_base_scalplusidentity
-    procedure, pass(a) :: spaxpby     => psb_c_base_spaxpby
-    procedure, pass(a) :: cmpval      => psb_c_base_cmpval
-    procedure, pass(a) :: cmpmat      => psb_c_base_cmpmat
-    generic, public    :: spcmp       => cmpval, cmpmat
+    procedure, pass(a) :: vect_mv      => psb_c_base_vect_mv
+    procedure, pass(a) :: multivect_mv => psb_c_base_multivect_mv
+    procedure, pass(a) :: csmv         => psb_c_base_csmv
+    procedure, pass(a) :: csmm         => psb_c_base_csmm
+    generic, public    :: spmm         => csmm, csmv, vect_mv, multivect_mv
+    procedure, pass(a) :: in_vect_sv   => psb_c_base_inner_vect_sv
+    procedure, pass(a) :: inner_cssv   => psb_c_base_inner_cssv
+    procedure, pass(a) :: inner_cssm   => psb_c_base_inner_cssm
+    generic, public    :: inner_spsm   => inner_cssm, inner_cssv, in_vect_sv
+    procedure, pass(a) :: vect_cssv    => psb_c_base_vect_cssv
+    procedure, pass(a) :: cssv         => psb_c_base_cssv
+    procedure, pass(a) :: cssm         => psb_c_base_cssm
+    generic, public    :: spsm         => cssm, cssv, vect_cssv
+    procedure, pass(a) :: scals        => psb_c_base_scals
+    procedure, pass(a) :: scalv        => psb_c_base_scal
+    generic, public    :: scal         => scals, scalv
+    procedure, pass(a) :: maxval       => psb_c_base_maxval
+    procedure, pass(a) :: spnmi        => psb_c_base_csnmi
+    procedure, pass(a) :: spnm1        => psb_c_base_csnm1
+    procedure, pass(a) :: rowsum       => psb_c_base_rowsum
+    procedure, pass(a) :: arwsum       => psb_c_base_arwsum
+    procedure, pass(a) :: colsum       => psb_c_base_colsum
+    procedure, pass(a) :: aclsum       => psb_c_base_aclsum
+    procedure, pass(a) :: scalpid      => psb_c_base_scalplusidentity
+    procedure, pass(a) :: spaxpby      => psb_c_base_spaxpby
+    procedure, pass(a) :: cmpval       => psb_c_base_cmpval
+    procedure, pass(a) :: cmpmat       => psb_c_base_cmpmat
+    generic, public    :: spcmp        => cmpval, cmpmat
  end type psb_c_base_sparse_mat

  private :: c_base_mat_sync, c_base_mat_is_host, c_base_mat_is_dev, &
@ -1240,6 +1241,42 @@ module psb_c_base_mat_mod
    end subroutine psb_c_base_vect_mv
  end interface

+  !> Function  multivect_mv:
+  !! \memberof  psb_c_base_sparse_mat
+  !! \brief Product by an encapsulated array type(psb_c_multivect_type)
+  !!
+  !!        Compute
+  !!           Y = alpha*op(A)*X + beta*Y
+  !!        Usually the unwrapping of the encapsulated multivector is done
+  !!        here, so that all the derived classes need only the
+  !!        versions with the standard arrays.
+  !!        Must be overridden explicitly in case of non standard memory
+  !!        management; an example would be external memory allocation
+  !!        in attached processors such as GPUs.
+  !!
+  !!
+  !! \param alpha  Scaling factor for Ax
+  !! \param A      the input sparse matrix
+  !! \param x      the input X
+  !! \param beta   Scaling factor for y
+  !! \param y      the input/output  Y
+  !! \param info   return code
+  !! \param trans  [N] Whether to use A (N), its transpose (T)
+  !!               or its conjugate transpose (C)
+  !!
+  !
+  interface
+    subroutine psb_c_base_multivect_mv(alpha,a,x,beta,y,info,trans)
+      import
+      class(psb_c_base_sparse_mat), intent(in) :: a
+      complex(psb_spk_), intent(in)       :: alpha, beta
+      class(psb_c_base_multivect_type), intent(inout) :: x
+      class(psb_c_base_multivect_type), intent(inout) :: y
+      integer(psb_ipk_), intent(out)             :: info
+      character, optional, intent(in)  :: trans
+    end subroutine psb_c_base_multivect_mv
+  end interface
+
  !
  !> Function  cssm:
  !! \memberof  psb_c_base_sparse_mat
--- a/base/modules/serial/psb_s_base_mat_mod.F90
+++ b/base/modules/serial/psb_s_base_mat_mod.F90
@ -104,33 +104,34 @@ module psb_s_base_mat_mod
    !
    ! Computational methods: defined here but not implemented.
    !
-    procedure, pass(a) :: vect_mv     => psb_s_base_vect_mv
-    procedure, pass(a) :: csmv        => psb_s_base_csmv
-    procedure, pass(a) :: csmm        => psb_s_base_csmm
-    generic, public    :: spmm        => csmm, csmv, vect_mv
-    procedure, pass(a) :: in_vect_sv  => psb_s_base_inner_vect_sv
-    procedure, pass(a) :: inner_cssv  => psb_s_base_inner_cssv
-    procedure, pass(a) :: inner_cssm  => psb_s_base_inner_cssm
-    generic, public    :: inner_spsm  => inner_cssm, inner_cssv, in_vect_sv
-    procedure, pass(a) :: vect_cssv   => psb_s_base_vect_cssv
-    procedure, pass(a) :: cssv        => psb_s_base_cssv
-    procedure, pass(a) :: cssm        => psb_s_base_cssm
-    generic, public    :: spsm        => cssm, cssv, vect_cssv
-    procedure, pass(a) :: scals       => psb_s_base_scals
-    procedure, pass(a) :: scalv       => psb_s_base_scal
-    generic, public    :: scal        => scals, scalv
-    procedure, pass(a) :: maxval      => psb_s_base_maxval
-    procedure, pass(a) :: spnmi       => psb_s_base_csnmi
-    procedure, pass(a) :: spnm1       => psb_s_base_csnm1
-    procedure, pass(a) :: rowsum      => psb_s_base_rowsum
-    procedure, pass(a) :: arwsum      => psb_s_base_arwsum
-    procedure, pass(a) :: colsum      => psb_s_base_colsum
-    procedure, pass(a) :: aclsum      => psb_s_base_aclsum
-    procedure, pass(a) :: scalpid     => psb_s_base_scalplusidentity
-    procedure, pass(a) :: spaxpby     => psb_s_base_spaxpby
-    procedure, pass(a) :: cmpval      => psb_s_base_cmpval
-    procedure, pass(a) :: cmpmat      => psb_s_base_cmpmat
-    generic, public    :: spcmp       => cmpval, cmpmat
+    procedure, pass(a) :: vect_mv      => psb_s_base_vect_mv
+    procedure, pass(a) :: multivect_mv => psb_s_base_multivect_mv
+    procedure, pass(a) :: csmv         => psb_s_base_csmv
+    procedure, pass(a) :: csmm         => psb_s_base_csmm
+    generic, public    :: spmm         => csmm, csmv, vect_mv, multivect_mv
+    procedure, pass(a) :: in_vect_sv   => psb_s_base_inner_vect_sv
+    procedure, pass(a) :: inner_cssv   => psb_s_base_inner_cssv
+    procedure, pass(a) :: inner_cssm   => psb_s_base_inner_cssm
+    generic, public    :: inner_spsm   => inner_cssm, inner_cssv, in_vect_sv
+    procedure, pass(a) :: vect_cssv    => psb_s_base_vect_cssv
+    procedure, pass(a) :: cssv         => psb_s_base_cssv
+    procedure, pass(a) :: cssm         => psb_s_base_cssm
+    generic, public    :: spsm         => cssm, cssv, vect_cssv
+    procedure, pass(a) :: scals        => psb_s_base_scals
+    procedure, pass(a) :: scalv        => psb_s_base_scal
+    generic, public    :: scal         => scals, scalv
+    procedure, pass(a) :: maxval       => psb_s_base_maxval
+    procedure, pass(a) :: spnmi        => psb_s_base_csnmi
+    procedure, pass(a) :: spnm1        => psb_s_base_csnm1
+    procedure, pass(a) :: rowsum       => psb_s_base_rowsum
+    procedure, pass(a) :: arwsum       => psb_s_base_arwsum
+    procedure, pass(a) :: colsum       => psb_s_base_colsum
+    procedure, pass(a) :: aclsum       => psb_s_base_aclsum
+    procedure, pass(a) :: scalpid      => psb_s_base_scalplusidentity
+    procedure, pass(a) :: spaxpby      => psb_s_base_spaxpby
+    procedure, pass(a) :: cmpval       => psb_s_base_cmpval
+    procedure, pass(a) :: cmpmat       => psb_s_base_cmpmat
+    generic, public    :: spcmp        => cmpval, cmpmat
  end type psb_s_base_sparse_mat

  private :: s_base_mat_sync, s_base_mat_is_host, s_base_mat_is_dev, &
@ -1240,6 +1241,42 @@ module psb_s_base_mat_mod
    end subroutine psb_s_base_vect_mv
  end interface

+  !> Function  multivect_mv:
+  !! \memberof  psb_s_base_sparse_mat
+  !! \brief Product by an encapsulated array type(psb_s_multivect_type)
+  !!
+  !!        Compute
+  !!           Y = alpha*op(A)*X + beta*Y
+  !!        Usually the unwrapping of the encapsulated multivector is done
+  !!        here, so that all the derived classes need only the
+  !!        versions with the standard arrays.
+  !!        Must be overridden explicitly in case of non standard memory
+  !!        management; an example would be external memory allocation
+  !!        in attached processors such as GPUs.
+  !!
+  !!
+  !! \param alpha  Scaling factor for Ax
+  !! \param A      the input sparse matrix
+  !! \param x      the input X
+  !! \param beta   Scaling factor for y
+  !! \param y      the input/output  Y
+  !! \param info   return code
+  !! \param trans  [N] Whether to use A (N), its transpose (T)
+  !!               or its conjugate transpose (C)
+  !!
+  !
+  interface
+    subroutine psb_s_base_multivect_mv(alpha,a,x,beta,y,info,trans)
+      import
+      class(psb_s_base_sparse_mat), intent(in) :: a
+      real(psb_spk_), intent(in)       :: alpha, beta
+      class(psb_s_base_multivect_type), intent(inout) :: x
+      class(psb_s_base_multivect_type), intent(inout) :: y
+      integer(psb_ipk_), intent(out)             :: info
+      character, optional, intent(in)  :: trans
+    end subroutine psb_s_base_multivect_mv
+  end interface
+
  !
  !> Function  cssm:
  !! \memberof  psb_s_base_sparse_mat
--- a/base/modules/serial/psb_z_base_mat_mod.F90
+++ b/base/modules/serial/psb_z_base_mat_mod.F90
@ -104,33 +104,34 @@ module psb_z_base_mat_mod
    !
    ! Computational methods: defined here but not implemented.
    !
-    procedure, pass(a) :: vect_mv     => psb_z_base_vect_mv
-    procedure, pass(a) :: csmv        => psb_z_base_csmv
-    procedure, pass(a) :: csmm        => psb_z_base_csmm
-    generic, public    :: spmm        => csmm, csmv, vect_mv
-    procedure, pass(a) :: in_vect_sv  => psb_z_base_inner_vect_sv
-    procedure, pass(a) :: inner_cssv  => psb_z_base_inner_cssv
-    procedure, pass(a) :: inner_cssm  => psb_z_base_inner_cssm
-    generic, public    :: inner_spsm  => inner_cssm, inner_cssv, in_vect_sv
-    procedure, pass(a) :: vect_cssv   => psb_z_base_vect_cssv
-    procedure, pass(a) :: cssv        => psb_z_base_cssv
-    procedure, pass(a) :: cssm        => psb_z_base_cssm
-    generic, public    :: spsm        => cssm, cssv, vect_cssv
-    procedure, pass(a) :: scals       => psb_z_base_scals
-    procedure, pass(a) :: scalv       => psb_z_base_scal
-    generic, public    :: scal        => scals, scalv
-    procedure, pass(a) :: maxval      => psb_z_base_maxval
-    procedure, pass(a) :: spnmi       => psb_z_base_csnmi
-    procedure, pass(a) :: spnm1       => psb_z_base_csnm1
-    procedure, pass(a) :: rowsum      => psb_z_base_rowsum
-    procedure, pass(a) :: arwsum      => psb_z_base_arwsum
-    procedure, pass(a) :: colsum      => psb_z_base_colsum
-    procedure, pass(a) :: aclsum      => psb_z_base_aclsum
-    procedure, pass(a) :: scalpid     => psb_z_base_scalplusidentity
-    procedure, pass(a) :: spaxpby     => psb_z_base_spaxpby
-    procedure, pass(a) :: cmpval      => psb_z_base_cmpval
-    procedure, pass(a) :: cmpmat      => psb_z_base_cmpmat
-    generic, public    :: spcmp       => cmpval, cmpmat
+    procedure, pass(a) :: vect_mv      => psb_z_base_vect_mv
+    procedure, pass(a) :: multivect_mv => psb_z_base_multivect_mv
+    procedure, pass(a) :: csmv         => psb_z_base_csmv
+    procedure, pass(a) :: csmm         => psb_z_base_csmm
+    generic, public    :: spmm         => csmm, csmv, vect_mv, multivect_mv
+    procedure, pass(a) :: in_vect_sv   => psb_z_base_inner_vect_sv
+    procedure, pass(a) :: inner_cssv   => psb_z_base_inner_cssv
+    procedure, pass(a) :: inner_cssm   => psb_z_base_inner_cssm
+    generic, public    :: inner_spsm   => inner_cssm, inner_cssv, in_vect_sv
+    procedure, pass(a) :: vect_cssv    => psb_z_base_vect_cssv
+    procedure, pass(a) :: cssv         => psb_z_base_cssv
+    procedure, pass(a) :: cssm         => psb_z_base_cssm
+    generic, public    :: spsm         => cssm, cssv, vect_cssv
+    procedure, pass(a) :: scals        => psb_z_base_scals
+    procedure, pass(a) :: scalv        => psb_z_base_scal
+    generic, public    :: scal         => scals, scalv
+    procedure, pass(a) :: maxval       => psb_z_base_maxval
+    procedure, pass(a) :: spnmi        => psb_z_base_csnmi
+    procedure, pass(a) :: spnm1        => psb_z_base_csnm1
+    procedure, pass(a) :: rowsum       => psb_z_base_rowsum
+    procedure, pass(a) :: arwsum       => psb_z_base_arwsum
+    procedure, pass(a) :: colsum       => psb_z_base_colsum
+    procedure, pass(a) :: aclsum       => psb_z_base_aclsum
+    procedure, pass(a) :: scalpid      => psb_z_base_scalplusidentity
+    procedure, pass(a) :: spaxpby      => psb_z_base_spaxpby
+    procedure, pass(a) :: cmpval       => psb_z_base_cmpval
+    procedure, pass(a) :: cmpmat       => psb_z_base_cmpmat
+    generic, public    :: spcmp        => cmpval, cmpmat
  end type psb_z_base_sparse_mat

  private :: z_base_mat_sync, z_base_mat_is_host, z_base_mat_is_dev, &
@ -1240,6 +1241,42 @@ module psb_z_base_mat_mod
    end subroutine psb_z_base_vect_mv
  end interface

+  !> Function  multivect_mv:
+  !! \memberof  psb_z_base_sparse_mat
+  !! \brief Product by an encapsulated array type(psb_z_multivect_type)
+  !!
+  !!        Compute
+  !!           Y = alpha*op(A)*X + beta*Y
+  !!        Usually the unwrapping of the encapsulated multivector is done
+  !!        here, so that all the derived classes need only the
+  !!        versions with the standard arrays.
+  !!        Must be overridden explicitly in case of non standard memory
+  !!        management; an example would be external memory allocation
+  !!        in attached processors such as GPUs.
+  !!
+  !!
+  !! \param alpha  Scaling factor for Ax
+  !! \param A      the input sparse matrix
+  !! \param x      the input X
+  !! \param beta   Scaling factor for y
+  !! \param y      the input/output  Y
+  !! \param info   return code
+  !! \param trans  [N] Whether to use A (N), its transpose (T)
+  !!               or its conjugate transpose (C)
+  !!
+  !
+  interface
+    subroutine psb_z_base_multivect_mv(alpha,a,x,beta,y,info,trans)
+      import
+      class(psb_z_base_sparse_mat), intent(in) :: a
+      complex(psb_dpk_), intent(in)       :: alpha, beta
+      class(psb_z_base_multivect_type), intent(inout) :: x
+      class(psb_z_base_multivect_type), intent(inout) :: y
+      integer(psb_ipk_), intent(out)             :: info
+      character, optional, intent(in)  :: trans
+    end subroutine psb_z_base_multivect_mv
+  end interface
+
  !
  !> Function  cssm:
  !! \memberof  psb_z_base_sparse_mat
--- a/base/serial/impl/psb_c_base_mat_impl.F90
+++ b/base/serial/impl/psb_c_base_mat_impl.F90
@ -2012,6 +2012,26 @@ subroutine psb_c_base_vect_mv(alpha,a,x,beta,y,info,trans)
  call y%set_host()
 end subroutine psb_c_base_vect_mv

+subroutine psb_c_base_multivect_mv(alpha,a,x,beta,y,info,trans)
+  use psb_error_mod
+  use psb_const_mod
+  use psb_c_base_mat_mod, psb_protect_name => psb_c_base_multivect_mv
+  implicit none
+  class(psb_c_base_sparse_mat), intent(in) :: a
+  complex(psb_spk_), intent(in)       :: alpha, beta
+  class(psb_c_base_multivect_type), intent(inout) :: x
+  class(psb_c_base_multivect_type), intent(inout) :: y
+  integer(psb_ipk_), intent(out)             :: info
+  character, optional, intent(in)  :: trans
+
+  ! For the time being we just throw everything back
+  ! onto the normal routines.
+  call x%sync()
+  call y%sync()
+  call a%spmm(alpha,x%v,beta,y%v,info,trans)
+  call y%set_host()
+end subroutine psb_c_base_multivect_mv
+
 subroutine psb_c_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
  use psb_c_base_mat_mod, psb_protect_name => psb_c_base_vect_cssv
  use psb_c_base_vect_mod
--- a/base/serial/impl/psb_s_base_mat_impl.F90
+++ b/base/serial/impl/psb_s_base_mat_impl.F90
@ -2012,6 +2012,26 @@ subroutine psb_s_base_vect_mv(alpha,a,x,beta,y,info,trans)
  call y%set_host()
 end subroutine psb_s_base_vect_mv

+subroutine psb_s_base_multivect_mv(alpha,a,x,beta,y,info,trans)
+  use psb_error_mod
+  use psb_const_mod
+  use psb_s_base_mat_mod, psb_protect_name => psb_s_base_multivect_mv
+  implicit none
+  class(psb_s_base_sparse_mat), intent(in) :: a
+  real(psb_spk_), intent(in)       :: alpha, beta
+  class(psb_s_base_multivect_type), intent(inout) :: x
+  class(psb_s_base_multivect_type), intent(inout) :: y
+  integer(psb_ipk_), intent(out)             :: info
+  character, optional, intent(in)  :: trans
+
+  ! For the time being we just throw everything back
+  ! onto the normal routines.
+  call x%sync()
+  call y%sync()
+  call a%spmm(alpha,x%v,beta,y%v,info,trans)
+  call y%set_host()
+end subroutine psb_s_base_multivect_mv
+
 subroutine psb_s_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
  use psb_s_base_mat_mod, psb_protect_name => psb_s_base_vect_cssv
  use psb_s_base_vect_mod
--- a/base/serial/impl/psb_z_base_mat_impl.F90
+++ b/base/serial/impl/psb_z_base_mat_impl.F90
@ -2012,6 +2012,26 @@ subroutine psb_z_base_vect_mv(alpha,a,x,beta,y,info,trans)
  call y%set_host()
 end subroutine psb_z_base_vect_mv

+subroutine psb_z_base_multivect_mv(alpha,a,x,beta,y,info,trans)
+  use psb_error_mod
+  use psb_const_mod
+  use psb_z_base_mat_mod, psb_protect_name => psb_z_base_multivect_mv
+  implicit none
+  class(psb_z_base_sparse_mat), intent(in) :: a
+  complex(psb_dpk_), intent(in)       :: alpha, beta
+  class(psb_z_base_multivect_type), intent(inout) :: x
+  class(psb_z_base_multivect_type), intent(inout) :: y
+  integer(psb_ipk_), intent(out)             :: info
+  character, optional, intent(in)  :: trans
+
+  ! For the time being we just throw everything back
+  ! onto the normal routines.
+  call x%sync()
+  call y%sync()
+  call a%spmm(alpha,x%v,beta,y%v,info,trans)
+  call y%set_host()
+end subroutine psb_z_base_multivect_mv
+
 subroutine psb_z_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
  use psb_z_base_mat_mod, psb_protect_name => psb_z_base_vect_cssv
  use psb_z_base_vect_mod
--- a/cuda/hdiagdev.c
+++ b/cuda/hdiagdev.c
@ -267,6 +267,27 @@ int spmvHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
  return SPGPU_SUCCESS;
 }

+int spmmHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX, 
+			  double beta, void* deviceY)
+{
+  struct HdiagDevice *devMat = (struct HdiagDevice *) deviceMat;
+  struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
+  struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
+  spgpuHandle_t handle=psb_cudaGetHandle();
+
+#ifdef VERBOSE
+  /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
+  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
+  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
+#endif
+  spgpuDhdiaspmm (handle, y->count_, (double*)y->v_, y->pitch_,
+      (double *)y->v_, y->pitch_, alpha, (double *)devMat->cM,
+      devMat->hdiaOffsets, devMat->hackSize, devMat->hackOffsets, 
+      devMat->rows, devMat->cols, (double *)x->v_, x->pitch_, beta);
+
+  return SPGPU_SUCCESS;
+}
+
 int writeHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets)
 { int i=0,fo,fa,j,k,p;
  char buf_a[255], buf_o[255],tmp[255];
@ -384,3 +405,23 @@ int spmvHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
  return SPGPU_SUCCESS;
 }

+int spmmHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX, 
+			  float beta, void* deviceY)
+{
+  struct HdiagDevice *devMat = (struct HdiagDevice *) deviceMat;
+  struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
+  struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
+  spgpuHandle_t handle=psb_cudaGetHandle();
+
+#ifdef VERBOSE
+  /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
+  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
+  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
+#endif
+  spgpuShdiaspmm (handle, y->count_, (float*)y->v_, y->pitch_,
+      (float *)y->v_, y->pitch_, alpha, (float *)devMat->cM,
+      devMat->hdiaOffsets, devMat->hackSize, devMat->hackOffsets, 
+      devMat->rows, devMat->cols, (float *)x->v_, x->pitch_, beta);
+
+  return SPGPU_SUCCESS;
+}
--- a/cuda/hdiagdev.h
+++ b/cuda/hdiagdev.h
@ -95,12 +95,13 @@ int allocHdiagDevice(void ** remoteMatrix, HdiagDeviceParams* params);
 void freeHdiagDevice(void* remoteMatrix);

 int writeHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets);
-int spmvHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX, 
-			  float beta, void* deviceY);
-
 int writeHdiagDeviceDouble(void* deviceMat, double* val, int* hdiaOffsets, int *hackOffsets);
-int spmvHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX, 
-			  double beta, void* deviceY);
+//int writeHdiagDeviceFloatComplex(void* deviceMat, float complex* val, int* hdiaOffsets, int *hackOffsets);
+//int writeHdiagDeviceDoubleComplex(void* deviceMat, double complex* val, int* hdiaOffsets, int *hackOffsets);

+//int readHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets);
+//int readHdiagDeviceDouble(void* deviceMat, double* val, int* hdiaOffsets, int *hackOffsets);
+//int readHdiagDeviceFloatComplex(void* deviceMat, float complex* val, int* hdiaOffsets, int *hackOffsets);
+//int readHdiagDeviceDoubleComplex(void* deviceMat, double complex* val, int* hdiaOffsets, int *hackOffsets);

 #endif
--- a/cuda/hdiagdev_mod.F90
+++ b/cuda/hdiagdev_mod.F90
@ -196,4 +196,35 @@ module hdiagdev_mod
 !!$    end function spmvHdiagDeviceDoubleComplex
  end interface spmvHdiagDevice

+  interface spmmHdiagDevice
+    function spmmHdiagDeviceFloat(deviceMat,alpha,x,beta,y) &
+         & result(res) bind(c,name='spmmHdiagDeviceFloat')
+      use iso_c_binding
+      integer(c_int)		:: res
+      type(c_ptr), value 	:: deviceMat, x, y
+      real(c_float),value     	:: alpha, beta
+    end function spmmHdiagDeviceFloat
+    function spmmHdiagDeviceDouble(deviceMat,alpha,x,beta,y) &
+         & result(res) bind(c,name='spmmHdiagDeviceDouble')
+      use iso_c_binding
+      integer(c_int)		:: res
+      type(c_ptr), value	:: deviceMat, x, y 
+      real(c_double),value     	:: alpha,  beta
+    end function spmmHdiagDeviceDouble
+!!$    function spmmHdiagDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
+!!$         & result(res) bind(c,name='spmmHdiagDeviceFloatComplex')
+!!$      use iso_c_binding
+!!$      integer(c_int)		     :: res
+!!$      type(c_ptr), value	     :: deviceMat, x, y 
+!!$      complex(c_float_complex),value :: alpha,  beta
+!!$    end function spmmHdiagDeviceFloatComplex
+!!$    function spmmHdiagDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
+!!$         & result(res) bind(c,name='spmmHdiagDeviceDoubleComplex')
+!!$      use iso_c_binding
+!!$      integer(c_int)		      :: res
+!!$      type(c_ptr), value	      :: deviceMat, x, y 
+!!$      complex(c_double_complex),value :: alpha,  beta
+!!$    end function spmmHdiagDeviceDoubleComplex
+  end interface spmmHdiagDevice
+  
 end module hdiagdev_mod
--- a/cuda/impl/Makefile
+++ b/cuda/impl/Makefile
@ -288,13 +288,7 @@ psb_s_cuda_hdiag_vect_mv.o \
 psb_s_cuda_dnsg_mat_impl.o \
 psb_d_cuda_dnsg_mat_impl.o \
 psb_c_cuda_dnsg_mat_impl.o \
-psb_z_cuda_dnsg_mat_impl.o \
-psb_z_cuda_hdiag_csmv.o \
-psb_z_cuda_hdiag_csmm.o \
-psb_z_cuda_hdiag_vect_mv.o \
-psb_c_cuda_hdiag_csmv.o \
-psb_c_cuda_hdiag_csmm.o \
-psb_c_cuda_hdiag_vect_mv.o
+psb_z_cuda_dnsg_mat_impl.o

 objs: $(OBJS)
 lib:  objs
--- a/cuda/impl/psb_d_cuda_hdiag_vect_mv.F90
+++ b/cuda/impl/psb_d_cuda_hdiag_vect_mv.F90
@ -124,7 +124,7 @@ subroutine psb_d_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
    use psb_d_cuda_hdiag_mat_mod, psb_protect_name => psb_d_cuda_hdiag_multivect_mv
    use psb_d_cuda_multivect_mod
    implicit none 
-    class(psb_d_cuda_hdiag_mat_mod), intent(in) :: a
+    class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
    real(psb_dpk_), intent(in)       :: alpha, beta
    class(psb_d_base_multivect_type), intent(inout) :: x
    class(psb_d_base_multivect_type), intent(inout) :: y
--- a/cuda/impl/psb_s_cuda_hdiag_csmm.F90
+++ b/cuda/impl/psb_s_cuda_hdiag_csmm.F90
@ -89,7 +89,7 @@ subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)

  if (tra) then 
    if (a%is_dev()) call a%sync()
-    call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans) 
+    call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans) 
  else
    !
    ! Just to test, move X/Y to/from the GPU.
--- a/cuda/impl/psb_s_cuda_hdiag_vect_mv.F90
+++ b/cuda/impl/psb_s_cuda_hdiag_vect_mv.F90
@ -124,7 +124,7 @@ subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
 	use psb_s_cuda_hdiag_mat_mod, psb_protect_name => psb_s_cuda_hdiag_multivect_mv
 	use psb_s_cuda_multivect_mod
 	implicit none 
-	class(psb_s_cuda_hdiag_mat_mod), intent(in) :: a
+	class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
 	real(psb_spk_), intent(in)       :: alpha, beta
 	class(psb_s_base_multivect_type), intent(inout) :: x
 	class(psb_s_base_multivect_type), intent(inout) :: y
@ -159,7 +159,7 @@ subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
 		if (beta /= dzero) then 
 			if (.not.y%is_host()) call y%sync()
 		end if
-		call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans) 
+		call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans) 
 		call y%set_host()
 	else
 		if (a%is_host()) call a%sync()    
--- a/cuda/spgpu/hdia.h
+++ b/cuda/spgpu/hdia.h
@ -27,6 +27,10 @@
 extern "C" {
 #endif

+// DIA/HDIA Compressed Matrix Format routines
+
+/// This is the pitch alignment that must be fullfilled by the coefficients and the row pointers allocations.
+#define DIA_PITCH_ALIGN_BYTE 128

 /** 
 * \fn spgpuShdiaspmv (spgpuHandle_t handle, float* z, const float *y, float alpha, const float* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const float *x, float beta)
@ -59,6 +63,44 @@ spgpuShdiaspmv (spgpuHandle_t handle,
 	float beta);


+/** 
+* \fn spgpuShdiaspmm (int count, float *z, int zpitch, const float *y, int ypitch, float alpha, const __device float* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device float *x, int xpitch, float beta)
+ * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
+ * \param handle The spgpu handle used to call this routine
+ * \param count The cols count
+ * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
+ * \param zpitch The pitch of the output vector
+ * \param y The y input vector
+ * \param ypitch The pitch of the y input vector
+ * \param alpha The alpha scalar
+ * \param dM The stacked HDIA non zero values allocation pointer
+ * \param offsets The stacked HDIA diagonals offsets vector
+ * \param hackSize The constant size of every hack (must be a multiple of 32) 
+ * \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
+ * \param rows the rows count
+ * \param cols the columns count
+ * \param x the x vector
+ * \param xpitch The pitch of the x input vector
+ * \param beta the beta scalar
+ */
+void 
+spgpuShdiaspmm (spgpuHandle_t handle, 
+  int count,
+	__device float *z,
+	int zpitch,
+	const __device float *y,
+	int ypitch,
+	float alpha, 
+	const __device float* dM, 
+	const __device int* offsets, 
+	int hackSize, 
+	const __device int* hackOffsets,
+	int rows,
+	int cols, 
+	const __device float *x, 
+	int xpitch,
+	float beta);
+
 /** 
 * \fn spgpuDhdiaspmv (spgpuHandle_t handle, double* z, const double *y, double alpha, const double* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const double *x, double beta)
 * Computes double precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
@ -89,6 +131,43 @@ spgpuDhdiaspmv (spgpuHandle_t handle,
 	const double *x, 
 	double beta);

+/** 
+* \fn spgpuDhdiaspmm (int count, double *z, int zpitch, const double *y, int ypitch, double alpha, const __device double* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device float *x, int xpitch, double beta)
+ * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
+ * \param handle The spgpu handle used to call this routine
+ * \param count The cols count
+ * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
+ * \param zpitch The pitch of the output vector
+ * \param y The y input vector
+ * \param ypitch The pitch of the y input vector
+ * \param alpha The alpha scalar
+ * \param dM The stacked HDIA non zero values allocation pointer
+ * \param offsets The stacked HDIA diagonals offsets vector
+ * \param hackSize The constant size of every hack (must be a multiple of 32) 
+ * \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
+ * \param rows the rows count
+ * \param cols the columns count
+ * \param x the x vector
+ * \param xpitch The pitch of the x input vector
+ * \param beta the beta scalar
+ */
+void 
+spgpuDhdiaspmm (spgpuHandle_t handle, 
+  int count,
+	__device double *z,
+	int zpitch,
+	const __device double *y,
+	int ypitch,
+	double alpha, 
+	const __device double* dM, 
+	const __device int* offsets, 
+	int hackSize, 
+	const __device int* hackOffsets,
+	int rows,
+	int cols, 
+	const __device double *x, 
+	int xpitch,
+	double beta);

 /** 
 * \fn spgpuChdiaspmv (spgpuHandle_t handle, cuFloatComplex* z, const cuFloatComplex *y, cuFloatComplex alpha, const cuFloatComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuFloatComplex *x, cuFloatComplex beta)
@ -120,6 +199,43 @@ spgpuChdiaspmv (spgpuHandle_t handle,
 	const cuFloatComplex *x, 
 	cuFloatComplex beta);

+/** 
+* \fn spgpuChdiaspmm (int count, cuFloatComplex *z, int zpitch, const cuFloatComplex *y, int ypitch, cuFloatComplex alpha, const __device cuFloatComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuFloatComplex *x, int xpitch, cuFloatComplex beta)
+ * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
+ * \param handle The spgpu handle used to call this routine
+ * \param count The cols count
+ * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
+ * \param zpitch The pitch of the output vector
+ * \param y The y input vector
+ * \param ypitch The pitch of the y input vector
+ * \param alpha The alpha scalar
+ * \param dM The stacked HDIA non zero values allocation pointer
+ * \param offsets The stacked HDIA diagonals offsets vector
+ * \param hackSize The constant size of every hack (must be a multiple of 32) 
+ * \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
+ * \param rows the rows count
+ * \param cols the columns count
+ * \param x the x vector
+ * \param xpitch The pitch of the x input vector
+ * \param beta the beta scalar
+ */
+void 
+spgpuChdiaspmm (spgpuHandle_t handle, 
+  int count,
+	__device cuFloatComplex *z,
+	int zpitch,
+	const __device cuFloatComplex *y,
+	int ypitch,
+	cuFloatComplex alpha, 
+	const __device cuFloatComplex* dM, 
+	const __device int* offsets, 
+	int hackSize, 
+	const __device int* hackOffsets,
+	int rows,
+	int cols, 
+	const __device cuFloatComplex *x, 
+	int xpitch,
+	cuFloatComplex beta);

 /** 
 * \fn spgpuZhdiaspmv (spgpuHandle_t handle, cuDoubleComplex* z, const cuDoubleComplex *y, cuDoubleComplex alpha, const cuDoubleComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuDoubleComplex *x, cuDoubleComplex beta)
@ -150,6 +266,45 @@ spgpuZhdiaspmv (spgpuHandle_t handle,
 	int cols, 
 	const cuDoubleComplex *x, 
 	cuDoubleComplex beta);
+
+/** 
+* \fn spgpuZhdiaspmm (int count, cuDoubleComplex *z, int zpitch, const cuDoubleComplex *y, int ypitch, cuDoubleComplex alpha, const __device cuDoubleComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuDoubleComplex *x, int xpitch, cuDoubleComplex beta)
+ * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
+ * \param handle The spgpu handle used to call this routine
+ * \param count The cols count
+ * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
+ * \param zpitch The pitch of the output vector
+ * \param y The y input vector
+ * \param ypitch The pitch of the y input vector
+ * \param alpha The alpha scalar
+ * \param dM The stacked HDIA non zero values allocation pointer
+ * \param offsets The stacked HDIA diagonals offsets vector
+ * \param hackSize The constant size of every hack (must be a multiple of 32) 
+ * \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
+ * \param rows the rows count
+ * \param cols the columns count
+ * \param x the x vector
+ * \param xpitch The pitch of the x input vector
+ * \param beta the beta scalar
+ */
+void 
+spgpuZhdiaspmm (spgpuHandle_t handle, 
+  int count,
+	__device cuDoubleComplex *z,
+	int zpitch,
+	const __device cuDoubleComplex *y,
+	int ypitch,
+	cuDoubleComplex alpha, 
+	const __device cuDoubleComplex* dM, 
+	const __device int* offsets, 
+	int hackSize, 
+	const __device int* hackOffsets,
+	int rows,
+	int cols, 
+	const __device cuDoubleComplex *x, 
+	int xpitch,
+	cuDoubleComplex beta);
+
 		
 /** @}*/

--- a/cuda/spgpu/kernels/ell_spmm_base.cuh
+++ b/cuda/spgpu/kernels/ell_spmm_base.cuh
@ -99,6 +99,86 @@ fetchTex (int pointer)
 #undef GEN_SPGPU_ELL_NAME
 #define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm)

+#if 0
+
+void
+GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
+(spgpuHandle_t handle,
+	int count,
+	VALUE_TYPE* z,
+	int zPitch,
+	const VALUE_TYPE *y,
+	int yPitch,
+	VALUE_TYPE alpha, 
+	const VALUE_TYPE* cM, 
+	const int* rP,
+  int cMPitch,
+  int rPPitch,
+	const __device int* rS,
+	const __device int* rIdx, 
+  int avgNnzPerRow,
+  int maxNnzPerRow,
+	int rows, 
+	const VALUE_TYPE *x,
+	int xPitch,
+	VALUE_TYPE beta,
+	int baseIndex)
+{
+  VALUE_TYPE *px,*py, *pz;
+  int cnt, c1;
+
+  dim3 block (THREAD_BLOCK, 1);
+  //  dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
+  // Should we generalize the code to 1/2/4/8 threads per row?
+  // And maybe adjust THREAD_BLOCK size? 
+  int shrMemSize,maxShmemSz;
+  int numMp=getGPUMultiProcessors();
+  int maxThMp=getGPUMaxThreadsPerMP();
+  int nmblksMp=maxThMp/THREAD_BLOCK;
+  int nmblk=nmblksMp*numMp;
+  dim3 grid (nmblk);
+
+  maxShmemSz=getGPUSharedMemPerBlock();
+  shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
+  if (shrMemSize > maxShmemSz) {
+    fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz);
+    return;
+  }
+  cnt = count;
+  px = (VALUE_TYPE *) x;
+  py = (VALUE_TYPE *) y;
+  pz = (VALUE_TYPE *) z;	  
+  while (cnt > 2*MMBSZ) {
+    CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn) 
+      <<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch, py, yPitch,
+							      alpha, cM, rP, cMPitch, rPPitch,
+							      rS, rows, px, xPitch, beta, baseIndex);
+    px += xPitch*MMBSZ;
+    py += yPitch*MMBSZ;
+    pz += zPitch*MMBSZ;
+    cnt -= MMBSZ;
+  }
+  if (cnt > MMBSZ) {
+    c1 = cnt/2;
+    CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn) 
+      <<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch, py, yPitch,
+							      alpha, cM, rP, cMPitch, rPPitch,
+							      rS, rows, px, xPitch, beta, baseIndex);
+    cnt -= c1;
+  }
+  if (cnt > MMBSZ) {
+    fprintf(stderr,"Invalid residual count %d\n",cnt);
+  } else if (cnt > 0){
+    CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn) 
+      <<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch, py, yPitch,
+							      alpha, cM, rP, cMPitch, rPPitch,
+							      rS, rows, px, xPitch, beta, baseIndex);
+  }
+  cudaCheckError("CUDA error on ell_spmm");
+}
+
+#endif
+
 void
 GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
 (spgpuHandle_t handle,
@ -110,12 +190,12 @@ GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
 	VALUE_TYPE alpha, 
 	const VALUE_TYPE* cM, 
 	const int* rP,
-    int cMPitch,
-    int rPPitch,
+  int cMPitch,
+  int rPPitch,
 	const __device int* rS,
 	const __device int* rIdx, 
-    int avgNnzPerRow,
-    int maxNnzPerRow,
+  int avgNnzPerRow,
+  int maxNnzPerRow,
 	int rows, 
 	const VALUE_TYPE *x,
 	int xPitch,
@ -193,4 +273,4 @@ GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
  }
  
  cudaCheckError("CUDA error on ell_spmm");
-}
+}
--- a/cuda/spgpu/kernels/ell_spmm_base_template.cuh
+++ b/cuda/spgpu/kernels/ell_spmm_base_template.cuh
@ -17,6 +17,75 @@
 #define THREAD_BLOCK 128
 #define MMBSZ 8

+#if 0
+
+__global__ void
+CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
+  (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
+   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
+   int cMPitch, int rPPitch, const int* rS, int rows,
+   const VALUE_TYPE *x, int xPitch, 
+   VALUE_TYPE beta, int baseIndex)
+{
+  VALUE_TYPE *pz,*px,*py;
+  VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
+  VALUE_TYPE yVal; 
+  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
+  
+  unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
+  unsigned int gridSize =  gridDim.x * blockDim.x;
+  
+  while (i < rows) {
+
+    rS += i; rP += i; cM += i;
+    
+    int rowSize = rS[i];
+    for (int k=0; k<count; k++) {
+      temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
+    }      
+    
+    for (int j = 0; j < rowSize; j++) {
+      int pointer;
+      VALUE_TYPE value;
+      VALUE_TYPE fetch;
+      
+      pointer = rP[0] - baseIndex;
+      rP += rPPitch;
+      
+      value = cM[0];
+      cM += cMPitch;
+
+      px = (VALUE_TYPE *) x;
+      for (int k=0; k<count; k++) {
+	      fetch = px[pointer]; 
+	      temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
+	      px = px + xPitch;
+      }
+    }
+    // Since z and y are accessed with the same offset by the same thread,
+    // and the write to z follows the y read, y and z can share the same base address (in-place computing).
+    py = (VALUE_TYPE *) y;
+    pz = z;
+    if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
+      for (int k=0; k<count; k++) {
+        yVal = py[i];
+        pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
+        py += yPitch;
+        pz += zPitch;
+      }
+    } else {
+      for (int k=0; k<count; k++) {
+        pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
+        pz += zPitch;
+      }
+    }
+    
+    i += gridSize;
+  }
+}
+
+#endif
+
 __global__ void
 CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
  (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
@ -53,9 +122,9 @@ CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)

      px = (VALUE_TYPE *) x;
      for (int k=0; k<count; k++) {
-	    fetch = px[pointer]; 
-	    temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
-	    px = px + xPitch;
+        fetch = px[pointer]; 
+        temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
+        px = px + xPitch;
      }
    }

@ -97,4 +166,4 @@ CONCAT(_,GEN_SPGPU_ELL_NAME(TYPE_SYMBOL))
    <<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch, y, yPitch,
 							    alpha, cM, rP, cMPitch, rPPitch, rS, rows,
 							    x, xPitch, beta, baseIndex);
-}
+}
--- a/cuda/spgpu/kernels/hdia_spmm_base.cuh
+++ b/cuda/spgpu/kernels/hdia_spmm_base.cuh
@ -74,27 +74,115 @@ fetchTex (int pointer)
 	return CONCAT(readValue_,VALUE_TYPE) (fetch);
 }
 #endif
-#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_vanilla)
-#define GEN_SPGPU_HDIA_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_vanilla)
-#include "hdia_spmv_base_template.cuh"
+#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla)
+#define GEN_SPGPU_HDIA_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla)
+#include "hdia_spmm_base_template.cuh"
 #if 0 
 #undef GEN_SPGPU_HDIA_NAME
-#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_prefetch)
-#define GEN_SPGPU_HDIA_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_prefetch)
+#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch)
+#define GEN_SPGPU_HDIA_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch)
 #undef USE_PREFETCHING
 #define USE_PREFETCHING
-#include "hdia_spmv_base_template.cuh"
+#include "hdia_spmm_base_template.cuh"
 #define ENABLE_CACHE
 #undef GEN_SPGPU_HDIA_NAME
-#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache_prefetch)
-#define GEN_SPGPU_HDIA_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache_prefetch)
-#include "hdia_spmv_base_template.cuh"
+#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch)
+#define GEN_SPGPU_HDIA_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch)
+#include "hdia_spmm_base_template.cuh"
 #undef GEN_SPGPU_HDIA_NAME
 #undef USE_PREFETCHING
-#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache)
-#define GEN_SPGPU_HDIA_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache)
-#include "hdia_spmv_base_template.cuh"
+#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache)
+#define GEN_SPGPU_HDIA_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache)
+#include "hdia_spmm_base_template.cuh"
 #endif
 #undef GEN_SPGPU_HDIA_NAME
-#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv)
+#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm)

+void
+GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL)
+(spgpuHandle_t handle,
+	int count,
+	VALUE_TYPE* z,
+	int zPitch,
+	const VALUE_TYPE *y,
+	int yPitch,
+	VALUE_TYPE alpha, 
+	const VALUE_TYPE* cM, 
+	const int* hdiaOffsets,
+  int hackSize,
+	const __device int* hackOffsets,
+	int rows, 
+  int cols,
+	const VALUE_TYPE *x,
+	int xPitch,
+	VALUE_TYPE beta)
+{
+  VALUE_TYPE *px,*py, *pz;
+  int cnt;
+  int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
+  
+  // maxNForACall should be a multiple of hackSize
+  maxNForACall = (maxNForACall/hackSize)*hackSize;
+  int maxShmemSz;
+  maxShmemSz=getGPUSharedMemPerBlock();
+  //fprintf(stderr,"MaxSHmemSz  %d \n",maxShmemSz);
+  while (rows > maxNForACall) {//managing large vectors
+    cnt = count;
+    px = (VALUE_TYPE *) x;
+    py = (VALUE_TYPE *) y;
+    pz = (VALUE_TYPE *) z;	  
+    while (cnt > MMBSZ) {
+      //fprintf(stderr,"counts %d %d %d :  pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);    
+      CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
+							  py, yPitch,
+							  alpha, cM, hdiaOffsets,
+							  hackSize, hackOffsets,
+							  maxNForACall, cols,
+							  px, xPitch, beta);
+      px += xPitch*MMBSZ;
+      py += yPitch*MMBSZ;
+      pz += zPitch*MMBSZ;
+      cnt -= MMBSZ;
+    }
+    if (cnt >0) {
+      CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
+							  py, yPitch,
+							  alpha, cM, hdiaOffsets,
+							  hackSize, hackOffsets,
+							  maxNForACall, cols,
+							  px, xPitch, beta);
+    }
+
+    y = y + maxNForACall;
+    z = z + maxNForACall;
+    hackOffsets = hackOffsets + maxNForACall/hackSize;
+    rows -= maxNForACall;
+  }
+  cnt = count;
+  px = (VALUE_TYPE *) x;
+  py = (VALUE_TYPE *) y;
+  pz = (VALUE_TYPE *) z;	  
+  while (cnt > MMBSZ) {
+    //fprintf(stderr,"counts %d %d %d :  pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);    
+    CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
+							  py, yPitch,
+							  alpha, cM, hdiaOffsets,
+							  hackSize, hackOffsets,
+							  rows, cols,
+							  px, xPitch, beta);
+    px += xPitch*MMBSZ;
+    py += yPitch*MMBSZ;
+    pz += zPitch*MMBSZ;
+    cnt -= MMBSZ;
+  }
+  if (cnt >0) {
+    CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
+							  py, yPitch,
+							  alpha, cM, hdiaOffsets,
+							  hackSize, hackOffsets,
+							  rows, cols,
+							  px, xPitch, beta);
+  }
+  
+  cudaCheckError("CUDA error on hdiag_spmm");
+}
--- a/cuda/spgpu/kernels/hdia_spmm_base_template.cuh
+++ b/cuda/spgpu/kernels/hdia_spmm_base_template.cuh
@ -15,4 +15,127 @@
 */
 
 #define THREAD_BLOCK 128
+#define MMBSZ 8

+__global__ void
+CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
+  (int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
+	int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets,
+  int hackSize, const int* hackOffsets, int rows, int cols,
+  const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta)
+{
+  VALUE_TYPE *pz,*px,*py;
+  VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
+  VALUE_TYPE yVal; 
+  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
+        
+  int hackCount = (rows + hackSize - 1)/hackSize;
+
+  int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
+
+  int hackId = i / hackSize;
+  int hackLaneId = i % hackSize;
+
+  // shared between offsetsChunks and warpHackOffsetTemp
+	extern __shared__ int dynShrMem[]; 
+    
+  int hackOffset = 0;
+	int nextOffset = 0;
+
+  unsigned int laneId = threadIdx.x % warpSize;
+	unsigned int warpId = threadIdx.x / warpSize;
+
+  if (laneId == 0 && i < rows) {
+    hackOffset = hackOffsets[hackId];
+		nextOffset = hackOffsets[hackId+1];
+  }
+
+  hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0);		
+	nextOffset = __shfl_sync(0xFFFFFFFF,nextOffset, 0);
+    
+  if (hackId >= hackCount)
+		return;
+    
+  cM += hackOffset*hackSize + hackLaneId;
+	hdiaOffsets += hackOffset;
+
+  for (int k=0; k<count; k++) {
+    temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
+  }
+
+  // diags for this hack is next hackOffset minus current hackOffset
+	int diags = nextOffset - hackOffset;
+    
+	// Warp oriented
+	int rounds = (diags + warpSize - 1)/warpSize;
+	
+	volatile int *offsetsChunk = dynShrMem + warpId*warpSize;
+
+  for (int r = 0; r < rounds; r++) {
+    // in the last round diags will be <= warpSize
+		if (laneId < diags)
+			offsetsChunk[laneId] = hdiaOffsets[laneId];
+
+    if (i < rows) {
+			int dCount = min(diags, warpSize);
+
+      for (int j = 0; j < dCount; ++j) {
+        int column = offsetsChunk[j] + i;
+
+				if(column >= 0 && column < cols) {
+          px = (VALUE_TYPE *) x;
+          for (int k = 0; k < count; k++) {
+            VALUE_TYPE xValue = px[column];
+            VALUE_TYPE mValue = cM[0];
+            temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, temp[k][threadIdx.x]);
+            px = px + xPitch;
+          }
+        }
+        cM += hackSize;
+      }
+    }
+    diags -= warpSize;
+    hdiaOffsets += warpSize;
+  }
+
+  // Since z and y are accessed with the same offset by the same thread,
+	// and the write to z follows the y read, y and z can share the same base address (in-place computing).
+	if (i >= rows)
+    return;
+
+  py = (VALUE_TYPE *) y;
+  pz = z;
+
+  if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
+    for (int k=0; k<count; k++) {
+      yVal = py[i];
+      pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
+      py += yPitch;
+      pz += zPitch;
+    }
+  else
+    for (int k=0; k<count; k++) {
+      pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
+      pz += zPitch;
+    }
+}
+
+void
+CONCAT(_,GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL))
+  (spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
+	int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets, int hackSize,
+	const __device int* hackOffsets, int rows, int cols, const VALUE_TYPE *x,
+  int xPitch, VALUE_TYPE beta)
+{
+  dim3 block (THREAD_BLOCK, 1);
+  dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
+  // Should we generalize the code to 1/2/4/8 threads per row?
+  // And maybe adjust THREAD_BLOCK size? 
+  int shrMemSize,maxShmemSz;
+  maxShmemSz = getGPUSharedMemPerBlock();
+  shrMemSize = MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
+  CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn) 
+    <<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch, y, yPitch,
+							    alpha, cM, hdiaOffsets, hackSize, hackOffsets, rows, cols,
+							    x, xPitch, beta);
+}
--- a/cuda/spgpu/kernels/hell_spmm_base.cuh
+++ b/cuda/spgpu/kernels/hell_spmm_base.cuh
@ -106,11 +106,11 @@ extern __shared__ int dynShrMem[];
 void
 GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
 (spgpuHandle_t handle,
-    int count, 
-    VALUE_TYPE* z,
-    int zPitch,
+  int count, 
+  VALUE_TYPE* z,
+  int zPitch,
 	const VALUE_TYPE *y,
-    int  yPitch,
+  int  yPitch,
 	VALUE_TYPE alpha, 
 	const VALUE_TYPE* cM, 
 	const int* rP, 
@ -120,9 +120,9 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
 	const __device int* rIdx, 
 	int rows, 
 	const VALUE_TYPE *x,
-    int xPitch,
+  int xPitch,
 	VALUE_TYPE beta, 
-    int baseIndex)
+  int baseIndex)
 {
  VALUE_TYPE *px,*py, *pz;
  int cnt, c1;
@ -149,7 +149,7 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
  py = (VALUE_TYPE *) y;
  pz = (VALUE_TYPE *) z;	  
  while (cnt > 2*MMBSZ) {
-    CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) 
+    CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn) 
      <<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch,
 							      alpha, cM, rP, hackSize, hackOffsets,
 							      rS, rows, px, xPitch, beta, baseIndex);
@ -160,7 +160,7 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
  }
  if (cnt > MMBSZ) {
    c1 = cnt/2;
-    CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) 
+    CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn) 
      <<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch,
 							      alpha, cM, rP, hackSize, hackOffsets,
 							      rS, rows, px, xPitch, beta, baseIndex);
@ -169,7 +169,7 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
  if (cnt > MMBSZ) {
    fprintf(stderr,"Invalid residual count %d\n",cnt);
  } else if (cnt > 0){
-    CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) 
+    CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn) 
      <<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch,
 							      alpha, cM, rP, hackSize, hackOffsets,
 							      rS, rows, px, xPitch, beta, baseIndex);
@ -182,11 +182,11 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
 void
 GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
 (spgpuHandle_t handle,
-    int count, 
-    VALUE_TYPE* z,
-    int zPitch,
+  int count, 
+  VALUE_TYPE* z,
+  int zPitch,
 	const VALUE_TYPE *y,
-    int  yPitch,
+  int  yPitch,
 	VALUE_TYPE alpha, 
 	const VALUE_TYPE* cM, 
 	const int* rP, 
@ -196,9 +196,9 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
 	const __device int* rIdx, 
 	int rows, 
 	const VALUE_TYPE *x,
-    int xPitch,
+  int xPitch,
 	VALUE_TYPE beta, 
-    int baseIndex)
+  int baseIndex)
 {
  VALUE_TYPE *px,*py, *pz;
  int cnt;
@ -270,6 +270,5 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
 							px, xPitch, beta, baseIndex);
  }
  
-  
  cudaCheckError("CUDA error on hell_spmm");
 }
--- a/cuda/spgpu/kernels/hell_spmm_base_template.cuh
+++ b/cuda/spgpu/kernels/hell_spmm_base_template.cuh
@ -71,10 +71,9 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)

      px = (VALUE_TYPE *) x;
      for (int k=0; k<count; k++) {
-	fetch = px[pointer]; 
-	temp[k][threadIdx.x] =
-	  CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
-	px = px + xPitch;
+        fetch = px[pointer]; 
+        temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
+	      px = px + xPitch;
      }
    }
    // Since z and y are accessed with the same offset by the same thread,
@ -83,16 +82,15 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
    pz = z;
    if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
      for (int k=0; k<count; k++) {
-	yVal = py[i];
-	pz[i] = CONCAT(VALUE_TYPE, _fma)(beta,
-					 yVal, CONCAT(VALUE_TYPE, _mul) (alpha,  temp[k][threadIdx.x]));
-	py += yPitch;
-	pz += zPitch;
+        yVal = py[i];
+        pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
+        py += yPitch;
+        pz += zPitch;
      }
    } else {
      for (int k=0; k<count; k++) {
-	pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha,  temp[k][threadIdx.x]);
-	pz += zPitch;
+        pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
+        pz += zPitch;
      }
    }
    
@ -150,10 +148,9 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)

      px = (VALUE_TYPE *) x;
      for (int k=0; k<count; k++) {
-	fetch = px[pointer]; 
-	temp[k][threadIdx.x] =
-	  CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
-	px = px + xPitch;
+        fetch = px[pointer]; 
+        temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
+        px = px + xPitch;
      }
    }
    // Since z and y are accessed with the same offset by the same thread,
@ -162,15 +159,15 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
    pz = z;
    if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
      for (int k=0; k<count; k++) {
-	yVal = py[i];
-	pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha,  temp[k][threadIdx.x]));
-	py += yPitch;
-	pz += zPitch;
+        yVal = py[i];
+        pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
+        py += yPitch;
+        pz += zPitch;
      }
    else
      for (int k=0; k<count; k++) {
-	pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha,  temp[k][threadIdx.x]);
-	pz += zPitch;
+        pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
+        pz += zPitch;
      }
  }
 }