SpMM HDIAG working

cuda-multivect
gabrielequatrana 9 months ago
parent bdd04a6911
commit 4ff0f112a9

@@ -104,33 +104,34 @@ module psb_c_base_mat_mod
    !
    ! Computational methods: defined here but not implemented.
    !
    procedure, pass(a) :: vect_mv => psb_c_base_vect_mv
+   procedure, pass(a) :: multivect_mv => psb_c_base_multivect_mv
    procedure, pass(a) :: csmv => psb_c_base_csmv
    procedure, pass(a) :: csmm => psb_c_base_csmm
-   generic, public :: spmm => csmm, csmv, vect_mv
+   generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv
    procedure, pass(a) :: in_vect_sv => psb_c_base_inner_vect_sv
    procedure, pass(a) :: inner_cssv => psb_c_base_inner_cssv
    procedure, pass(a) :: inner_cssm => psb_c_base_inner_cssm
    generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
    procedure, pass(a) :: vect_cssv => psb_c_base_vect_cssv
    procedure, pass(a) :: cssv => psb_c_base_cssv
    procedure, pass(a) :: cssm => psb_c_base_cssm
    generic, public :: spsm => cssm, cssv, vect_cssv
    procedure, pass(a) :: scals => psb_c_base_scals
    procedure, pass(a) :: scalv => psb_c_base_scal
    generic, public :: scal => scals, scalv
    procedure, pass(a) :: maxval => psb_c_base_maxval
    procedure, pass(a) :: spnmi => psb_c_base_csnmi
    procedure, pass(a) :: spnm1 => psb_c_base_csnm1
    procedure, pass(a) :: rowsum => psb_c_base_rowsum
    procedure, pass(a) :: arwsum => psb_c_base_arwsum
    procedure, pass(a) :: colsum => psb_c_base_colsum
    procedure, pass(a) :: aclsum => psb_c_base_aclsum
    procedure, pass(a) :: scalpid => psb_c_base_scalplusidentity
    procedure, pass(a) :: spaxpby => psb_c_base_spaxpby
    procedure, pass(a) :: cmpval => psb_c_base_cmpval
    procedure, pass(a) :: cmpmat => psb_c_base_cmpmat
    generic, public :: spcmp => cmpval, cmpmat
  end type psb_c_base_sparse_mat
  private :: c_base_mat_sync, c_base_mat_is_host, c_base_mat_is_dev, &
@@ -1240,6 +1241,42 @@ module psb_c_base_mat_mod
    end subroutine psb_c_base_vect_mv
  end interface
!> Function multivect_mv:
!! \memberof psb_c_base_sparse_mat
!! \brief Product by an encapsulated array type(psb_c_multivect_type)
!!
!! Compute
!! Y = alpha*op(A)*X + beta*Y
!! Usually the unwrapping of the encapsulated multivector is done
!! here, so that all the derived classes need only the
!! versions with the standard arrays.
!! Must be overridden explicitly in case of non standard memory
!! management; an example would be external memory allocation
!! in attached processors such as GPUs.
!!
!!
!! \param alpha Scaling factor for Ax
!! \param A the input sparse matrix
!! \param x the input X
!! \param beta Scaling factor for y
!! \param y the input/output Y
!! \param info return code
!! \param trans [N] Whether to use A (N), its transpose (T)
!! or its conjugate transpose (C)
!!
!
interface
subroutine psb_c_base_multivect_mv(alpha,a,x,beta,y,info,trans)
import
class(psb_c_base_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_base_multivect_mv
end interface
  !
  !> Function cssm:
  !! \memberof psb_c_base_sparse_mat
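For reference, the operation documented above, written entrywise for a multivector X with n_rhs columns (n_rhs is just notation for the number of right-hand sides; op(A) is A, its transpose, or its conjugate transpose according to trans):

\[ y_{ij} \leftarrow \alpha \sum_{k} \mathrm{op}(A)_{ik}\, x_{kj} + \beta\, y_{ij}, \qquad j = 1,\dots,n_{\mathrm{rhs}} \]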

@@ -104,33 +104,34 @@ module psb_s_base_mat_mod
    !
    ! Computational methods: defined here but not implemented.
    !
    procedure, pass(a) :: vect_mv => psb_s_base_vect_mv
+   procedure, pass(a) :: multivect_mv => psb_s_base_multivect_mv
    procedure, pass(a) :: csmv => psb_s_base_csmv
    procedure, pass(a) :: csmm => psb_s_base_csmm
-   generic, public :: spmm => csmm, csmv, vect_mv
+   generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv
    procedure, pass(a) :: in_vect_sv => psb_s_base_inner_vect_sv
    procedure, pass(a) :: inner_cssv => psb_s_base_inner_cssv
    procedure, pass(a) :: inner_cssm => psb_s_base_inner_cssm
    generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
    procedure, pass(a) :: vect_cssv => psb_s_base_vect_cssv
    procedure, pass(a) :: cssv => psb_s_base_cssv
    procedure, pass(a) :: cssm => psb_s_base_cssm
    generic, public :: spsm => cssm, cssv, vect_cssv
    procedure, pass(a) :: scals => psb_s_base_scals
    procedure, pass(a) :: scalv => psb_s_base_scal
    generic, public :: scal => scals, scalv
    procedure, pass(a) :: maxval => psb_s_base_maxval
    procedure, pass(a) :: spnmi => psb_s_base_csnmi
    procedure, pass(a) :: spnm1 => psb_s_base_csnm1
    procedure, pass(a) :: rowsum => psb_s_base_rowsum
    procedure, pass(a) :: arwsum => psb_s_base_arwsum
    procedure, pass(a) :: colsum => psb_s_base_colsum
    procedure, pass(a) :: aclsum => psb_s_base_aclsum
    procedure, pass(a) :: scalpid => psb_s_base_scalplusidentity
    procedure, pass(a) :: spaxpby => psb_s_base_spaxpby
    procedure, pass(a) :: cmpval => psb_s_base_cmpval
    procedure, pass(a) :: cmpmat => psb_s_base_cmpmat
    generic, public :: spcmp => cmpval, cmpmat
  end type psb_s_base_sparse_mat
  private :: s_base_mat_sync, s_base_mat_is_host, s_base_mat_is_dev, &
@@ -1240,6 +1241,42 @@ module psb_s_base_mat_mod
    end subroutine psb_s_base_vect_mv
  end interface
!> Function multivect_mv:
!! \memberof psb_s_base_sparse_mat
!! \brief Product by an encapsulated array type(psb_s_multivect_type)
!!
!! Compute
!! Y = alpha*op(A)*X + beta*Y
!! Usually the unwrapping of the encapsulated multivector is done
!! here, so that all the derived classes need only the
!! versions with the standard arrays.
!! Must be overridden explicitly in case of non standard memory
!! management; an example would be external memory allocation
!! in attached processors such as GPUs.
!!
!!
!! \param alpha Scaling factor for Ax
!! \param A the input sparse matrix
!! \param x the input X
!! \param beta Scaling factor for y
!! \param y the input/output Y
!! \param info return code
!! \param trans [N] Whether to use A (N), its transpose (T)
!! or its conjugate transpose (C)
!!
!
interface
subroutine psb_s_base_multivect_mv(alpha,a,x,beta,y,info,trans)
import
class(psb_s_base_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_base_multivect_mv
end interface
  !
  !> Function cssm:
  !! \memberof psb_s_base_sparse_mat

@@ -104,33 +104,34 @@ module psb_z_base_mat_mod
    !
    ! Computational methods: defined here but not implemented.
    !
    procedure, pass(a) :: vect_mv => psb_z_base_vect_mv
+   procedure, pass(a) :: multivect_mv => psb_z_base_multivect_mv
    procedure, pass(a) :: csmv => psb_z_base_csmv
    procedure, pass(a) :: csmm => psb_z_base_csmm
-   generic, public :: spmm => csmm, csmv, vect_mv
+   generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv
    procedure, pass(a) :: in_vect_sv => psb_z_base_inner_vect_sv
    procedure, pass(a) :: inner_cssv => psb_z_base_inner_cssv
    procedure, pass(a) :: inner_cssm => psb_z_base_inner_cssm
    generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
    procedure, pass(a) :: vect_cssv => psb_z_base_vect_cssv
    procedure, pass(a) :: cssv => psb_z_base_cssv
    procedure, pass(a) :: cssm => psb_z_base_cssm
    generic, public :: spsm => cssm, cssv, vect_cssv
    procedure, pass(a) :: scals => psb_z_base_scals
    procedure, pass(a) :: scalv => psb_z_base_scal
    generic, public :: scal => scals, scalv
    procedure, pass(a) :: maxval => psb_z_base_maxval
    procedure, pass(a) :: spnmi => psb_z_base_csnmi
    procedure, pass(a) :: spnm1 => psb_z_base_csnm1
    procedure, pass(a) :: rowsum => psb_z_base_rowsum
    procedure, pass(a) :: arwsum => psb_z_base_arwsum
    procedure, pass(a) :: colsum => psb_z_base_colsum
    procedure, pass(a) :: aclsum => psb_z_base_aclsum
    procedure, pass(a) :: scalpid => psb_z_base_scalplusidentity
    procedure, pass(a) :: spaxpby => psb_z_base_spaxpby
    procedure, pass(a) :: cmpval => psb_z_base_cmpval
    procedure, pass(a) :: cmpmat => psb_z_base_cmpmat
    generic, public :: spcmp => cmpval, cmpmat
  end type psb_z_base_sparse_mat
  private :: z_base_mat_sync, z_base_mat_is_host, z_base_mat_is_dev, &
@@ -1240,6 +1241,42 @@ module psb_z_base_mat_mod
    end subroutine psb_z_base_vect_mv
  end interface
!> Function multivect_mv:
!! \memberof psb_z_base_sparse_mat
!! \brief Product by an encapsulated array type(psb_z_multivect_type)
!!
!! Compute
!! Y = alpha*op(A)*X + beta*Y
!! Usually the unwrapping of the encapsulated multivector is done
!! here, so that all the derived classes need only the
!! versions with the standard arrays.
!! Must be overridden explicitly in case of non standard memory
!! management; an example would be external memory allocation
!! in attached processors such as GPUs.
!!
!!
!! \param alpha Scaling factor for Ax
!! \param A the input sparse matrix
!! \param x the input X
!! \param beta Scaling factor for y
!! \param y the input/output Y
!! \param info return code
!! \param trans [N] Whether to use A (N), its transpose (T)
!! or its conjugate transpose (C)
!!
!
interface
subroutine psb_z_base_multivect_mv(alpha,a,x,beta,y,info,trans)
import
class(psb_z_base_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_base_multivect_mv
end interface
  !
  !> Function cssm:
  !! \memberof psb_z_base_sparse_mat

@@ -2012,6 +2012,26 @@ subroutine psb_c_base_vect_mv(alpha,a,x,beta,y,info,trans)
  call y%set_host()
end subroutine psb_c_base_vect_mv
subroutine psb_c_base_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_error_mod
use psb_const_mod
use psb_c_base_mat_mod, psb_protect_name => psb_c_base_multivect_mv
implicit none
class(psb_c_base_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
! For the time being we just throw everything back
! onto the normal routines.
call x%sync()
call y%sync()
call a%spmm(alpha,x%v,beta,y%v,info,trans)
call y%set_host()
end subroutine psb_c_base_multivect_mv
subroutine psb_c_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
  use psb_c_base_mat_mod, psb_protect_name => psb_c_base_vect_cssv
  use psb_c_base_vect_mod

@@ -2012,6 +2012,26 @@ subroutine psb_s_base_vect_mv(alpha,a,x,beta,y,info,trans)
  call y%set_host()
end subroutine psb_s_base_vect_mv
subroutine psb_s_base_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_error_mod
use psb_const_mod
use psb_s_base_mat_mod, psb_protect_name => psb_s_base_multivect_mv
implicit none
class(psb_s_base_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
! For the time being we just throw everything back
! onto the normal routines.
call x%sync()
call y%sync()
call a%spmm(alpha,x%v,beta,y%v,info,trans)
call y%set_host()
end subroutine psb_s_base_multivect_mv
subroutine psb_s_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
  use psb_s_base_mat_mod, psb_protect_name => psb_s_base_vect_cssv
  use psb_s_base_vect_mod

@@ -2012,6 +2012,26 @@ subroutine psb_z_base_vect_mv(alpha,a,x,beta,y,info,trans)
  call y%set_host()
end subroutine psb_z_base_vect_mv
subroutine psb_z_base_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_error_mod
use psb_const_mod
use psb_z_base_mat_mod, psb_protect_name => psb_z_base_multivect_mv
implicit none
class(psb_z_base_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
! For the time being we just throw everything back
! onto the normal routines.
call x%sync()
call y%sync()
call a%spmm(alpha,x%v,beta,y%v,info,trans)
call y%set_host()
end subroutine psb_z_base_multivect_mv
subroutine psb_z_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
  use psb_z_base_mat_mod, psb_protect_name => psb_z_base_vect_cssv
  use psb_z_base_vect_mod

@@ -267,6 +267,27 @@ int spmvHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
  return SPGPU_SUCCESS;
}
int spmmHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY)
{
struct HdiagDevice *devMat = (struct HdiagDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuDhdiaspmm (handle, y->count_, (double*)y->v_, y->pitch_,
(double *)y->v_, y->pitch_, alpha, (double *)devMat->cM,
devMat->hdiaOffsets, devMat->hackSize, devMat->hackOffsets,
devMat->rows, devMat->cols, (double *)x->v_, x->pitch_, beta);
return SPGPU_SUCCESS;
}
int writeHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets)
{ int i=0,fo,fa,j,k,p;
  char buf_a[255], buf_o[255],tmp[255];
@@ -384,3 +405,23 @@ int spmvHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
  return SPGPU_SUCCESS;
}
int spmmHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{
struct HdiagDevice *devMat = (struct HdiagDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuShdiaspmm (handle, y->count_, (float*)y->v_, y->pitch_,
(float *)y->v_, y->pitch_, alpha, (float *)devMat->cM,
devMat->hdiaOffsets, devMat->hackSize, devMat->hackOffsets,
devMat->rows, devMat->cols, (float *)x->v_, x->pitch_, beta);
return SPGPU_SUCCESS;
}

@@ -95,12 +95,13 @@ int allocHdiagDevice(void ** remoteMatrix, HdiagDeviceParams* params);
void freeHdiagDevice(void* remoteMatrix);
int writeHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets);
int spmvHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
                         float beta, void* deviceY);
int writeHdiagDeviceDouble(void* deviceMat, double* val, int* hdiaOffsets, int *hackOffsets);
int spmvHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
                          double beta, void* deviceY);
//int writeHdiagDeviceFloatComplex(void* deviceMat, float complex* val, int* hdiaOffsets, int *hackOffsets);
//int writeHdiagDeviceDoubleComplex(void* deviceMat, double complex* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceDouble(void* deviceMat, double* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceFloatComplex(void* deviceMat, float complex* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceDoubleComplex(void* deviceMat, double complex* val, int* hdiaOffsets, int *hackOffsets);
#endif

@@ -196,4 +196,35 @@ module hdiagdev_mod
!!$  end function spmvHdiagDeviceDoubleComplex
  end interface spmvHdiagDevice
interface spmmHdiagDevice
function spmmHdiagDeviceFloat(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHdiagDeviceFloat')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_float),value :: alpha, beta
end function spmmHdiagDeviceFloat
function spmmHdiagDeviceDouble(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHdiagDeviceDouble')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_double),value :: alpha, beta
end function spmmHdiagDeviceDouble
!!$ function spmmHdiagDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
!!$ & result(res) bind(c,name='spmmHdiagDeviceFloatComplex')
!!$ use iso_c_binding
!!$ integer(c_int) :: res
!!$ type(c_ptr), value :: deviceMat, x, y
!!$ complex(c_float_complex),value :: alpha, beta
!!$ end function spmmHdiagDeviceFloatComplex
!!$ function spmmHdiagDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
!!$ & result(res) bind(c,name='spmmHdiagDeviceDoubleComplex')
!!$ use iso_c_binding
!!$ integer(c_int) :: res
!!$ type(c_ptr), value :: deviceMat, x, y
!!$ complex(c_double_complex),value :: alpha, beta
!!$ end function spmmHdiagDeviceDoubleComplex
end interface spmmHdiagDevice
end module hdiagdev_mod

@@ -288,13 +288,7 @@ psb_s_cuda_hdiag_vect_mv.o \
 psb_s_cuda_dnsg_mat_impl.o \
 psb_d_cuda_dnsg_mat_impl.o \
 psb_c_cuda_dnsg_mat_impl.o \
-psb_z_cuda_dnsg_mat_impl.o \
-psb_z_cuda_hdiag_csmv.o \
-psb_z_cuda_hdiag_csmm.o \
-psb_z_cuda_hdiag_vect_mv.o \
-psb_c_cuda_hdiag_csmv.o \
-psb_c_cuda_hdiag_csmm.o \
-psb_c_cuda_hdiag_vect_mv.o
+psb_z_cuda_dnsg_mat_impl.o
objs: $(OBJS)
lib: objs

@@ -124,7 +124,7 @@ subroutine psb_d_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
  use psb_d_cuda_hdiag_mat_mod, psb_protect_name => psb_d_cuda_hdiag_multivect_mv
  use psb_d_cuda_multivect_mod
  implicit none
- class(psb_d_cuda_hdiag_mat_mod), intent(in) :: a
+ class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
  real(psb_dpk_), intent(in) :: alpha, beta
  class(psb_d_base_multivect_type), intent(inout) :: x
  class(psb_d_base_multivect_type), intent(inout) :: y

@@ -89,7 +89,7 @@ subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
  if (tra) then
    if (a%is_dev()) call a%sync()
-   call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
+   call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
  else
    !
    ! Just to test, move X/Y to/from the GPU.

@@ -124,7 +124,7 @@ subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
  use psb_s_cuda_hdiag_mat_mod, psb_protect_name => psb_s_cuda_hdiag_multivect_mv
  use psb_s_cuda_multivect_mod
  implicit none
- class(psb_s_cuda_hdiag_mat_mod), intent(in) :: a
+ class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
  real(psb_spk_), intent(in) :: alpha, beta
  class(psb_s_base_multivect_type), intent(inout) :: x
  class(psb_s_base_multivect_type), intent(inout) :: y
@@ -159,7 +159,7 @@ subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
    if (beta /= dzero) then
      if (.not.y%is_host()) call y%sync()
    end if
-   call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
+   call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
    call y%set_host()
  else
    if (a%is_host()) call a%sync()

@@ -27,6 +27,10 @@
extern "C" {
#endif
// DIA/HDIA Compressed Matrix Format routines
/// This is the pitch alignment that must be fullfilled by the coefficients and the row pointers allocations.
#define DIA_PITCH_ALIGN_BYTE 128
/**
 * \fn spgpuShdiaspmv (spgpuHandle_t handle, float* z, const float *y, float alpha, const float* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const float *x, float beta)
@@ -59,6 +63,44 @@ spgpuShdiaspmv (spgpuHandle_t handle,
  float beta);
/**
* \fn spgpuShdiaspmm (int count, float *z, int zpitch, const float *y, int ypitch, float alpha, const __device float* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device float *x, int xpitch, float beta)
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuShdiaspmm (spgpuHandle_t handle,
int count,
__device float *z,
int zpitch,
const __device float *y,
int ypitch,
float alpha,
const __device float* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device float *x,
int xpitch,
float beta);
/**
 * \fn spgpuDhdiaspmv (spgpuHandle_t handle, double* z, const double *y, double alpha, const double* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const double *x, double beta)
 * Computes double precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
@@ -89,6 +131,43 @@ spgpuDhdiaspmv (spgpuHandle_t handle,
  const double *x,
  double beta);
/**
* \fn spgpuDhdiaspmm (int count, double *z, int zpitch, const double *y, int ypitch, double alpha, const __device double* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device float *x, int xpitch, double beta)
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuDhdiaspmm (spgpuHandle_t handle,
int count,
__device double *z,
int zpitch,
const __device double *y,
int ypitch,
double alpha,
const __device double* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device double *x,
int xpitch,
double beta);
/**
 * \fn spgpuChdiaspmv (spgpuHandle_t handle, cuFloatComplex* z, const cuFloatComplex *y, cuFloatComplex alpha, const cuFloatComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuFloatComplex *x, cuFloatComplex beta)
@@ -120,6 +199,43 @@ spgpuChdiaspmv (spgpuHandle_t handle,
  const cuFloatComplex *x,
  cuFloatComplex beta);
/**
* \fn spgpuChdiaspmm (int count, cuFloatComplex *z, int zpitch, const cuFloatComplex *y, int ypitch, cuFloatComplex alpha, const __device cuFloatComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuFloatComplex *x, int xpitch, cuFloatComplex beta)
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuChdiaspmm (spgpuHandle_t handle,
int count,
__device cuFloatComplex *z,
int zpitch,
const __device cuFloatComplex *y,
int ypitch,
cuFloatComplex alpha,
const __device cuFloatComplex* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device cuFloatComplex *x,
int xpitch,
cuFloatComplex beta);
/**
 * \fn spgpuZhdiaspmv (spgpuHandle_t handle, cuDoubleComplex* z, const cuDoubleComplex *y, cuDoubleComplex alpha, const cuDoubleComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuDoubleComplex *x, cuDoubleComplex beta)
@@ -150,6 +266,45 @@ spgpuZhdiaspmv (spgpuHandle_t handle,
  int cols,
  const cuDoubleComplex *x,
  cuDoubleComplex beta);
/**
* \fn spgpuZhdiaspmm (int count, cuDoubleComplex *z, int zpitch, const cuDoubleComplex *y, int ypitch, cuDoubleComplex alpha, const __device cuDoubleComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuDoubleComplex *x, int xpitch, cuDoubleComplex beta)
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuZhdiaspmm (spgpuHandle_t handle,
int count,
__device cuDoubleComplex *z,
int zpitch,
const __device cuDoubleComplex *y,
int ypitch,
cuDoubleComplex alpha,
const __device cuDoubleComplex* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device cuDoubleComplex *x,
int xpitch,
cuDoubleComplex beta);
/** @}*/
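As a usage illustration of the new spmm entry points declared above (a sketch only, not part of the commit): the call below assumes the HDIA data and the right-hand sides are already resident on the device and stored column-major with a leading dimension equal to their pitch; hdia_spmm_example is a hypothetical helper name, and the spgpuHandle_t is assumed to come from the surrounding library (the PSBLAS bridge in hdiagdev.c uses psb_cudaGetHandle()).

#include "hdia.h"   /* assumed include path for the prototypes above */

/* Hypothetical sketch: multiply ncols right-hand sides by an HDIA matrix
   already on the device, updating d_y in place (z and y may alias, as in
   spmmHdiagDeviceDouble). */
void hdia_spmm_example(spgpuHandle_t handle,
                       const double *d_val, const int *d_offsets,
                       int hackSize, const int *d_hackOffsets,
                       int rows, int cols,
                       const double *d_x, int xpitch,
                       double *d_y, int ypitch, int ncols)
{
  double alpha = 1.0, beta = 1.0;
  spgpuDhdiaspmm(handle, ncols,
                 d_y, ypitch,            /* z, zpitch: output columns        */
                 d_y, ypitch,            /* y, ypitch: input term beta*y     */
                 alpha,
                 d_val, d_offsets,       /* stacked HDIA values and offsets  */
                 hackSize, d_hackOffsets,
                 rows, cols,
                 d_x, xpitch,            /* input columns                    */
                 beta);
}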

@@ -99,6 +99,86 @@ fetchTex (int pointer)
#undef GEN_SPGPU_ELL_NAME
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm)
#if 0
void
GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt, c1;
dim3 block (THREAD_BLOCK, 1);
// dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
int numMp=getGPUMultiProcessors();
int maxThMp=getGPUMaxThreadsPerMP();
int nmblksMp=maxThMp/THREAD_BLOCK;
int nmblk=nmblksMp*numMp;
dim3 grid (nmblk);
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
if (shrMemSize > maxShmemSz) {
fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz);
return;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > 2*MMBSZ) {
CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch, py, yPitch,
alpha, cM, rP, cMPitch, rPPitch,
rS, rows, px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt > MMBSZ) {
c1 = cnt/2;
CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch, py, yPitch,
alpha, cM, rP, cMPitch, rPPitch,
rS, rows, px, xPitch, beta, baseIndex);
cnt -= c1;
}
if (cnt > MMBSZ) {
fprintf(stderr,"Invalid residual count %d\n",cnt);
} else if (cnt > 0){
CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch, py, yPitch,
alpha, cM, rP, cMPitch, rPPitch,
rS, rows, px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on ell_spmm");
}
#endif
void
GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
@@ -110,12 +190,12 @@ GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
 VALUE_TYPE alpha,
 const VALUE_TYPE* cM,
 const int* rP,
 int cMPitch,
 int rPPitch,
 const __device int* rS,
 const __device int* rIdx,
 int avgNnzPerRow,
 int maxNnzPerRow,
 int rows,
 const VALUE_TYPE *x,
 int xPitch,
@@ -193,4 +273,4 @@ GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
  }
  cudaCheckError("CUDA error on ell_spmm");
}

@@ -17,6 +17,75 @@
#define THREAD_BLOCK 128
#define MMBSZ 8
#if 0
__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int cMPitch, int rPPitch, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int gridSize = gridDim.x * blockDim.x;
while (i < rows) {
rS += i; rP += i; cM += i;
int rowSize = rS[i];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rP[0] - baseIndex;
rP += rPPitch;
value = cM[0];
cM += cMPitch;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
} else {
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
i += gridSize;
}
}
#endif
__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
@@ -53,9 +122,9 @@ CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
      px = (VALUE_TYPE *) x;
      for (int k=0; k<count; k++) {
        fetch = px[pointer];
        temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
        px = px + xPitch;
      }
    }
@@ -97,4 +166,4 @@ CONCAT(_,GEN_SPGPU_ELL_NAME(TYPE_SYMBOL))
    <<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch, y, yPitch,
      alpha, cM, rP, cMPitch, rPPitch, rS, rows,
      x, xPitch, beta, baseIndex);
}

@@ -74,27 +74,115 @@ fetchTex (int pointer)
  return CONCAT(readValue_,VALUE_TYPE) (fetch);
}
#endif
- #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_vanilla)
+ #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla)
- #define GEN_SPGPU_HDIA_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_vanilla)
+ #define GEN_SPGPU_HDIA_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla)
- #include "hdia_spmv_base_template.cuh"
+ #include "hdia_spmm_base_template.cuh"
#if 0
#undef GEN_SPGPU_HDIA_NAME
- #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_prefetch)
+ #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch)
- #define GEN_SPGPU_HDIA_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_prefetch)
+ #define GEN_SPGPU_HDIA_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch)
#undef USE_PREFETCHING
#define USE_PREFETCHING
- #include "hdia_spmv_base_template.cuh"
+ #include "hdia_spmm_base_template.cuh"
#define ENABLE_CACHE
#undef GEN_SPGPU_HDIA_NAME
- #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache_prefetch)
+ #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch)
- #define GEN_SPGPU_HDIA_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache_prefetch)
+ #define GEN_SPGPU_HDIA_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch)
- #include "hdia_spmv_base_template.cuh"
+ #include "hdia_spmm_base_template.cuh"
#undef GEN_SPGPU_HDIA_NAME
#undef USE_PREFETCHING
- #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache)
+ #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache)
- #define GEN_SPGPU_HDIA_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache)
+ #define GEN_SPGPU_HDIA_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache)
- #include "hdia_spmv_base_template.cuh"
+ #include "hdia_spmm_base_template.cuh"
#endif
#undef GEN_SPGPU_HDIA_NAME
- #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv)
+ #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm)
void
GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* hdiaOffsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta)
{
VALUE_TYPE *px,*py, *pz;
int cnt;
int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
// maxNForACall should be a multiple of hackSize
maxNForACall = (maxNForACall/hackSize)*hackSize;
int maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
//fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);
while (rows > maxNForACall) {//managing large vectors
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
maxNForACall, cols,
px, xPitch, beta);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
maxNForACall, cols,
px, xPitch, beta);
}
y = y + maxNForACall;
z = z + maxNForACall;
hackOffsets = hackOffsets + maxNForACall/hackSize;
rows -= maxNForACall;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
rows, cols,
px, xPitch, beta);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
rows, cols,
px, xPitch, beta);
}
cudaCheckError("CUDA error on hdiag_spmm");
}

@@ -15,4 +15,127 @@
 */
#define THREAD_BLOCK 128
#define MMBSZ 8
__global__ void
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets,
int hackSize, const int* hackOffsets, int rows, int cols,
const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int hackCount = (rows + hackSize - 1)/hackSize;
int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
// shared between offsetsChunks and warpHackOffsetTemp
extern __shared__ int dynShrMem[];
int hackOffset = 0;
int nextOffset = 0;
unsigned int laneId = threadIdx.x % warpSize;
unsigned int warpId = threadIdx.x / warpSize;
if (laneId == 0 && i < rows) {
hackOffset = hackOffsets[hackId];
nextOffset = hackOffsets[hackId+1];
}
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0);
nextOffset = __shfl_sync(0xFFFFFFFF,nextOffset, 0);
if (hackId >= hackCount)
return;
cM += hackOffset*hackSize + hackLaneId;
hdiaOffsets += hackOffset;
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
// diags for this hack is next hackOffset minus current hackOffset
int diags = nextOffset - hackOffset;
// Warp oriented
int rounds = (diags + warpSize - 1)/warpSize;
volatile int *offsetsChunk = dynShrMem + warpId*warpSize;
for (int r = 0; r < rounds; r++) {
// in the last round diags will be <= warpSize
if (laneId < diags)
offsetsChunk[laneId] = hdiaOffsets[laneId];
if (i < rows) {
int dCount = min(diags, warpSize);
for (int j = 0; j < dCount; ++j) {
int column = offsetsChunk[j] + i;
if(column >= 0 && column < cols) {
px = (VALUE_TYPE *) x;
for (int k = 0; k < count; k++) {
VALUE_TYPE xValue = px[column];
VALUE_TYPE mValue = cM[0];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
cM += hackSize;
}
}
diags -= warpSize;
hdiaOffsets += warpSize;
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
if (i >= rows)
return;
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
else
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
void
CONCAT(_,GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets, int hackSize,
const __device int* hackOffsets, int rows, int cols, const VALUE_TYPE *x,
int xPitch, VALUE_TYPE beta)
{
dim3 block (THREAD_BLOCK, 1);
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
maxShmemSz = getGPUSharedMemPerBlock();
shrMemSize = MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch, y, yPitch,
alpha, cM, hdiaOffsets, hackSize, hackOffsets, rows, cols,
x, xPitch, beta);
}
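As a cross-check of the data layout used by the kernel and wrapper above, here is a plain host-side reference loop (an illustrative sketch, not part of the commit; hdia_spmm_reference is a hypothetical name) that computes the same z = alpha*A*x + beta*y for count pitched right-hand sides, assuming zero-based diagonal offsets as in the kernel:

/* Host-side reference for the HDIA spmm kernel above: x, y and z hold `count`
 * columns stored column-major with the given pitches (leading dimensions). */
static void hdia_spmm_reference(int count, double *z, int zPitch,
                                const double *y, int yPitch, double alpha,
                                const double *cM, const int *hdiaOffsets,
                                int hackSize, const int *hackOffsets,
                                int rows, int cols,
                                const double *x, int xPitch, double beta)
{
  for (int i = 0; i < rows; i++) {
    int hackId = i / hackSize;
    int lane   = i % hackSize;
    int first  = hackOffsets[hackId];      /* first diagonal of this hack  */
    int last   = hackOffsets[hackId + 1];  /* one past the last diagonal   */
    for (int k = 0; k < count; k++) {
      double acc = 0.0;
      for (int d = first; d < last; d++) {
        int column = hdiaOffsets[d] + i;   /* diagonal offset plus row     */
        if (column >= 0 && column < cols)
          acc += cM[d * hackSize + lane] * x[k * xPitch + column];
      }
      z[k * zPitch + i] = alpha * acc + beta * y[k * yPitch + i];
    }
  }
}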

@@ -106,11 +106,11 @@ extern __shared__ int dynShrMem[];
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
 int count,
 VALUE_TYPE* z,
 int zPitch,
 const VALUE_TYPE *y,
 int yPitch,
 VALUE_TYPE alpha,
 const VALUE_TYPE* cM,
 const int* rP,
@@ -120,9 +120,9 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
 const __device int* rIdx,
 int rows,
 const VALUE_TYPE *x,
 int xPitch,
 VALUE_TYPE beta,
 int baseIndex)
{
  VALUE_TYPE *px,*py, *pz;
  int cnt, c1;
@@ -149,7 +149,7 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
  py = (VALUE_TYPE *) y;
  pz = (VALUE_TYPE *) z;
  while (cnt > 2*MMBSZ) {
-   CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
+   CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
      <<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch,
        alpha, cM, rP, hackSize, hackOffsets,
        rS, rows, px, xPitch, beta, baseIndex);
@@ -160,7 +160,7 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
  }
  if (cnt > MMBSZ) {
    c1 = cnt/2;
-   CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
+   CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
      <<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch,
        alpha, cM, rP, hackSize, hackOffsets,
        rS, rows, px, xPitch, beta, baseIndex);
@@ -169,7 +169,7 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
  if (cnt > MMBSZ) {
    fprintf(stderr,"Invalid residual count %d\n",cnt);
  } else if (cnt > 0){
-   CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
+   CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
      <<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch,
        alpha, cM, rP, hackSize, hackOffsets,
        rS, rows, px, xPitch, beta, baseIndex);
@@ -182,11 +182,11 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
 int count,
 VALUE_TYPE* z,
 int zPitch,
 const VALUE_TYPE *y,
 int yPitch,
 VALUE_TYPE alpha,
 const VALUE_TYPE* cM,
 const int* rP,
@@ -196,9 +196,9 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
 const __device int* rIdx,
 int rows,
 const VALUE_TYPE *x,
 int xPitch,
 VALUE_TYPE beta,
 int baseIndex)
{
  VALUE_TYPE *px,*py, *pz;
  int cnt;
@@ -270,6 +270,5 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
        px, xPitch, beta, baseIndex);
  }
  cudaCheckError("CUDA error on hell_spmm");
}

@@ -71,10 +71,9 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
      px = (VALUE_TYPE *) x;
      for (int k=0; k<count; k++) {
        fetch = px[pointer];
-       temp[k][threadIdx.x] =
-         CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
+       temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
        px = px + xPitch;
      }
    }
    // Since z and y are accessed with the same offset by the same thread,
@@ -83,16 +82,15 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
    pz = z;
    if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
      for (int k=0; k<count; k++) {
        yVal = py[i];
-       pz[i] = CONCAT(VALUE_TYPE, _fma)(beta,
-         yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]));
+       pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
        py += yPitch;
        pz += zPitch;
      }
    } else {
      for (int k=0; k<count; k++) {
-       pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]);
+       pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
        pz += zPitch;
      }
    }
@@ -150,10 +148,9 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
      px = (VALUE_TYPE *) x;
      for (int k=0; k<count; k++) {
        fetch = px[pointer];
-       temp[k][threadIdx.x] =
-         CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
+       temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
        px = px + xPitch;
      }
    }
    // Since z and y are accessed with the same offset by the same thread,
@@ -162,15 +159,15 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
    pz = z;
    if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
      for (int k=0; k<count; k++) {
        yVal = py[i];
-       pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]));
+       pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
        py += yPitch;
        pz += zPitch;
      }
    else
      for (int k=0; k<count; k++) {
-       pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]);
+       pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
        pz += zPitch;
      }
  }
}
