Merge remote-tracking branch 'origin/cuda-multivect' into psblas-bgmres

gabrielequatrana 5 months ago
commit c08431d71e

@@ -35,6 +35,7 @@ module psb_c_base_mat_mod
use psb_base_mat_mod
use psb_c_base_vect_mod
use psb_c_base_multivect_mod
!> \namespace psb_base_mod \class psb_c_base_sparse_mat
@@ -103,33 +104,34 @@ module psb_c_base_mat_mod
!
! Computational methods: defined here but not implemented.
!
procedure, pass(a) :: vect_mv => psb_c_base_vect_mv
procedure, pass(a) :: csmv => psb_c_base_csmv
procedure, pass(a) :: csmm => psb_c_base_csmm
generic, public :: spmm => csmm, csmv, vect_mv
procedure, pass(a) :: in_vect_sv => psb_c_base_inner_vect_sv
procedure, pass(a) :: inner_cssv => psb_c_base_inner_cssv
procedure, pass(a) :: inner_cssm => psb_c_base_inner_cssm
generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
procedure, pass(a) :: vect_cssv => psb_c_base_vect_cssv
procedure, pass(a) :: cssv => psb_c_base_cssv
procedure, pass(a) :: cssm => psb_c_base_cssm
generic, public :: spsm => cssm, cssv, vect_cssv
procedure, pass(a) :: scals => psb_c_base_scals
procedure, pass(a) :: scalv => psb_c_base_scal
generic, public :: scal => scals, scalv
procedure, pass(a) :: maxval => psb_c_base_maxval
procedure, pass(a) :: spnmi => psb_c_base_csnmi
procedure, pass(a) :: spnm1 => psb_c_base_csnm1
procedure, pass(a) :: rowsum => psb_c_base_rowsum
procedure, pass(a) :: arwsum => psb_c_base_arwsum
procedure, pass(a) :: colsum => psb_c_base_colsum
procedure, pass(a) :: aclsum => psb_c_base_aclsum
procedure, pass(a) :: scalpid => psb_c_base_scalplusidentity
procedure, pass(a) :: spaxpby => psb_c_base_spaxpby
procedure, pass(a) :: cmpval => psb_c_base_cmpval
procedure, pass(a) :: cmpmat => psb_c_base_cmpmat
generic, public :: spcmp => cmpval, cmpmat
procedure, pass(a) :: vect_mv => psb_c_base_vect_mv
procedure, pass(a) :: multivect_mv => psb_c_base_multivect_mv
procedure, pass(a) :: csmv => psb_c_base_csmv
procedure, pass(a) :: csmm => psb_c_base_csmm
generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv
procedure, pass(a) :: in_vect_sv => psb_c_base_inner_vect_sv
procedure, pass(a) :: inner_cssv => psb_c_base_inner_cssv
procedure, pass(a) :: inner_cssm => psb_c_base_inner_cssm
generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
procedure, pass(a) :: vect_cssv => psb_c_base_vect_cssv
procedure, pass(a) :: cssv => psb_c_base_cssv
procedure, pass(a) :: cssm => psb_c_base_cssm
generic, public :: spsm => cssm, cssv, vect_cssv
procedure, pass(a) :: scals => psb_c_base_scals
procedure, pass(a) :: scalv => psb_c_base_scal
generic, public :: scal => scals, scalv
procedure, pass(a) :: maxval => psb_c_base_maxval
procedure, pass(a) :: spnmi => psb_c_base_csnmi
procedure, pass(a) :: spnm1 => psb_c_base_csnm1
procedure, pass(a) :: rowsum => psb_c_base_rowsum
procedure, pass(a) :: arwsum => psb_c_base_arwsum
procedure, pass(a) :: colsum => psb_c_base_colsum
procedure, pass(a) :: aclsum => psb_c_base_aclsum
procedure, pass(a) :: scalpid => psb_c_base_scalplusidentity
procedure, pass(a) :: spaxpby => psb_c_base_spaxpby
procedure, pass(a) :: cmpval => psb_c_base_cmpval
procedure, pass(a) :: cmpmat => psb_c_base_cmpmat
generic, public :: spcmp => cmpval, cmpmat
end type psb_c_base_sparse_mat
private :: c_base_mat_sync, c_base_mat_is_host, c_base_mat_is_dev, &
@@ -1239,6 +1241,42 @@ module psb_c_base_mat_mod
end subroutine psb_c_base_vect_mv
end interface
!> Function multivect_mv:
!! \memberof psb_c_base_sparse_mat
!! \brief Product by an encapsulated array type(psb_c_multivect_type)
!!
!! Compute
!! Y = alpha*op(A)*X + beta*Y
!! Usually the unwrapping of the encapsulated multivector is done
!! here, so that derived classes need only provide the
!! versions working on standard arrays.
!! Must be overridden explicitly in case of non-standard memory
!! management, e.g. external memory allocation on attached
!! processors such as GPUs.
!!
!! \param alpha Scaling factor for Ax
!! \param A the input sparse matrix
!! \param x the input X
!! \param beta Scaling factor for y
!! \param y the input/output Y
!! \param info return code
!! \param trans [N] Whether to use A (N), its transpose (T)
!! or its conjugate transpose (C)
!!
!
interface
subroutine psb_c_base_multivect_mv(alpha,a,x,beta,y,info,trans)
import
class(psb_c_base_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_base_multivect_mv
end interface
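A minimal usage sketch of the new generic (illustrative only, not part of this changeset; demo_multivect_spmm, amat, x, y are hypothetical names, and all objects are assumed already assembled):
subroutine demo_multivect_spmm(amat,x,y,info)
  use psb_base_mod
  implicit none
  class(psb_c_base_sparse_mat), intent(in) :: amat
  class(psb_c_base_multivect_type), intent(inout) :: x, y
  integer(psb_ipk_), intent(out) :: info
  ! Y = 1*A*X + 0*Y across all columns of X in a single call;
  ! the spmm generic resolves to multivect_mv from the argument types.
  call amat%spmm(cone,x,czero,y,info,trans='N')
end subroutine demo_multivect_spmm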
!
!> Function cssm:
!! \memberof psb_c_base_sparse_mat

@@ -35,6 +35,7 @@ module psb_s_base_mat_mod
use psb_base_mat_mod
use psb_s_base_vect_mod
use psb_s_base_multivect_mod
!> \namespace psb_base_mod \class psb_s_base_sparse_mat
@@ -103,33 +104,34 @@ module psb_s_base_mat_mod
!
! Computational methods: defined here but not implemented.
!
procedure, pass(a) :: vect_mv => psb_s_base_vect_mv
procedure, pass(a) :: csmv => psb_s_base_csmv
procedure, pass(a) :: csmm => psb_s_base_csmm
generic, public :: spmm => csmm, csmv, vect_mv
procedure, pass(a) :: in_vect_sv => psb_s_base_inner_vect_sv
procedure, pass(a) :: inner_cssv => psb_s_base_inner_cssv
procedure, pass(a) :: inner_cssm => psb_s_base_inner_cssm
generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
procedure, pass(a) :: vect_cssv => psb_s_base_vect_cssv
procedure, pass(a) :: cssv => psb_s_base_cssv
procedure, pass(a) :: cssm => psb_s_base_cssm
generic, public :: spsm => cssm, cssv, vect_cssv
procedure, pass(a) :: scals => psb_s_base_scals
procedure, pass(a) :: scalv => psb_s_base_scal
generic, public :: scal => scals, scalv
procedure, pass(a) :: maxval => psb_s_base_maxval
procedure, pass(a) :: spnmi => psb_s_base_csnmi
procedure, pass(a) :: spnm1 => psb_s_base_csnm1
procedure, pass(a) :: rowsum => psb_s_base_rowsum
procedure, pass(a) :: arwsum => psb_s_base_arwsum
procedure, pass(a) :: colsum => psb_s_base_colsum
procedure, pass(a) :: aclsum => psb_s_base_aclsum
procedure, pass(a) :: scalpid => psb_s_base_scalplusidentity
procedure, pass(a) :: spaxpby => psb_s_base_spaxpby
procedure, pass(a) :: cmpval => psb_s_base_cmpval
procedure, pass(a) :: cmpmat => psb_s_base_cmpmat
generic, public :: spcmp => cmpval, cmpmat
procedure, pass(a) :: vect_mv => psb_s_base_vect_mv
procedure, pass(a) :: multivect_mv => psb_s_base_multivect_mv
procedure, pass(a) :: csmv => psb_s_base_csmv
procedure, pass(a) :: csmm => psb_s_base_csmm
generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv
procedure, pass(a) :: in_vect_sv => psb_s_base_inner_vect_sv
procedure, pass(a) :: inner_cssv => psb_s_base_inner_cssv
procedure, pass(a) :: inner_cssm => psb_s_base_inner_cssm
generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
procedure, pass(a) :: vect_cssv => psb_s_base_vect_cssv
procedure, pass(a) :: cssv => psb_s_base_cssv
procedure, pass(a) :: cssm => psb_s_base_cssm
generic, public :: spsm => cssm, cssv, vect_cssv
procedure, pass(a) :: scals => psb_s_base_scals
procedure, pass(a) :: scalv => psb_s_base_scal
generic, public :: scal => scals, scalv
procedure, pass(a) :: maxval => psb_s_base_maxval
procedure, pass(a) :: spnmi => psb_s_base_csnmi
procedure, pass(a) :: spnm1 => psb_s_base_csnm1
procedure, pass(a) :: rowsum => psb_s_base_rowsum
procedure, pass(a) :: arwsum => psb_s_base_arwsum
procedure, pass(a) :: colsum => psb_s_base_colsum
procedure, pass(a) :: aclsum => psb_s_base_aclsum
procedure, pass(a) :: scalpid => psb_s_base_scalplusidentity
procedure, pass(a) :: spaxpby => psb_s_base_spaxpby
procedure, pass(a) :: cmpval => psb_s_base_cmpval
procedure, pass(a) :: cmpmat => psb_s_base_cmpmat
generic, public :: spcmp => cmpval, cmpmat
end type psb_s_base_sparse_mat
private :: s_base_mat_sync, s_base_mat_is_host, s_base_mat_is_dev, &
@@ -1239,6 +1241,42 @@ module psb_s_base_mat_mod
end subroutine psb_s_base_vect_mv
end interface
!> Function multivect_mv:
!! \memberof psb_s_base_sparse_mat
!! \brief Product by an encapsulated array type(psb_s_multivect_type)
!!
!! Compute
!! Y = alpha*op(A)*X + beta*Y
!! Usually the unwrapping of the encapsulated multivector is done
!! here, so that derived classes need only provide the
!! versions working on standard arrays.
!! Must be overridden explicitly in case of non-standard memory
!! management, e.g. external memory allocation on attached
!! processors such as GPUs.
!!
!! \param alpha Scaling factor for Ax
!! \param A the input sparse matrix
!! \param x the input X
!! \param beta Scaling factor for y
!! \param y the input/output Y
!! \param info return code
!! \param trans [N] Whether to use A (N), its transpose (T)
!! or its conjugate transpose (C)
!!
!
interface
subroutine psb_s_base_multivect_mv(alpha,a,x,beta,y,info,trans)
import
class(psb_s_base_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_base_multivect_mv
end interface
!
!> Function cssm:
!! \memberof psb_s_base_sparse_mat

@@ -35,6 +35,7 @@ module psb_z_base_mat_mod
use psb_base_mat_mod
use psb_z_base_vect_mod
use psb_z_base_multivect_mod
!> \namespace psb_base_mod \class psb_z_base_sparse_mat
@@ -103,33 +104,34 @@ module psb_z_base_mat_mod
!
! Computational methods: defined here but not implemented.
!
procedure, pass(a) :: vect_mv => psb_z_base_vect_mv
procedure, pass(a) :: csmv => psb_z_base_csmv
procedure, pass(a) :: csmm => psb_z_base_csmm
generic, public :: spmm => csmm, csmv, vect_mv
procedure, pass(a) :: in_vect_sv => psb_z_base_inner_vect_sv
procedure, pass(a) :: inner_cssv => psb_z_base_inner_cssv
procedure, pass(a) :: inner_cssm => psb_z_base_inner_cssm
generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
procedure, pass(a) :: vect_cssv => psb_z_base_vect_cssv
procedure, pass(a) :: cssv => psb_z_base_cssv
procedure, pass(a) :: cssm => psb_z_base_cssm
generic, public :: spsm => cssm, cssv, vect_cssv
procedure, pass(a) :: scals => psb_z_base_scals
procedure, pass(a) :: scalv => psb_z_base_scal
generic, public :: scal => scals, scalv
procedure, pass(a) :: maxval => psb_z_base_maxval
procedure, pass(a) :: spnmi => psb_z_base_csnmi
procedure, pass(a) :: spnm1 => psb_z_base_csnm1
procedure, pass(a) :: rowsum => psb_z_base_rowsum
procedure, pass(a) :: arwsum => psb_z_base_arwsum
procedure, pass(a) :: colsum => psb_z_base_colsum
procedure, pass(a) :: aclsum => psb_z_base_aclsum
procedure, pass(a) :: scalpid => psb_z_base_scalplusidentity
procedure, pass(a) :: spaxpby => psb_z_base_spaxpby
procedure, pass(a) :: cmpval => psb_z_base_cmpval
procedure, pass(a) :: cmpmat => psb_z_base_cmpmat
generic, public :: spcmp => cmpval, cmpmat
procedure, pass(a) :: vect_mv => psb_z_base_vect_mv
procedure, pass(a) :: multivect_mv => psb_z_base_multivect_mv
procedure, pass(a) :: csmv => psb_z_base_csmv
procedure, pass(a) :: csmm => psb_z_base_csmm
generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv
procedure, pass(a) :: in_vect_sv => psb_z_base_inner_vect_sv
procedure, pass(a) :: inner_cssv => psb_z_base_inner_cssv
procedure, pass(a) :: inner_cssm => psb_z_base_inner_cssm
generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
procedure, pass(a) :: vect_cssv => psb_z_base_vect_cssv
procedure, pass(a) :: cssv => psb_z_base_cssv
procedure, pass(a) :: cssm => psb_z_base_cssm
generic, public :: spsm => cssm, cssv, vect_cssv
procedure, pass(a) :: scals => psb_z_base_scals
procedure, pass(a) :: scalv => psb_z_base_scal
generic, public :: scal => scals, scalv
procedure, pass(a) :: maxval => psb_z_base_maxval
procedure, pass(a) :: spnmi => psb_z_base_csnmi
procedure, pass(a) :: spnm1 => psb_z_base_csnm1
procedure, pass(a) :: rowsum => psb_z_base_rowsum
procedure, pass(a) :: arwsum => psb_z_base_arwsum
procedure, pass(a) :: colsum => psb_z_base_colsum
procedure, pass(a) :: aclsum => psb_z_base_aclsum
procedure, pass(a) :: scalpid => psb_z_base_scalplusidentity
procedure, pass(a) :: spaxpby => psb_z_base_spaxpby
procedure, pass(a) :: cmpval => psb_z_base_cmpval
procedure, pass(a) :: cmpmat => psb_z_base_cmpmat
generic, public :: spcmp => cmpval, cmpmat
end type psb_z_base_sparse_mat
private :: z_base_mat_sync, z_base_mat_is_host, z_base_mat_is_dev, &
@@ -1239,6 +1241,42 @@ module psb_z_base_mat_mod
end subroutine psb_z_base_vect_mv
end interface
!> Function multivect_mv:
!! \memberof psb_z_base_sparse_mat
!! \brief Product by an encapsulated array type(psb_z_multivect_type)
!!
!! Compute
!! Y = alpha*op(A)*X + beta*Y
!! Usually the unwrapping of the encapsulated multivector is done
!! here, so that derived classes need only provide the
!! versions working on standard arrays.
!! Must be overridden explicitly in case of non-standard memory
!! management, e.g. external memory allocation on attached
!! processors such as GPUs.
!!
!! \param alpha Scaling factor for Ax
!! \param A the input sparse matrix
!! \param x the input X
!! \param beta Scaling factor for y
!! \param y the input/output Y
!! \param info return code
!! \param trans [N] Whether to use A (N), its transpose (T)
!! or its conjugate transpose (C)
!!
!
interface
subroutine psb_z_base_multivect_mv(alpha,a,x,beta,y,info,trans)
import
class(psb_z_base_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_base_multivect_mv
end interface
!
!> Function cssm:
!! \memberof psb_z_base_sparse_mat

@@ -2012,6 +2012,26 @@ subroutine psb_c_base_vect_mv(alpha,a,x,beta,y,info,trans)
call y%set_host()
end subroutine psb_c_base_vect_mv
subroutine psb_c_base_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_error_mod
use psb_const_mod
use psb_c_base_mat_mod, psb_protect_name => psb_c_base_multivect_mv
implicit none
class(psb_c_base_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
! For the time being we just throw everything back
! onto the normal routines.
call x%sync()
call y%sync()
call a%spmm(alpha,x%v,beta,y%v,info,trans)
call y%set_host()
end subroutine psb_c_base_multivect_mv
subroutine psb_c_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
use psb_c_base_mat_mod, psb_protect_name => psb_c_base_vect_cssv
use psb_c_base_vect_mod

@@ -2012,6 +2012,26 @@ subroutine psb_s_base_vect_mv(alpha,a,x,beta,y,info,trans)
call y%set_host()
end subroutine psb_s_base_vect_mv
subroutine psb_s_base_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_error_mod
use psb_const_mod
use psb_s_base_mat_mod, psb_protect_name => psb_s_base_multivect_mv
implicit none
class(psb_s_base_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
! For the time being we just throw everything back
! onto the normal routines.
call x%sync()
call y%sync()
call a%spmm(alpha,x%v,beta,y%v,info,trans)
call y%set_host()
end subroutine psb_s_base_multivect_mv
subroutine psb_s_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
use psb_s_base_mat_mod, psb_protect_name => psb_s_base_vect_cssv
use psb_s_base_vect_mod

@@ -2012,6 +2012,26 @@ subroutine psb_z_base_vect_mv(alpha,a,x,beta,y,info,trans)
call y%set_host()
end subroutine psb_z_base_vect_mv
subroutine psb_z_base_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_error_mod
use psb_const_mod
use psb_z_base_mat_mod, psb_protect_name => psb_z_base_multivect_mv
implicit none
class(psb_z_base_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
! For the time being we just throw everything back
! onto the normal routines.
call x%sync()
call y%sync()
call a%spmm(alpha,x%v,beta,y%v,info,trans)
call y%set_host()
end subroutine psb_z_base_multivect_mv
subroutine psb_z_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
use psb_z_base_mat_mod, psb_protect_name => psb_z_base_vect_cssv
use psb_z_base_vect_mod

configure

@@ -11710,7 +11710,7 @@ fi
CCOPT : ${CCOPT}
CUDA : ${HAVE_CUDA}
CUDA_CC : ${CUDA_CC}
CUDA_CC : ${pac_cv_cudacc}
BLAS : ${BLAS_LIBS}
@@ -11743,7 +11743,7 @@ $as_echo "$as_me:
CCOPT : ${CCOPT}
CUDA : ${HAVE_CUDA}
CUDA_CC : ${CUDA_CC}
CUDA_CC : ${pac_cv_cudacc}
BLAS : ${BLAS_LIBS}

@@ -970,7 +970,7 @@ AC_MSG_NOTICE([
CCOPT : ${CCOPT}
CUDA : ${HAVE_CUDA}
CUDA_CC : ${CUDA_CC}
CUDA_CC : ${pac_cv_cudacc}
BLAS : ${BLAS_LIBS}

@@ -375,6 +375,13 @@ int getGPUMaxRegistersPerBlock()
return(count);
}
int getGPUSharedMemPerBlock()
{ int count=0;
if (prop!=NULL)
count = prop->sharedMemPerBlock;
return(count);
}
void cpyGPUNameString(char *cstring)
{
*cstring='\0';

@@ -65,6 +65,7 @@ int getGPUWarpSize();
int getGPUMaxThreadsPerBlock();
int getGPUMaxThreadsPerMP();
int getGPUMaxRegistersPerBlock();
int getGPUSharedMemPerBlock();
void cpyGPUNameString(char *cstring);
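The new query can be reached from Fortran through a thin iso_c_binding interface, as in this sketch (the interface block below is illustrative, not necessarily the binding the plugin ships):
program demo_shmem
  use iso_c_binding
  implicit none
  interface
    function getGPUSharedMemPerBlock() &
         & result(res) bind(c,name='getGPUSharedMemPerBlock')
      use iso_c_binding
      integer(c_int) :: res
    end function getGPUSharedMemPerBlock
  end interface
  ! bytes of shared memory per block on the current device, 0 if no device is set up
  write(*,*) 'shared memory per block: ',getGPUSharedMemPerBlock(),' bytes'
end program demo_shmem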

@@ -148,6 +148,9 @@ int FallocEllDevice(void** deviceMat,unsigned int rows, unsigned int maxRowSize,
return(i);
}
//
// Single Precision Float
//
void sspmdmm_gpu(float *z,int s, int vPitch, float *y, float alpha, float* cM, int* rP, int* rS,
int avgRowSize, int maxRowSize, int rows, int pitch, float *x, float beta, int firstIndex)
{
@@ -168,7 +171,7 @@ void sspmdmm_gpu(float *z,int s, int vPitch, float *y, float alpha, float* cM, i
x += vPitch;
}
}
//new
int spmvEllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{ int i=SPGPU_SUCCESS;
@@ -191,7 +194,31 @@ int spmvEllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
return(i);
}
int spmmEllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{
struct EllDevice *devMat = (struct EllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuSellspmm(handle, y->count_, (float *)y->v_, y->pitch_,
(float*)y->v_, y->pitch_, alpha, (float*)devMat->cM,
devMat->rP, devMat->cMPitch, devMat->rPPitch,
devMat->rS, NULL, devMat->avgRowSize, devMat->maxRowSize,
devMat->rows, (float*)x->v_, x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//
// Double Precision
//
void
dspmdmm_gpu (double *z,int s, int vPitch, double *y, double alpha, double* cM, int* rP,
int* rS, int avgRowSize, int maxRowSize, int rows, int pitch,
@@ -237,6 +264,31 @@ int spmvEllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
return SPGPU_SUCCESS;
}
int spmmEllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY)
{
struct EllDevice *devMat = (struct EllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuDellspmm(handle, y->count_, (double *)y->v_, y->pitch_,
(double*)y->v_, y->pitch_, alpha, (double*)devMat->cM,
devMat->rP, devMat->cMPitch, devMat->rPPitch,
devMat->rS, NULL, devMat->avgRowSize, devMat->maxRowSize,
devMat->rows, (double*)x->v_, x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//
// Single Precision Float Complex
//
void
cspmdmm_gpu (cuFloatComplex *z, int s, int vPitch, cuFloatComplex *y,
cuFloatComplex alpha, cuFloatComplex* cM,
@@ -276,6 +328,31 @@ int spmvEllDeviceFloatComplex(void *deviceMat, float complex alpha, void* device
return SPGPU_SUCCESS;
}
int spmmEllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX,
cuFloatComplex beta, void* deviceY)
{
struct EllDevice *devMat = (struct EllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuCellspmm(handle, y->count_, (cuFloatComplex *)y->v_, y->pitch_,
(cuFloatComplex*)y->v_, y->pitch_, alpha, (cuFloatComplex*)devMat->cM,
devMat->rP, devMat->cMPitch, devMat->rPPitch,
devMat->rS, NULL, devMat->avgRowSize, devMat->maxRowSize,
devMat->rows, (cuFloatComplex*)x->v_, x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//
// Double Precision Complex
//
void
zspmdmm_gpu (cuDoubleComplex *z, int s, int vPitch, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* cM,
int* rP, int* rS, int avgRowSize, int maxRowSize, int rows, int pitch,
@@ -314,6 +391,28 @@ int spmvEllDeviceDoubleComplex(void *deviceMat, double complex alpha, void* devi
return SPGPU_SUCCESS;
}
int spmmEllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX,
cuDoubleComplex beta, void* deviceY)
{
struct EllDevice *devMat = (struct EllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuZellspmm(handle, y->count_, (cuDoubleComplex *)y->v_, y->pitch_,
(cuDoubleComplex*)y->v_, y->pitch_, alpha, (cuDoubleComplex*)devMat->cM,
devMat->rP, devMat->cMPitch, devMat->rPPitch,
devMat->rS, NULL, devMat->avgRowSize, devMat->maxRowSize,
devMat->rows, (cuDoubleComplex*)x->v_, x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int writeEllDeviceFloat(void* deviceMat, float* val, int* ja, int ldj, int* irn, int *idiag)
{ int i;
struct EllDevice *devMat = (struct EllDevice *) deviceMat;

@@ -113,16 +113,6 @@ int readEllDeviceDouble(void* deviceMat, double* val, int* ja, int ldj, int* irn
int readEllDeviceFloatComplex(void* deviceMat, float complex* val, int* ja, int ldj, int* irn, int *idiag);
int readEllDeviceDoubleComplex(void* deviceMat, double complex* val, int* ja, int ldj, int* irn, int *idiag);
int spmvEllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY);
int spmvEllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY);
int spmvEllDeviceFloatComplex(void *deviceMat, float complex alpha, void* deviceX,
float complex beta, void* deviceY);
int spmvEllDeviceDoubleComplex(void *deviceMat, double complex alpha, void* deviceX,
double complex beta, void* deviceY);
int psiCopyCooToElgFloat(int nr, int nc, int nza, int hacksz, int ldv, int nzm, int *irn,
int *idisp, int *ja, float *val, void *deviceMat);

@@ -318,4 +318,35 @@ module elldev_mod
end function spmvEllDeviceDoubleComplex
end interface
interface spmmEllDevice
function spmmEllDeviceFloat(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmEllDeviceFloat')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_float),value :: alpha, beta
end function spmmEllDeviceFloat
function spmmEllDeviceDouble(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmEllDeviceDouble')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_double),value :: alpha, beta
end function spmmEllDeviceDouble
function spmmEllDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmEllDeviceFloatComplex')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
complex(c_float_complex),value :: alpha, beta
end function spmmEllDeviceFloatComplex
function spmmEllDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmEllDeviceDoubleComplex')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
complex(c_double_complex),value :: alpha, beta
end function spmmEllDeviceDoubleComplex
end interface
end module elldev_mod
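Host-side usage of the new bindings mirrors the existing spmv path, as in this sketch (assumptions: deviceMat already holds an assembled ELL device matrix, x and y are conforming host arrays, and demo_spmm_ell is a hypothetical name):
subroutine demo_spmm_ell(deviceMat,x,y,info)
  use iso_c_binding
  use elldev_mod
  use psb_vectordev_mod
  implicit none
  type(c_ptr) :: deviceMat, gpX, gpY
  real(c_double), intent(in) :: x(:,:)
  real(c_double), intent(inout) :: y(:,:)
  integer :: info, nxy
  nxy = min(size(x,2),size(y,2))
  ! Stage X/Y on the device, compute Y = 2*A*X + 1*Y, then copy Y back.
  info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_double)
  if (info == 0) info = writeMultiVecDevice(gpX,x,size(x,1))
  if (info == 0) info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_double)
  if (info == 0) info = writeMultiVecDevice(gpY,y,size(y,1))
  if (info == 0) info = spmmEllDevice(deviceMat,2.0_c_double,gpX,1.0_c_double,gpY)
  if (info == 0) info = readMultiVecDevice(gpY,y,size(y,1))
  call freeMultiVecDevice(gpX)
  call freeMultiVecDevice(gpY)
end subroutine demo_spmm_ell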

@@ -267,6 +267,27 @@ int spmvHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
return SPGPU_SUCCESS;
}
int spmmHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY)
{
struct HdiagDevice *devMat = (struct HdiagDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuDhdiaspmm (handle, y->count_, (double*)y->v_, y->pitch_,
(double *)y->v_, y->pitch_, alpha, (double *)devMat->cM,
devMat->hdiaOffsets, devMat->hackSize, devMat->hackOffsets,
devMat->rows, devMat->cols, (double *)x->v_, x->pitch_, beta);
return SPGPU_SUCCESS;
}
int writeHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets)
{ int i=0,fo,fa,j,k,p;
char buf_a[255], buf_o[255],tmp[255];
@@ -384,3 +405,23 @@ int spmvHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
return SPGPU_SUCCESS;
}
int spmmHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{
struct HdiagDevice *devMat = (struct HdiagDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuShdiaspmm (handle, y->count_, (float*)y->v_, y->pitch_,
(float *)y->v_, y->pitch_, alpha, (float *)devMat->cM,
devMat->hdiaOffsets, devMat->hackSize, devMat->hackOffsets,
devMat->rows, devMat->cols, (float *)x->v_, x->pitch_, beta);
return SPGPU_SUCCESS;
}

@@ -95,12 +95,13 @@ int allocHdiagDevice(void ** remoteMatrix, HdiagDeviceParams* params);
void freeHdiagDevice(void* remoteMatrix);
int writeHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets);
int spmvHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY);
int writeHdiagDeviceDouble(void* deviceMat, double* val, int* hdiaOffsets, int *hackOffsets);
int spmvHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY);
//int writeHdiagDeviceFloatComplex(void* deviceMat, float complex* val, int* hdiaOffsets, int *hackOffsets);
//int writeHdiagDeviceDoubleComplex(void* deviceMat, double complex* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceDouble(void* deviceMat, double* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceFloatComplex(void* deviceMat, float complex* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceDoubleComplex(void* deviceMat, double complex* val, int* hdiaOffsets, int *hackOffsets);
#endif

@@ -196,4 +196,35 @@ module hdiagdev_mod
!!$ end function spmvHdiagDeviceDoubleComplex
end interface spmvHdiagDevice
interface spmmHdiagDevice
function spmmHdiagDeviceFloat(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHdiagDeviceFloat')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_float),value :: alpha, beta
end function spmmHdiagDeviceFloat
function spmmHdiagDeviceDouble(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHdiagDeviceDouble')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_double),value :: alpha, beta
end function spmmHdiagDeviceDouble
!!$ function spmmHdiagDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
!!$ & result(res) bind(c,name='spmmHdiagDeviceFloatComplex')
!!$ use iso_c_binding
!!$ integer(c_int) :: res
!!$ type(c_ptr), value :: deviceMat, x, y
!!$ complex(c_float_complex),value :: alpha, beta
!!$ end function spmmHdiagDeviceFloatComplex
!!$ function spmmHdiagDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
!!$ & result(res) bind(c,name='spmmHdiagDeviceDoubleComplex')
!!$ use iso_c_binding
!!$ integer(c_int) :: res
!!$ type(c_ptr), value :: deviceMat, x, y
!!$ complex(c_double_complex),value :: alpha, beta
!!$ end function spmmHdiagDeviceDoubleComplex
end interface spmmHdiagDevice
end module hdiagdev_mod

@@ -156,7 +156,9 @@ int FallocHllDevice(void** deviceMat,unsigned int hksize, unsigned int rows, un
return(i);
}
//
// Single Precision Float
//
int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{
@@ -170,101 +172,170 @@ int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
/*dspmdmm_gpu ((double *)z->v_, y->count_, y->pitch_, (double *)y->v_, alpha, (double *)devMat->cM,
devMat->rP, devMat->rS, devMat->rows, devMat->pitch, (double *)x->v_, beta,
devMat->baseIndex);*/
spgpuShellspmv (handle, (float *)y->v_, (float *)y->v_, alpha,
(float *)devMat->cM, devMat->rP,
devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (float *)x->v_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int spmmHllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuShellspmm(handle, y->count_, (float *)y->v_, y->pitch_,
(float*)y->v_, y->pitch_, alpha, (float*)devMat->cM,
devMat->rP, devMat->hackSize, devMat->hackOffs,
devMat->rS, NULL, devMat->rows, (float*)x->v_,
x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//new
//
// Double Precision
//
int spmvHllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuDhellspmv (handle, (double *)y->v_, (double *)y->v_, alpha,
(double *)devMat->cM, devMat->rP,
devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (double *)x->v_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int spmmHllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuDhellspmm(handle, y->count_, (double *)y->v_, y->pitch_,
(double *)y->v_, y->pitch_, alpha, (double *)devMat->cM,
devMat->rP, devMat->hackSize, devMat->hackOffs,
devMat->rS, NULL, devMat->rows, (double *)x->v_,
x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//
// Single Precision Float Complex
//
int spmvHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX,
cuFloatComplex beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuChellspmv (handle, (cuFloatComplex *)y->v_, (cuFloatComplex *)y->v_, alpha,
(cuFloatComplex *)devMat->cM, devMat->rP,
devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (cuFloatComplex *)x->v_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int spmmHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX,
cuFloatComplex beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuChellspmm(handle, y->count_, (cuFloatComplex *)y->v_, y->pitch_,
(cuFloatComplex*)y->v_, y->pitch_, alpha, (cuFloatComplex*)devMat->cM,
devMat->rP, devMat->hackSize, devMat->hackOffs,
devMat->rS, NULL, devMat->rows, (cuFloatComplex*)x->v_,
x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//
// Double Precision Complex
//
int spmvHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX,
cuDoubleComplex beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuZhellspmv (handle, (cuDoubleComplex *)y->v_, (cuDoubleComplex *)y->v_, alpha,
(cuDoubleComplex *)devMat->cM, devMat->rP,
devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (cuDoubleComplex *)x->v_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int spmmHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX,
cuDoubleComplex beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuZhellspmm(handle, y->count_, (cuDoubleComplex *)y->v_, y->pitch_,
(cuDoubleComplex*)y->v_, y->pitch_, alpha, (cuDoubleComplex*)devMat->cM,
devMat->rP, devMat->hackSize, devMat->hackOffs,
devMat->rS, NULL, devMat->rows, (cuDoubleComplex*)x->v_,
x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}

@@ -265,4 +265,40 @@ module hlldev_mod
end interface
interface spmmHllDevice
function spmmHllDeviceFloat(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHllDeviceFloat')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_float),value :: alpha, beta
end function spmmHllDeviceFloat
function spmmHllDeviceDouble(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHllDeviceDouble')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_double),value :: alpha, beta
end function spmmHllDeviceDouble
function spmmHllDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHllDeviceFloatComplex')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
complex(c_float_complex),value :: alpha, beta
end function spmmHllDeviceFloatComplex
function spmmHllDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHllDeviceDoubleComplex')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
complex(c_double_complex),value :: alpha, beta
end function spmmHllDeviceDoubleComplex
end interface
end module hlldev_mod

@@ -275,19 +275,20 @@ psb_d_cuda_cp_hdiag_from_coo.o \
psb_d_cuda_mv_hdiag_from_coo.o \
psb_d_cuda_hdiag_to_gpu.o \
psb_d_cuda_hdiag_csmv.o \
psb_d_cuda_hdiag_csmm.o \
psb_d_cuda_hdiag_mold.o \
psb_d_cuda_hdiag_vect_mv.o \
psb_s_cuda_cp_hdiag_from_coo.o \
psb_s_cuda_mv_hdiag_from_coo.o \
psb_s_cuda_hdiag_to_gpu.o \
psb_s_cuda_hdiag_csmv.o \
psb_s_cuda_hdiag_csmm.o \
psb_s_cuda_hdiag_mold.o \
psb_s_cuda_hdiag_vect_mv.o \
psb_s_cuda_dnsg_mat_impl.o \
psb_d_cuda_dnsg_mat_impl.o \
psb_c_cuda_dnsg_mat_impl.o \
psb_z_cuda_dnsg_mat_impl.o
objs: $(OBJS)
lib: objs

@@ -98,16 +98,16 @@ subroutine psb_c_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@@ -119,3 +119,94 @@ subroutine psb_c_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_c_cuda_elg_vect_mv
subroutine psb_c_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use elldev_mod
use psb_vectordev_mod
use psb_c_cuda_elg_mat_mod, psb_protect_name => psb_c_cuda_elg_multivect_mv
use psb_c_cuda_multivect_mod
implicit none
class(psb_c_cuda_elg_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_spk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='c_cuda_elg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= czero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_c_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_c_multivect_cuda)
select type(yy => y)
type is (psb_c_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= czero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_c_cuda_elg_multivect_mv

@@ -0,0 +1,123 @@
! Parallel Sparse BLAS GPU plugin
! (C) Copyright 2013
!
! Salvatore Filippone
! Alessandro Fanfarillo
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
subroutine psb_c_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_c_cuda_hdiag_mat_mod, psb_protect_name => psb_c_cuda_hdiag_csmm
implicit none
class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta, x(:,:)
complex(psb_spk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
character :: trans_
integer(psb_ipk_) :: i,j,k,m,n, nnz, ir, jc, nxy
complex(psb_spk_), allocatable :: acc(:)
type(c_ptr) :: gpX, gpY
logical :: tra
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='c_cuda_hdiag_csmm'
logical, parameter :: debug=.false.
info = psb_success_
call psb_erractionsave(err_act)
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
m = a%get_ncols()
n = a%get_nrows()
else
n = a%get_ncols()
m = a%get_nrows()
end if
if (size(x,1)<n) then
info = 36
call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
goto 9999
end if
if (size(y,1)<m) then
info = 36
call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
goto 9999
end if
if (tra) then
if (a%is_dev()) call a%sync()
call a%psb_c_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
else
!
! Just to test, move X/Y to/from the GPU.
!
nxy = min(size(x,2),size(y,2))
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmmHdiagDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)
endif
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_c_cuda_hdiag_csmm

@@ -115,3 +115,94 @@ subroutine psb_c_cuda_hdiag_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_c_cuda_hdiag_vect_mv
subroutine psb_c_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_c_cuda_hdiag_mat_mod, psb_protect_name => psb_c_cuda_hdiag_multivect_mv
use psb_c_cuda_multivect_mod
implicit none
class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_spk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='c_cuda_hdiag_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= czero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_c_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_c_multivect_cuda)
select type(yy => y)
type is (psb_c_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= czero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmHdiagDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHDIAGDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_c_cuda_hdiag_multivect_mv

@@ -98,16 +98,16 @@ subroutine psb_c_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@@ -117,3 +117,92 @@ subroutine psb_c_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_c_cuda_hlg_vect_mv
subroutine psb_c_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hlldev_mod
use psb_vectordev_mod
use psb_c_cuda_hlg_mat_mod, psb_protect_name => psb_c_cuda_hlg_multivect_mv
use psb_c_cuda_multivect_mod
implicit none
class(psb_c_cuda_hlg_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_spk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='c_cuda_hlg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (.not.x%is_host()) call x%sync()
if (beta /= czero) then
if (.not.y%is_host()) call y%sync()
end if
if (a%is_dev()) call a%sync()
call a%psb_c_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_c_multivect_cuda)
select type(yy => y)
type is (psb_c_multivect_cuda)
if (xx%is_host()) call xx%sync()
if (beta /= czero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
rx = xx%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
rx = x%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_c_cuda_hlg_multivect_mv

@@ -105,7 +105,7 @@ subroutine psb_d_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999

@@ -176,7 +176,7 @@ subroutine psb_d_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
if (beta /= dzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmvEllDevice(a%deviceMat,alpha,xx%deviceVect,&
info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&

@@ -0,0 +1,123 @@
! Parallel Sparse BLAS GPU plugin
! (C) Copyright 2013
!
! Salvatore Filippone
! Alessandro Fanfarillo
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
subroutine psb_d_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_d_cuda_hdiag_mat_mod, psb_protect_name => psb_d_cuda_hdiag_csmm
implicit none
class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
real(psb_dpk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
character :: trans_
integer(psb_ipk_) :: i,j,k,m,n, nnz, ir, jc, nxy
real(psb_dpk_), allocatable :: acc(:)
type(c_ptr) :: gpX, gpY
logical :: tra
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='d_cuda_hdiag_csmm'
logical, parameter :: debug=.false.
info = psb_success_
call psb_erractionsave(err_act)
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
m = a%get_ncols()
n = a%get_nrows()
else
n = a%get_ncols()
m = a%get_nrows()
end if
if (size(x,1)<n) then
info = 36
call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
goto 9999
end if
if (size(y,1)<m) then
info = 36
call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
goto 9999
end if
if (tra) then
if (a%is_dev()) call a%sync()
call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
else
!
! Just to test, move X/Y to/from the GPU.
!
nxy = min(size(x,2),size(y,2))
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_double)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_double)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmmHdiagDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)
endif
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_d_cuda_hdiag_csmm

@ -115,3 +115,94 @@ subroutine psb_d_cuda_hdiag_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_d_cuda_hdiag_vect_mv
subroutine psb_d_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_d_cuda_hdiag_mat_mod, psb_protect_name => psb_d_cuda_hdiag_multivect_mv
use psb_d_cuda_multivect_mod
implicit none
class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_dpk_), intent(in) :: alpha, beta
class(psb_d_base_multivect_type), intent(inout) :: x
class(psb_d_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
real(psb_dpk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='d_cuda_hdiag_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= dzero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_d_multivect_cuda)
select type(yy => y)
type is (psb_d_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= dzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmHdiagDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHDIAGDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_d_cuda_hdiag_multivect_mv

@ -105,7 +105,7 @@ subroutine psb_d_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999

@ -172,7 +172,7 @@ subroutine psb_d_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
if (beta /= dzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmvhllDevice(a%deviceMat,alpha,xx%deviceVect,&
info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&

@ -98,16 +98,16 @@ subroutine psb_s_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@ -119,3 +119,94 @@ subroutine psb_s_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_s_cuda_elg_vect_mv
subroutine psb_s_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use elldev_mod
use psb_vectordev_mod
use psb_s_cuda_elg_mat_mod, psb_protect_name => psb_s_cuda_elg_multivect_mv
use psb_s_cuda_multivect_mod
implicit none
class(psb_s_cuda_elg_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
real(psb_spk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='s_cuda_elg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= szero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_s_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_s_multivect_cuda)
select type(yy => y)
type is (psb_s_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= szero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_s_cuda_elg_multivect_mv

@ -0,0 +1,123 @@
! Parallel Sparse BLAS GPU plugin
! (C) Copyright 2013
!
! Salvatore Filippone
! Alessandro Fanfarillo
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_s_cuda_hdiag_mat_mod, psb_protect_name => psb_s_cuda_hdiag_csmm
implicit none
class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta, x(:,:)
real(psb_spk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
character :: trans_
integer(psb_ipk_) :: i,j,k,m,n, nnz, ir, jc, nxy
real(psb_spk_), allocatable :: acc(:)
type(c_ptr) :: gpX, gpY
logical :: tra
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='s_cuda_hdiag_csmm'
logical, parameter :: debug=.false.
info = psb_success_
call psb_erractionsave(err_act)
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
m = a%get_ncols()
n = a%get_nrows()
else
n = a%get_ncols()
m = a%get_nrows()
end if
if (size(x,1)<n) then
info = 36
call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
goto 9999
end if
if (size(y,1)<m) then
info = 36
call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
goto 9999
end if
if (tra) then
if (a%is_dev()) call a%sync()
call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
else
!
! Just to test, move X/Y to/from the GPU.
!
nxy = min(size(x,2),size(y,2))
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmmHdiagDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)
endif
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_s_cuda_hdiag_csmm

@ -115,3 +115,94 @@ subroutine psb_s_cuda_hdiag_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_s_cuda_hdiag_vect_mv
subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_s_cuda_hdiag_mat_mod, psb_protect_name => psb_s_cuda_hdiag_multivect_mv
use psb_s_cuda_multivect_mod
implicit none
class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
real(psb_spk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='s_cuda_hdiag_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= szero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_s_multivect_cuda)
select type(yy => y)
type is (psb_s_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= szero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmHdiagDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHDIAGDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_s_cuda_hdiag_multivect_mv

@ -98,16 +98,16 @@ subroutine psb_s_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@ -117,3 +117,92 @@ subroutine psb_s_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_s_cuda_hlg_vect_mv
subroutine psb_s_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hlldev_mod
use psb_vectordev_mod
use psb_s_cuda_hlg_mat_mod, psb_protect_name => psb_s_cuda_hlg_multivect_mv
use psb_s_cuda_multivect_mod
implicit none
class(psb_s_cuda_hlg_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
real(psb_spk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='s_cuda_hlg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (.not.x%is_host()) call x%sync()
if (beta /= szero) then
if (.not.y%is_host()) call y%sync()
end if
if (a%is_dev()) call a%sync()
call a%psb_s_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_s_multivect_cuda)
select type(yy => y)
type is (psb_s_multivect_cuda)
if (xx%is_host()) call xx%sync()
if (beta /= szero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
rx = xx%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
rx = x%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_s_cuda_hlg_multivect_mv

@ -98,16 +98,16 @@ subroutine psb_z_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@ -119,3 +119,94 @@ subroutine psb_z_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_z_cuda_elg_vect_mv
subroutine psb_z_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use elldev_mod
use psb_vectordev_mod
use psb_z_cuda_elg_mat_mod, psb_protect_name => psb_z_cuda_elg_multivect_mv
use psb_z_cuda_multivect_mod
implicit none
class(psb_z_cuda_elg_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_dpk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='z_cuda_elg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= zzero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_z_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_z_multivect_cuda)
select type(yy => y)
type is (psb_z_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= zzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_z_cuda_elg_multivect_mv

@ -0,0 +1,123 @@
! Parallel Sparse BLAS GPU plugin
! (C) Copyright 2013
!
! Salvatore Filippone
! Alessandro Fanfarillo
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
subroutine psb_z_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_z_cuda_hdiag_mat_mod, psb_protect_name => psb_z_cuda_hdiag_csmm
implicit none
class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
complex(psb_dpk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
character :: trans_
integer(psb_ipk_) :: i,j,k,m,n, nnz, ir, jc, nxy
complex(psb_dpk_), allocatable :: acc(:)
type(c_ptr) :: gpX, gpY
logical :: tra
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='z_cuda_hdiag_csmm'
logical, parameter :: debug=.false.
info = psb_success_
call psb_erractionsave(err_act)
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
m = a%get_ncols()
n = a%get_nrows()
else
n = a%get_ncols()
m = a%get_nrows()
end if
if (size(x,1)<n) then
info = 36
call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
goto 9999
end if
if (size(y,1)<m) then
info = 36
call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
goto 9999
end if
if (tra) then
if (a%is_dev()) call a%sync()
call a%psb_z_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
else
!
! Just to test, move X/Y to/from the GPU.
!
nxy = min(size(x,2),size(y,2))
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmmHdiagDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)
endif
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_z_cuda_hdiag_csmm

@ -115,3 +115,94 @@ subroutine psb_z_cuda_hdiag_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_z_cuda_hdiag_vect_mv
subroutine psb_z_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_z_cuda_hdiag_mat_mod, psb_protect_name => psb_z_cuda_hdiag_multivect_mv
use psb_z_cuda_multivect_mod
implicit none
class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_dpk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='z_cuda_hdiag_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= zzero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_z_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_z_multivect_cuda)
select type(yy => y)
type is (psb_z_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= zzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmHdiagDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHDIAGDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_z_cuda_hdiag_multivect_mv

@ -98,16 +98,16 @@ subroutine psb_z_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@ -117,3 +117,92 @@ subroutine psb_z_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_z_cuda_hlg_vect_mv
subroutine psb_z_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hlldev_mod
use psb_vectordev_mod
use psb_z_cuda_hlg_mat_mod, psb_protect_name => psb_z_cuda_hlg_multivect_mv
use psb_z_cuda_multivect_mod
implicit none
class(psb_z_cuda_hlg_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_dpk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='z_cuda_hlg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (.not.x%is_host()) call x%sync()
if (beta /= zzero) then
if (.not.y%is_host()) call y%sync()
end if
if (a%is_dev()) call a%sync()
call a%psb_z_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_z_multivect_cuda)
select type(yy => y)
type is (psb_z_multivect_cuda)
if (xx%is_host()) call xx%sync()
if (beta /= zzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
rx = xx%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
rx = x%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_z_cuda_hlg_multivect_mv

@ -56,6 +56,7 @@ module psb_c_cuda_elg_mat_mod
procedure, nopass :: get_fmt => c_cuda_elg_get_fmt
procedure, pass(a) :: sizeof => c_cuda_elg_sizeof
procedure, pass(a) :: vect_mv => psb_c_cuda_elg_vect_mv
procedure, pass(a) :: multivect_mv => psb_c_cuda_elg_multivect_mv
procedure, pass(a) :: csmm => psb_c_cuda_elg_csmm
procedure, pass(a) :: csmv => psb_c_cuda_elg_csmv
procedure, pass(a) :: in_vect_sv => psb_c_cuda_elg_inner_vect_sv
@ -101,6 +102,15 @@ module psb_c_cuda_elg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_elg_vect_mv
subroutine psb_c_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_c_cuda_elg_sparse_mat, psb_spk_, psb_c_base_multivect_type, psb_ipk_
class(psb_c_cuda_elg_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_elg_multivect_mv
end interface
interface

@ -44,7 +44,8 @@ module psb_c_cuda_hdiag_mat_mod
procedure, nopass :: get_fmt => c_cuda_hdiag_get_fmt
! procedure, pass(a) :: sizeof => c_cuda_hdiag_sizeof
procedure, pass(a) :: vect_mv => psb_c_cuda_hdiag_vect_mv
! procedure, pass(a) :: csmm => psb_c_cuda_hdiag_csmm
procedure, pass(a) :: multivect_mv => psb_c_cuda_hdiag_multivect_mv
procedure, pass(a) :: csmm => psb_c_cuda_hdiag_csmm
procedure, pass(a) :: csmv => psb_c_cuda_hdiag_csmv
! procedure, pass(a) :: in_vect_sv => psb_c_cuda_hdiag_inner_vect_sv
! procedure, pass(a) :: scals => psb_c_cuda_hdiag_scals
@ -77,6 +78,15 @@ module psb_c_cuda_hdiag_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_hdiag_vect_mv
subroutine psb_c_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_c_cuda_hdiag_sparse_mat, psb_spk_, psb_c_base_multivect_type, psb_ipk_
class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_hdiag_multivect_mv
end interface
!!$ interface
@ -172,17 +182,17 @@ module psb_c_cuda_hdiag_mat_mod
end subroutine psb_c_cuda_hdiag_csmv
end interface
!!$ interface
!!$ subroutine psb_c_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
!!$ import :: psb_c_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_
!!$ class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a
!!$ complex(psb_spk_), intent(in) :: alpha, beta, x(:,:)
!!$ complex(psb_spk_), intent(inout) :: y(:,:)
!!$ integer(psb_ipk_), intent(out) :: info
!!$ character, optional, intent(in) :: trans
!!$ end subroutine psb_c_cuda_hdiag_csmm
!!$ end interface
!!$
interface
subroutine psb_c_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
import :: psb_c_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_
class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta, x(:,:)
complex(psb_spk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_hdiag_csmm
end interface
!!$ interface
!!$ subroutine psb_c_cuda_hdiag_scal(d,a,info, side)
!!$ import :: psb_c_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_

@ -55,6 +55,7 @@ module psb_c_cuda_hlg_mat_mod
contains
procedure, nopass :: get_fmt => c_cuda_hlg_get_fmt
procedure, pass(a) :: sizeof => c_cuda_hlg_sizeof
procedure, pass(a) :: multivect_mv => psb_c_cuda_hlg_multivect_mv
procedure, pass(a) :: vect_mv => psb_c_cuda_hlg_vect_mv
procedure, pass(a) :: csmm => psb_c_cuda_hlg_csmm
procedure, pass(a) :: csmv => psb_c_cuda_hlg_csmv
@ -97,6 +98,15 @@ module psb_c_cuda_hlg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_hlg_vect_mv
subroutine psb_c_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_c_cuda_hlg_sparse_mat, psb_spk_, psb_c_base_multivect_type, psb_ipk_
class(psb_c_cuda_hlg_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_hlg_multivect_mv
end interface
interface

@ -165,6 +165,11 @@ module psb_cuda_env_mod
use iso_c_binding
integer(c_int) :: res
end function psb_C_get_MaxRegistersPerBlock
function psb_C_get_SharedMemPerBlock() &
& result(res) bind(c,name='getGPUSharedMemsPerBlock')
use iso_c_binding
integer(c_int) :: res
end function psb_C_get_SharedMemPerBlock
end interface
interface
subroutine psb_C_cpy_NameString(cstring) &

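The new psb_C_get_SharedMemPerBlock interface binds to a C helper named getGPUSharedMemsPerBlock whose definition is not part of this hunk; a minimal sketch of such a helper, assuming only the standard CUDA runtime API, could read:

#include <cuda_runtime.h>

/* Hypothetical sketch of the C-side helper bound by psb_C_get_SharedMemPerBlock;
   the real definition lives in the plugin's environment code. Returns 0 on any
   runtime-API failure. */
int getGPUSharedMemsPerBlock(void)
{
  int dev = 0, shmem = 0;
  if (cudaGetDevice(&dev) != cudaSuccess) return 0;
  /* bytes of shared memory available to a single block on this device */
  if (cudaDeviceGetAttribute(&shmem, cudaDevAttrMaxSharedMemoryPerBlock,
                             dev) != cudaSuccess)
    return 0;
  return shmem;
}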
@ -44,7 +44,8 @@ module psb_d_cuda_hdiag_mat_mod
procedure, nopass :: get_fmt => d_cuda_hdiag_get_fmt
! procedure, pass(a) :: sizeof => d_cuda_hdiag_sizeof
procedure, pass(a) :: vect_mv => psb_d_cuda_hdiag_vect_mv
! procedure, pass(a) :: csmm => psb_d_cuda_hdiag_csmm
procedure, pass(a) :: multivect_mv => psb_d_cuda_hdiag_multivect_mv
procedure, pass(a) :: csmm => psb_d_cuda_hdiag_csmm
procedure, pass(a) :: csmv => psb_d_cuda_hdiag_csmv
! procedure, pass(a) :: in_vect_sv => psb_d_cuda_hdiag_inner_vect_sv
! procedure, pass(a) :: scals => psb_d_cuda_hdiag_scals
@ -77,6 +78,15 @@ module psb_d_cuda_hdiag_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_d_cuda_hdiag_vect_mv
subroutine psb_d_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_d_cuda_hdiag_sparse_mat, psb_dpk_, psb_d_base_multivect_type, psb_ipk_
class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_dpk_), intent(in) :: alpha, beta
class(psb_d_base_multivect_type), intent(inout) :: x
class(psb_d_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_d_cuda_hdiag_multivect_mv
end interface
!!$ interface
@ -172,17 +182,17 @@ module psb_d_cuda_hdiag_mat_mod
end subroutine psb_d_cuda_hdiag_csmv
end interface
!!$ interface
!!$ subroutine psb_d_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
!!$ import :: psb_d_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_
!!$ class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
!!$ real(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
!!$ real(psb_dpk_), intent(inout) :: y(:,:)
!!$ integer(psb_ipk_), intent(out) :: info
!!$ character, optional, intent(in) :: trans
!!$ end subroutine psb_d_cuda_hdiag_csmm
!!$ end interface
!!$
interface
subroutine psb_d_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
import :: psb_d_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_
class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
real(psb_dpk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_d_cuda_hdiag_csmm
end interface
!!$ interface
!!$ subroutine psb_d_cuda_hdiag_scal(d,a,info, side)
!!$ import :: psb_d_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_

@ -56,6 +56,7 @@ module psb_s_cuda_elg_mat_mod
procedure, nopass :: get_fmt => s_cuda_elg_get_fmt
procedure, pass(a) :: sizeof => s_cuda_elg_sizeof
procedure, pass(a) :: vect_mv => psb_s_cuda_elg_vect_mv
procedure, pass(a) :: multivect_mv => psb_s_cuda_elg_multivect_mv
procedure, pass(a) :: csmm => psb_s_cuda_elg_csmm
procedure, pass(a) :: csmv => psb_s_cuda_elg_csmv
procedure, pass(a) :: in_vect_sv => psb_s_cuda_elg_inner_vect_sv
@ -101,6 +102,15 @@ module psb_s_cuda_elg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_elg_vect_mv
subroutine psb_s_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_s_cuda_elg_sparse_mat, psb_spk_, psb_s_base_multivect_type, psb_ipk_
class(psb_s_cuda_elg_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_elg_multivect_mv
end interface
interface

@ -44,7 +44,8 @@ module psb_s_cuda_hdiag_mat_mod
procedure, nopass :: get_fmt => s_cuda_hdiag_get_fmt
! procedure, pass(a) :: sizeof => s_cuda_hdiag_sizeof
procedure, pass(a) :: vect_mv => psb_s_cuda_hdiag_vect_mv
! procedure, pass(a) :: csmm => psb_s_cuda_hdiag_csmm
procedure, pass(a) :: multivect_mv => psb_s_cuda_hdiag_multivect_mv
procedure, pass(a) :: csmm => psb_s_cuda_hdiag_csmm
procedure, pass(a) :: csmv => psb_s_cuda_hdiag_csmv
! procedure, pass(a) :: in_vect_sv => psb_s_cuda_hdiag_inner_vect_sv
! procedure, pass(a) :: scals => psb_s_cuda_hdiag_scals
@ -77,6 +78,15 @@ module psb_s_cuda_hdiag_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_hdiag_vect_mv
subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_s_cuda_hdiag_sparse_mat, psb_spk_, psb_s_base_multivect_type, psb_ipk_
class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_hdiag_multivect_mv
end interface
!!$ interface
@ -172,17 +182,17 @@ module psb_s_cuda_hdiag_mat_mod
end subroutine psb_s_cuda_hdiag_csmv
end interface
!!$ interface
!!$ subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
!!$ import :: psb_s_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_
!!$ class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
!!$ real(psb_spk_), intent(in) :: alpha, beta, x(:,:)
!!$ real(psb_spk_), intent(inout) :: y(:,:)
!!$ integer(psb_ipk_), intent(out) :: info
!!$ character, optional, intent(in) :: trans
!!$ end subroutine psb_s_cuda_hdiag_csmm
!!$ end interface
!!$
interface
subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
import :: psb_s_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_
class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta, x(:,:)
real(psb_spk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_hdiag_csmm
end interface
!!$ interface
!!$ subroutine psb_s_cuda_hdiag_scal(d,a,info, side)
!!$ import :: psb_s_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_

@ -56,6 +56,7 @@ module psb_s_cuda_hlg_mat_mod
procedure, nopass :: get_fmt => s_cuda_hlg_get_fmt
procedure, pass(a) :: sizeof => s_cuda_hlg_sizeof
procedure, pass(a) :: vect_mv => psb_s_cuda_hlg_vect_mv
procedure, pass(a) :: multivect_mv => psb_s_cuda_hlg_multivect_mv
procedure, pass(a) :: csmm => psb_s_cuda_hlg_csmm
procedure, pass(a) :: csmv => psb_s_cuda_hlg_csmv
procedure, pass(a) :: in_vect_sv => psb_s_cuda_hlg_inner_vect_sv
@ -97,6 +98,15 @@ module psb_s_cuda_hlg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_hlg_vect_mv
subroutine psb_s_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_s_cuda_hlg_sparse_mat, psb_spk_, psb_s_base_multivect_type, psb_ipk_
class(psb_s_cuda_hlg_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_hlg_multivect_mv
end interface
interface

@ -56,6 +56,7 @@ module psb_z_cuda_elg_mat_mod
procedure, nopass :: get_fmt => z_cuda_elg_get_fmt
procedure, pass(a) :: sizeof => z_cuda_elg_sizeof
procedure, pass(a) :: vect_mv => psb_z_cuda_elg_vect_mv
procedure, pass(a) :: multivect_mv => psb_z_cuda_elg_multivect_mv
procedure, pass(a) :: csmm => psb_z_cuda_elg_csmm
procedure, pass(a) :: csmv => psb_z_cuda_elg_csmv
procedure, pass(a) :: in_vect_sv => psb_z_cuda_elg_inner_vect_sv
@ -101,6 +102,15 @@ module psb_z_cuda_elg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_elg_vect_mv
subroutine psb_z_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_z_cuda_elg_sparse_mat, psb_dpk_, psb_z_base_multivect_type, psb_ipk_
class(psb_z_cuda_elg_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_elg_multivect_mv
end interface
interface

@ -44,7 +44,8 @@ module psb_z_cuda_hdiag_mat_mod
procedure, nopass :: get_fmt => z_cuda_hdiag_get_fmt
! procedure, pass(a) :: sizeof => z_cuda_hdiag_sizeof
procedure, pass(a) :: vect_mv => psb_z_cuda_hdiag_vect_mv
! procedure, pass(a) :: csmm => psb_z_cuda_hdiag_csmm
procedure, pass(a) :: multivect_mv => psb_z_cuda_hdiag_multivect_mv
procedure, pass(a) :: csmm => psb_z_cuda_hdiag_csmm
procedure, pass(a) :: csmv => psb_z_cuda_hdiag_csmv
! procedure, pass(a) :: in_vect_sv => psb_z_cuda_hdiag_inner_vect_sv
! procedure, pass(a) :: scals => psb_z_cuda_hdiag_scals
@ -77,6 +78,15 @@ module psb_z_cuda_hdiag_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_hdiag_vect_mv
subroutine psb_z_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_z_cuda_hdiag_sparse_mat, psb_dpk_, psb_z_base_multivect_type, psb_ipk_
class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_hdiag_multivect_mv
end interface
!!$ interface
@ -172,17 +182,17 @@ module psb_z_cuda_hdiag_mat_mod
end subroutine psb_z_cuda_hdiag_csmv
end interface
!!$ interface
!!$ subroutine psb_z_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
!!$ import :: psb_z_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_
!!$ class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a
!!$ complex(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
!!$ complex(psb_dpk_), intent(inout) :: y(:,:)
!!$ integer(psb_ipk_), intent(out) :: info
!!$ character, optional, intent(in) :: trans
!!$ end subroutine psb_z_cuda_hdiag_csmm
!!$ end interface
!!$
interface
subroutine psb_z_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
import :: psb_z_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_
class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
complex(psb_dpk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_hdiag_csmm
end interface
!!$ interface
!!$ subroutine psb_z_cuda_hdiag_scal(d,a,info, side)
!!$ import :: psb_z_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_

@ -56,6 +56,7 @@ module psb_z_cuda_hlg_mat_mod
procedure, nopass :: get_fmt => z_cuda_hlg_get_fmt
procedure, pass(a) :: sizeof => z_cuda_hlg_sizeof
procedure, pass(a) :: vect_mv => psb_z_cuda_hlg_vect_mv
procedure, pass(a) :: multivect_mv => psb_z_cuda_hlg_multivect_mv
procedure, pass(a) :: csmm => psb_z_cuda_hlg_csmm
procedure, pass(a) :: csmv => psb_z_cuda_hlg_csmv
procedure, pass(a) :: in_vect_sv => psb_z_cuda_hlg_inner_vect_sv
@ -97,6 +98,15 @@ module psb_z_cuda_hlg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_hlg_vect_mv
subroutine psb_z_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_z_cuda_hlg_sparse_mat, psb_dpk_, psb_z_base_multivect_type, psb_ipk_
class(psb_z_cuda_hlg_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_hlg_multivect_mv
end interface
interface

@ -70,6 +70,51 @@ void spgpuSellspmv (spgpuHandle_t handle,
float beta,
int baseIndex);
/**
* \fn void spgpuSellspmm (spgpuHandle_t handle,int count,__device float *z,int zpitch,const __device float *y,int ypitch,float alpha, const __device float* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device float *x,int xpitch,float beta,int baseIndex)
* Computes single precision z = alpha*A*x + beta*y on count right-hand-side columns, with A stored in ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The ELL non zero values allocation pointer
* \param rP The ELL column indices allocation pointer
* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
* \param maxNnzPerRow Maximum number of non zeroes per row.
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuSellspmm(spgpuHandle_t handle,
int count,
__device float *z,
int zpitch,
const __device float *y,
int ypitch,
float alpha,
const __device float* cM,
const __device int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const __device float *x,
int xpitch,
float beta,
int baseIndex);
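/*
 * A hedged usage sketch of spgpuSellspmm (illustrative only, not part of the
 * spgpu API): the d_* arguments, pitches and format metadata are placeholders
 * for device allocations the caller already owns. It computes z = A*x on
 * count right-hand-side columns with a Fortran-style base index.
 */
static inline void exampleSellspmm(spgpuHandle_t handle, int count,
                                   float *d_z, float *d_y, const float *d_x,
                                   int pitch, const float *d_cM,
                                   const int *d_rP, int cMPitch, int rPPitch,
                                   const int *d_rS, int maxNnzPerRow, int rows)
{
  spgpuSellspmm(handle, count, d_z, pitch, d_y, pitch, 1.0f,
                d_cM, d_rP, cMPitch, rPPitch, d_rS,
                NULL /* no reorder array */, 0 /* avg nnz/row unknown */,
                maxNnzPerRow, rows, d_x, pitch, 0.0f /* beta */, 1);
}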
/**
* \fn void spgpuDellspmv (spgpuHandle_t handle,__device double *z,const __device double *y, double alpha, const __device double* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device double *x, double beta,int baseIndex)
* Computes double precision z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
@ -107,6 +152,50 @@ void spgpuDellspmv (spgpuHandle_t handle,
double beta,
int baseIndex);
/**
* \fn void spgpuDellspmm (spgpuHandle_t handle,int count,__device double *z,int zpitch,const __device double *y,int ypitch,double alpha, const __device double* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device double *x,int xpitch,double beta,int baseIndex)
* Computes double precision z = alpha*A*x + beta*y on count right-hand-side columns, with A stored in ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The ELL non zero values allocation pointer
* \param rP The ELL column indices allocation pointer
* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
* \param maxNnzPerRow Maximum number of non zeroes per row.
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuDellspmm(spgpuHandle_t handle,
int count,
__device double *z,
int zpitch,
const __device double *y,
int ypitch,
double alpha,
const __device double* cM,
const __device int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const __device double *x,
int xpitch,
double beta,
int baseIndex);
/**
* \fn void spgpuCellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex)
@ -145,6 +234,51 @@ void spgpuCellspmv (spgpuHandle_t handle,
cuFloatComplex beta,
int baseIndex);
/**
* \fn void spgpuCellspmm (spgpuHandle_t handle,int count,__device cuFloatComplex *z,int zpitch,const __device cuFloatComplex *y,int ypitch,cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device cuFloatComplex *x,int xpitch,cuFloatComplex beta,int baseIndex)
* Computes single precision complex z = alpha*A*x + beta*y on count right-hand-side columns, with A stored in ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The ELL non zero values allocation pointer
* \param rP The ELL column indices allocation pointer
* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
* \param maxNnzPerRow Maximum number of non zeroes per row.
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuCellspmm(spgpuHandle_t handle,
int count,
__device cuFloatComplex *z,
int zpitch,
const __device cuFloatComplex *y,
int ypitch,
cuFloatComplex alpha,
const __device cuFloatComplex* cM,
const __device int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const __device cuFloatComplex *x,
int xpitch,
cuFloatComplex beta,
int baseIndex);
/**
* \fn void spgpuZellspmv (spgpuHandle_t handle,__device cuDoubleComplex *z,const __device cuDoubleComplex *y, cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device cuDoubleComplex *x, cuDoubleComplex beta, int baseIndex)
* Computes double precision complex z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
@ -182,6 +316,50 @@ void spgpuZellspmv (spgpuHandle_t handle,
cuDoubleComplex beta,
int baseIndex);
/**
* \fn void spgpuZellspmm (spgpuHandle_t handle,int count,__device cuDoubleComplex *z,int zpitch,const __device cuDoubleComplex *y,int ypitch,cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device cuDoubleComplex *x,int xpitch,cuDoubleComplex beta,int baseIndex)
* Computes double precision complex z = alpha*A*x + beta*y on count right-hand-side columns, with A stored in ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The ELL non zero values allocation pointer
* \param rP The ELL column indices allocation pointer
* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
* \param maxNnzPerRow Maximum number of non zeroes per row.
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuZellspmm(spgpuHandle_t handle,
int count,
__device cuDoubleComplex *z,
int zpitch,
const __device cuDoubleComplex *y,
int ypitch,
cuDoubleComplex alpha,
const __device cuDoubleComplex* cM,
const __device int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const __device cuDoubleComplex *x,
int xpitch,
cuDoubleComplex beta,
int baseIndex);
/**
* \fn void spgpuSellcsput (spgpuHandle_t handle, float alpha, __device float *cM, __device const int* rP, int cMPitch, int rPPitch, __device const int* rS, int nnz, __device int *aI, __device int *aJ, __device float *aVal, int baseIndex)

@ -27,6 +27,10 @@
extern "C" {
#endif
// DIA/HDIA Compressed Matrix Format routines
/// This is the pitch alignment that must be fulfilled by the coefficients and the row pointers allocations.
#define DIA_PITCH_ALIGN_BYTE 128
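/*
 * Illustrative helper, not part of the spgpu API: round a pitch given in
 * elements up so that the corresponding byte pitch is a multiple of
 * DIA_PITCH_ALIGN_BYTE. Assumes elemSize is a power of two that divides
 * the alignment (4, 8 or 16 bytes for the types used here).
 */
static inline int diaAlignedPitch(int nElems, int elemSize)
{
  int bytes = nElems * elemSize;
  int aligned = ((bytes + DIA_PITCH_ALIGN_BYTE - 1) / DIA_PITCH_ALIGN_BYTE)
                * DIA_PITCH_ALIGN_BYTE;
  return aligned / elemSize; /* pitch expressed in elements again */
}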
/**
* \fn spgpuShdiaspmv (spgpuHandle_t handle, float* z, const float *y, float alpha, const float* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const float *x, float beta)
@ -59,6 +63,44 @@ spgpuShdiaspmv (spgpuHandle_t handle,
float beta);
/**
* \fn spgpuShdiaspmm (spgpuHandle_t handle, int count, float *z, int zpitch, const float *y, int ypitch, float alpha, const __device float* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device float *x, int xpitch, float beta)
* Computes single precision z = alpha*A*x + beta*y on count right-hand-side columns, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offsets for every hack of the HDIA offsets vector, plus a final value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuShdiaspmm (spgpuHandle_t handle,
int count,
__device float *z,
int zpitch,
const __device float *y,
int ypitch,
float alpha,
const __device float* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device float *x,
int xpitch,
float beta);
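/* Worked example of the hackOffsets layout (illustrative figures): with
 * hackSize = 32 and three hacks holding 3, 5 and 2 diagonals each, the
 * offsets vector has 10 entries and hackOffsets = {0, 3, 8, 10}; entry h
 * gives where hack h starts in `offsets`, and the final entry equals the
 * size of `offsets`. A hedged host-side call sketch follows; all names
 * are hypothetical. */
#if 0
int pitch = rows;          /* leading dimension of the dense operands */
spgpuShdiaspmm(handle, nrhs,
               dz, pitch, dy, pitch, 1.0f,
               d_dM, d_offsets, hackSize, d_hackOffsets,
               rows, cols,
               dx, pitch, 0.0f);
#endif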
/**
* \fn spgpuDhdiaspmv (spgpuHandle_t handle, double* z, const double *y, double alpha, const double* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const double *x, double beta)
* Computes double precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
@ -89,6 +131,43 @@ spgpuDhdiaspmv (spgpuHandle_t handle,
const double *x,
double beta);
/**
* \fn spgpuDhdiaspmm (spgpuHandle_t handle, int count, double *z, int zpitch, const double *y, int ypitch, double alpha, const __device double* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device double *x, int xpitch, double beta)
* Computes double precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offsets for every hack of the HDIA offsets vector, plus a final value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuDhdiaspmm (spgpuHandle_t handle,
int count,
__device double *z,
int zpitch,
const __device double *y,
int ypitch,
double alpha,
const __device double* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device double *x,
int xpitch,
double beta);
/**
* \fn spgpuChdiaspmv (spgpuHandle_t handle, cuFloatComplex* z, const cuFloatComplex *y, cuFloatComplex alpha, const cuFloatComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuFloatComplex *x, cuFloatComplex beta)
@ -120,6 +199,43 @@ spgpuChdiaspmv (spgpuHandle_t handle,
const cuFloatComplex *x,
cuFloatComplex beta);
/**
* \fn spgpuChdiaspmm (spgpuHandle_t handle, int count, cuFloatComplex *z, int zpitch, const cuFloatComplex *y, int ypitch, cuFloatComplex alpha, const __device cuFloatComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuFloatComplex *x, int xpitch, cuFloatComplex beta)
* Computes single precision complex z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offsets for every hack of the HDIA offsets vector, plus a final value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuChdiaspmm (spgpuHandle_t handle,
int count,
__device cuFloatComplex *z,
int zpitch,
const __device cuFloatComplex *y,
int ypitch,
cuFloatComplex alpha,
const __device cuFloatComplex* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device cuFloatComplex *x,
int xpitch,
cuFloatComplex beta);
/**
* \fn spgpuZhdiaspmv (spgpuHandle_t handle, cuDoubleComplex* z, const cuDoubleComplex *y, cuDoubleComplex alpha, const cuDoubleComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuDoubleComplex *x, cuDoubleComplex beta)
@ -150,6 +266,45 @@ spgpuZhdiaspmv (spgpuHandle_t handle,
int cols,
const cuDoubleComplex *x,
cuDoubleComplex beta);
/**
* \fn spgpuZhdiaspmm (spgpuHandle_t handle, int count, cuDoubleComplex *z, int zpitch, const cuDoubleComplex *y, int ypitch, cuDoubleComplex alpha, const __device cuDoubleComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuDoubleComplex *x, int xpitch, cuDoubleComplex beta)
* Computes double precision complex z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offsets for every hack of the HDIA offsets vector, plus a final value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuZhdiaspmm (spgpuHandle_t handle,
int count,
__device cuDoubleComplex *z,
int zpitch,
const __device cuDoubleComplex *y,
int ypitch,
cuDoubleComplex alpha,
const __device cuDoubleComplex* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device cuDoubleComplex *x,
int xpitch,
cuDoubleComplex beta);
/** @}*/

@ -68,7 +68,46 @@ void spgpuShellspmv (spgpuHandle_t handle,
float beta,
int baseIndex);
/**
* \fn void spgpuShellspmm (spgpuHandle_t handle,int count,__device float *z,int zpitch,const __device float *y,int ypitch,float alpha, const __device float* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device float *x,int xpitch,float beta,int baseIndex)
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The HELL non zero values allocation pointer
* \param rP The HELL column indices allocation pointer
* \param hackSize The constant size of every hack (must be a multiple of 32).
* \param hackOffsets the array of base index offsets for every hack into the HELL non zero values and column indices allocations.
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the HELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuShellspmm(spgpuHandle_t handle,
int count,
__device float *z,
int zpitch,
const __device float *y,
int ypitch,
float alpha,
const __device float* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device float *x,
int xpitch,
float beta,
int baseIndex);
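/* Hedged sketch of a HELL multi-RHS call, mirroring the ELL one earlier:
 * d_hackOffsets points each hack into the cM/rP allocations, and the
 * dense operands again use a common column pitch. All names below are
 * hypothetical. */
#if 0
int pitch = rows;
spgpuShellspmm(handle, nrhs,
               dz, pitch, dy, pitch, 1.0f,
               d_cM, d_rP, hackSize, d_hackOffsets,
               d_rS, NULL /* no reorder array */, rows,
               dx, pitch, 0.0f,
               1 /* Fortran-style base index */);
#endif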
/**
* \fn void spgpuDhellspmv (spgpuHandle_t handle,__device double *z,const __device double *y, double alpha, const __device double* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int avgNnzPerRow, int rows, const __device double *x, double beta,int baseIndex)
@ -105,6 +144,46 @@ void spgpuDhellspmv (spgpuHandle_t handle,
double beta,
int baseIndex);
/**
* \fn void spgpuDhellspmm (spgpuHandle_t handle,int count,__device double *z,int zpitch,const __device double *y,int ypitch,double alpha, const __device double* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device double *x,int xpitch,double beta,int baseIndex)
* Computes double precision z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The HELL non zero values allocation pointer
* \param rP The HELL column indices allocation pointer
* \param hackSize The constant size of every hack (must be a multiple of 32).
* \param hackOffsets the array of base index offsets for every hack into the HELL non zero values and column indices allocations.
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the HELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuDhellspmm(spgpuHandle_t handle,
int count,
__device double *z,
int zpitch,
const __device double *y,
int ypitch,
double alpha,
const __device double* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device double *x,
int xpitch,
double beta,
int baseIndex);
/**
* \fn void spgpuChellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex)
@ -141,7 +220,46 @@ void spgpuChellspmv (spgpuHandle_t handle,
cuFloatComplex beta,
int baseIndex);
/**
* \fn void spgpuChellspmm (spgpuHandle_t handle,int count,__device cuFloatComplex *z,int zpitch,const __device cuFloatComplex *y,int ypitch,cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device cuFloatComplex *x,int xpitch,cuFloatComplex beta,int baseIndex)
* Computes single precision complex z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The HELL non zero values allocation pointer
* \param rP The HELL column indices allocation pointer
* \param hackSize The constant size of every hack (must be a multiple of 32).
* \param hackOffsets the array of base index offsets for every hack into the HELL non zero values and column indices allocations.
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the HELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuChellspmm(spgpuHandle_t handle,
int count,
__device cuFloatComplex *z,
int zpitch,
const __device cuFloatComplex *y,
int ypitch,
cuFloatComplex alpha,
const __device cuFloatComplex* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device cuFloatComplex *x,
int xpitch,
cuFloatComplex beta,
int baseIndex);
/**
* \fn void spgpuZhellspmv (spgpuHandle_t handle,__device cuDoubleComplex *z,const __device cuDoubleComplex *y, cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuDoubleComplex *x, cuDoubleComplex beta, int baseIndex)
@ -178,6 +296,46 @@ void spgpuZhellspmv (spgpuHandle_t handle,
cuDoubleComplex beta,
int baseIndex);
/**
* \fn void spgpuZhellspmm (spgpuHandle_t handle,int count,__device cuDoubleComplex *z,int zpitch,const __device cuDoubleComplex *y,int ypitch,cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device cuDoubleComplex *x,int xpitch,cuDoubleComplex beta,int baseIndex)
* Computes double precision complex z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The HELL non zero values allocation pointer
* \param rP The HELL column indices allocation pointer
* \param hackSize The constant size of every hack (must be a multiple of 32).
* \param hackOffsets the array of base index offsets for every hack into the HELL non zero values and column indices allocations.
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the HELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuZhellspmm(spgpuHandle_t handle,
int count,
__device cuDoubleComplex *z,
int zpitch,
const __device cuDoubleComplex *y,
int ypitch,
cuDoubleComplex alpha,
const __device cuDoubleComplex* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device cuDoubleComplex *x,
int xpitch,
cuDoubleComplex beta,
int baseIndex);
/** @}*/

@ -14,10 +14,14 @@ OBJS=cabs.o camax.o casum.o caxpby.o caxy.o cdot.o cgath.o \
cnrm2.o cscal.o cscat.o csetscal.o cabgdxyz.o\
dabs.o damax.o dasum.o daxpby.o daxy.o ddot.o dgath.o dabgdxyz.o\
dia_cspmv.o dia_dspmv.o dia_sspmv.o dia_zspmv.o dnrm2.o \
dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o \
ell_dcsput.o ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \
hdia_cspmv.o hdia_dspmv.o hdia_sspmv.o hdia_zspmv.o hell_cspmv.o hell_dspmv.o \
hell_sspmv.o hell_zspmv.o igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o ell_dcsput.o \
ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \
ell_cspmm.o ell_dspmm.o ell_sspmm.o ell_zspmm.o \
hdia_cspmv.o hdia_cspmm.o hdia_dspmv.o hdia_dspmm.o \
hdia_sspmv.o hdia_sspmm.o hdia_zspmv.o hdia_zspmm.o \
hell_cspmv.o hell_dspmv.o hell_sspmv.o hell_zspmv.o \
hell_cspmm.o hell_dspmm.o hell_sspmm.o hell_zspmm.o \
igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o sabgdxyz.o\
zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o zabgdxyz.o \
sxyzw.o cxyzw.o dxyzw.o zxyzw.o

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "ell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuFloatComplex
#define TYPE_SYMBOL C
#define TEX_FETCH_TYPE cuFloatComplex
#include "ell_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include <stdio.h>
extern "C"
{
#include "core.h"
#include "ell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE double
#define TYPE_SYMBOL D
#define TEX_FETCH_TYPE int2
#include "ell_spmm_base.cuh"

@ -0,0 +1,276 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)
#undef GEN_SPGPU_ELL_NAME
#undef X_TEX
#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
__device__ __host__ static float zero_float() { return 0.0f; }
__device__ __host__ static cuFloatComplex zero_cuFloatComplex() { return make_cuFloatComplex(0.0, 0.0); }
__device__ __host__ static bool float_isNotZero(float x) { return x != 0.0f; }
__device__ static float float_fma(float a, float b, float c) { return PREC_FADD(PREC_FMUL (a, b), c); }
__device__ static float float_add(float a, float b) { return PREC_FADD (a, b); }
__device__ static float float_mul(float a, float b) { return PREC_FMUL (a, b); }
__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c) { return cuCfmaf(a, b, c); }
__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b) { return cuCaddf(a, b); }
__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b) { return cuCmulf(a, b); }
__device__ static float readValue_float(float fetch) { return fetch; }
__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch) { return fetch; }
// host or c.c >= 1.3
#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)
__device__ __host__ static double zero_double() { return 0.0; }
__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex() { return make_cuDoubleComplex(0.0, 0.0); }
__device__ __host__ static bool double_isNotZero(double x) { return x != 0.0; }
__device__ static double double_fma(double a, double b, double c) { return PREC_DADD(PREC_DMUL (a, b), c); }
__device__ static double double_add(double a, double b) { return PREC_DADD (a, b); }
__device__ static double double_mul(double a, double b) { return PREC_DMUL (a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c) { return cuCfma(a, b, c); }
__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a, b); }
__device__ static double readValue_double(int2 fetch) { return __hiloint2double (fetch.y, fetch.x); }
__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch)
{
cuDoubleComplex c;
c.x = __hiloint2double (fetch.y, fetch.x);
c.y = __hiloint2double (fetch.w, fetch.z);
return c;
}
#endif
#if 0
// Texture cache management
texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
__device__ static VALUE_TYPE
fetchTex (int pointer)
{
TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
return CONCAT(readValue_,VALUE_TYPE) (fetch);
}
#endif
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_vanilla)
#define GEN_SPGPU_ELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),ellspmm_vanilla)
#include "ell_spmm_base_template.cuh"
#if 0
#undef GEN_SPGPU_ELL_NAME
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_prefetch)
#define GEN_SPGPU_ELL_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),ellspmm_prefetch)
#undef USE_PREFETCHING
#define USE_PREFETCHING
#include "ell_spmm_base_template.cuh"
#define ENABLE_CACHE
#undef GEN_SPGPU_ELL_NAME
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache_prefetch)
#define GEN_SPGPU_ELL_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache_prefetch)
#include "ell_spmm_base_template.cuh"
#undef GEN_SPGPU_ELL_NAME
#undef USE_PREFETCHING
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache)
#define GEN_SPGPU_ELL_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache)
#include "ell_spmm_base_template.cuh"
#endif
#undef GEN_SPGPU_ELL_NAME
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm)
#if 0
void
GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt, c1;
dim3 block (THREAD_BLOCK, 1);
// dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
int numMp=getGPUMultiProcessors();
int maxThMp=getGPUMaxThreadsPerMP();
int nmblksMp=maxThMp/THREAD_BLOCK;
int nmblk=nmblksMp*numMp;
dim3 grid (nmblk);
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
if (shrMemSize > maxShmemSz) {
fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz);
return;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > 2*MMBSZ) {
CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch, py, yPitch,
alpha, cM, rP, cMPitch, rPPitch,
rS, rows, px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt > MMBSZ) {
c1 = cnt/2;
CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch, py, yPitch,
alpha, cM, rP, cMPitch, rPPitch,
rS, rows, px, xPitch, beta, baseIndex);
cnt -= c1;
}
if (cnt > MMBSZ) {
fprintf(stderr,"Invalid residual count %d\n",cnt);
} else if (cnt > 0){
CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch, py, yPitch,
alpha, cM, rP, cMPitch, rPPitch,
rS, rows, px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on ell_spmm");
}
#endif
void
GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py,*pz;
int cnt;
int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
int maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
//fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);
while (rows > maxNForACall) {//managing large vectors
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
CONCAT(_,GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, rP,
cMPitch, rPPitch,
rS, rIdx, avgNnzPerRow,
maxNnzPerRow, maxNForACall,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
cMPitch, rPPitch,
rS, rIdx, avgNnzPerRow,
maxNnzPerRow, maxNForACall,
px, xPitch, beta, baseIndex);
}
y = y + maxNForACall;
z = z + maxNForACall;
cM = cM + maxNForACall;
rP = rP + maxNForACall;
rS = rS + maxNForACall;
rows -= maxNForACall;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
CONCAT(_,GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, rP,
cMPitch, rPPitch,
rS, rIdx, avgNnzPerRow,
maxNnzPerRow, rows,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
cMPitch, rPPitch,
rS, rIdx, avgNnzPerRow,
maxNnzPerRow, rows,
px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on ell_spmm");
}
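/* The wrapper above processes the `count` right-hand sides in panels of
   MMBSZ columns: e.g. (illustrative figures) count = 20 with MMBSZ = 8
   yields panels of 8, 8 and 4 columns, each handled by one kernel launch
   and advancing x, y and z by pitch*MMBSZ elements between panels. */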

@ -0,0 +1,169 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define IDX2
#define THREAD_BLOCK 128
#define MMBSZ 8
#if 0
__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int cMPitch, int rPPitch, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int gridSize = gridDim.x * blockDim.x;
while (i < rows) {
rS += i; rP += i; cM += i;
		int rowSize = rS[0];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rP[0] - baseIndex;
rP += rPPitch;
value = cM[0];
cM += cMPitch;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
} else {
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
i += gridSize;
}
}
#endif
__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int cMPitch, int rPPitch, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
if (i < rows) {
rS += i; rP += i; cM += i;
int rowSize = rS[0];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rP[0] - baseIndex;
rP += rPPitch;
value = cM[0];
cM += cMPitch;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
} else {
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
}
}
void
CONCAT(_,GEN_SPGPU_ELL_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch,
const int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows,
const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
{
dim3 block (THREAD_BLOCK, 1);
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch, y, yPitch,
alpha, cM, rP, cMPitch, rPPitch, rS, rows,
x, xPitch, beta, baseIndex);
}

@ -0,0 +1,34 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
extern "C"
{
#include "core.h"
#include "ell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE float
#define TYPE_SYMBOL S
#define TEX_FETCH_TYPE float
#include "ell_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "ell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuDoubleComplex
#define TYPE_SYMBOL Z
#define TEX_FETCH_TYPE int4
#include "ell_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "hdia.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuFloatComplex
#define TYPE_SYMBOL C
#define TEX_FETCH_TYPE cuFloatComplex
#include "hdia_spmm_base.cuh"

@ -0,0 +1,36 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
extern "C"
{
#include "core.h"
#include "hdia.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
//#define ENABLE_CACHE
#define VALUE_TYPE double
#define TYPE_SYMBOL D
//#define TEX_FETCH_TYPE int2
#include "hdia_spmm_base.cuh"

@ -0,0 +1,188 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)
#undef GEN_SPGPU_HDIA_NAME
#undef X_TEX
#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
__device__ __host__ static float zero_float() { return 0.0f; }
__device__ __host__ static cuFloatComplex zero_cuFloatComplex() { return make_cuFloatComplex(0.0, 0.0); }
__device__ __host__ static bool float_isNotZero(float x) { return x != 0.0f; }
__device__ static float float_fma(float a, float b, float c) { return PREC_FADD(PREC_FMUL (a, b), c); }
__device__ static float float_add(float a, float b) { return PREC_FADD (a, b); }
__device__ static float float_mul(float a, float b) { return PREC_FMUL (a, b); }
__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c) { return cuCfmaf(a, b, c); }
__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b) { return cuCaddf(a, b); }
__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b) { return cuCmulf(a, b); }
__device__ static float readValue_float(float fetch) { return fetch; }
__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch) { return fetch; }
// host or c.c >= 1.3
#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)
__device__ __host__ static double zero_double() { return 0.0; }
__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex() { return make_cuDoubleComplex(0.0, 0.0); }
__device__ __host__ static bool double_isNotZero(double x) { return x != 0.0; }
__device__ static double double_fma(double a, double b, double c) { return PREC_DADD(PREC_DMUL (a, b), c); }
__device__ static double double_add(double a, double b) { return PREC_DADD (a, b); }
__device__ static double double_mul(double a, double b) { return PREC_DMUL (a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c) { return cuCfma(a, b, c); }
__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a, b); }
__device__ static double readValue_double(int2 fetch) { return __hiloint2double (fetch.y, fetch.x); }
__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch)
{
cuDoubleComplex c;
c.x = __hiloint2double (fetch.y, fetch.x);
c.y = __hiloint2double (fetch.w, fetch.z);
return c;
}
#endif
#if 0
// Texture cache management
texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
__device__ static VALUE_TYPE
fetchTex (int pointer)
{
TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
return CONCAT(readValue_,VALUE_TYPE) (fetch);
}
#endif
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla)
#define GEN_SPGPU_HDIA_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla)
#include "hdia_spmm_base_template.cuh"
#if 0
#undef GEN_SPGPU_HDIA_NAME
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch)
#define GEN_SPGPU_HDIA_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch)
#undef USE_PREFETCHING
#define USE_PREFETCHING
#include "hdia_spmm_base_template.cuh"
#define ENABLE_CACHE
#undef GEN_SPGPU_HDIA_NAME
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch)
#define GEN_SPGPU_HDIA_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch)
#include "hdia_spmm_base_template.cuh"
#undef GEN_SPGPU_HDIA_NAME
#undef USE_PREFETCHING
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache)
#define GEN_SPGPU_HDIA_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache)
#include "hdia_spmm_base_template.cuh"
#endif
#undef GEN_SPGPU_HDIA_NAME
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm)
void
GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* hdiaOffsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta)
{
VALUE_TYPE *px,*py, *pz;
int cnt;
int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
// maxNForACall should be a multiple of hackSize
maxNForACall = (maxNForACall/hackSize)*hackSize;
int maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
//fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);
while (rows > maxNForACall) {//managing large vectors
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
maxNForACall, cols,
px, xPitch, beta);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
maxNForACall, cols,
px, xPitch, beta);
}
y = y + maxNForACall;
z = z + maxNForACall;
hackOffsets = hackOffsets + maxNForACall/hackSize;
rows -= maxNForACall;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
rows, cols,
px, xPitch, beta);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
rows, cols,
px, xPitch, beta);
}
cudaCheckError("CUDA error on hdiag_spmm");
}

@ -0,0 +1,141 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define THREAD_BLOCK 128
#define MMBSZ 8
__global__ void
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets,
int hackSize, const int* hackOffsets, int rows, int cols,
const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int hackCount = (rows + hackSize - 1)/hackSize;
int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
// shared between offsetsChunks and warpHackOffsetTemp
extern __shared__ int dynShrMem[];
int hackOffset = 0;
int nextOffset = 0;
unsigned int laneId = threadIdx.x % warpSize;
unsigned int warpId = threadIdx.x / warpSize;
if (laneId == 0 && i < rows) {
hackOffset = hackOffsets[hackId];
nextOffset = hackOffsets[hackId+1];
}
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0);
nextOffset = __shfl_sync(0xFFFFFFFF,nextOffset, 0);
if (hackId >= hackCount)
return;
cM += hackOffset*hackSize + hackLaneId;
hdiaOffsets += hackOffset;
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
// diags for this hack is next hackOffset minus current hackOffset
int diags = nextOffset - hackOffset;
// Warp oriented
int rounds = (diags + warpSize - 1)/warpSize;
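	// e.g. (illustrative figures) a hack holding 70 diagonals is swept
	// in rounds of 32, 32 and 6 offsets when warpSize is 32.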
volatile int *offsetsChunk = dynShrMem + warpId*warpSize;
for (int r = 0; r < rounds; r++) {
// in the last round diags will be <= warpSize
if (laneId < diags)
offsetsChunk[laneId] = hdiaOffsets[laneId];
if (i < rows) {
int dCount = min(diags, warpSize);
for (int j = 0; j < dCount; ++j) {
int column = offsetsChunk[j] + i;
if(column >= 0 && column < cols) {
px = (VALUE_TYPE *) x;
for (int k = 0; k < count; k++) {
VALUE_TYPE xValue = px[column];
VALUE_TYPE mValue = cM[0];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
cM += hackSize;
}
}
diags -= warpSize;
hdiaOffsets += warpSize;
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
if (i >= rows)
return;
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
else
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
void
CONCAT(_,GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets, int hackSize,
const __device int* hackOffsets, int rows, int cols, const VALUE_TYPE *x,
int xPitch, VALUE_TYPE beta)
{
dim3 block (THREAD_BLOCK, 1);
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
maxShmemSz = getGPUSharedMemPerBlock();
shrMemSize = MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch, y, yPitch,
alpha, cM, hdiaOffsets, hackSize, hackOffsets, rows, cols,
x, xPitch, beta);
}

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
extern "C"
{
#include "core.h"
#include "hdia.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE float
#define TYPE_SYMBOL S
#define TEX_FETCH_TYPE float
#include "hdia_spmm_base.cuh"

@ -0,0 +1,36 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "hdia.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuDoubleComplex
#define TYPE_SYMBOL Z
#define TEX_FETCH_TYPE int4
#include "hdia_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuFloatComplex
#define TYPE_SYMBOL C
#define TEX_FETCH_TYPE cuFloatComplex
#include "hell_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include <stdio.h>
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE double
#define TYPE_SYMBOL D
#define TEX_FETCH_TYPE int2
#include "hell_spmm_base.cuh"

@ -16,11 +16,14 @@
#include "cudadebug.h"
#include "cudalang.h"
#include <stdio.h>
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
@ -29,4 +32,3 @@ extern "C"
#define TYPE_SYMBOL D
#define TEX_FETCH_TYPE int2
#include "hell_spmv_base.cuh"

@ -0,0 +1,274 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)
#undef GEN_SPGPU_HELL_NAME
#undef X_TEX
#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
__device__ __host__ static float zero_float() { return 0.0f; }
__device__ __host__ static cuFloatComplex zero_cuFloatComplex() { return make_cuFloatComplex(0.0, 0.0); }
__device__ __host__ static bool float_isNotZero(float x) { return x != 0.0f; }
__device__ static float float_fma(float a, float b, float c) { return PREC_FADD(PREC_FMUL (a, b), c); }
__device__ static float float_add(float a, float b) { return PREC_FADD (a, b); }
__device__ static float float_mul(float a, float b) { return PREC_FMUL (a, b); }
__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c) { return cuCfmaf(a, b, c); }
__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b) { return cuCaddf(a, b); }
__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b) { return cuCmulf(a, b); }
__device__ static float readValue_float(float fetch) { return fetch; }
__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch) { return fetch; }
// host or c.c >= 1.3
#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)
__device__ __host__ static double zero_double() { return 0.0; }
__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex() { return make_cuDoubleComplex(0.0, 0.0); }
__device__ __host__ static bool double_isNotZero(double x) { return x != 0.0; }
__device__ static double double_fma(double a, double b, double c) { return PREC_DADD(PREC_DMUL (a, b), c); }
__device__ static double double_add(double a, double b) { return PREC_DADD (a, b); }
__device__ static double double_mul(double a, double b) { return PREC_DMUL (a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c) { return cuCfma(a, b, c); }
__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a, b); }
__device__ static double readValue_double(int2 fetch) { return __hiloint2double (fetch.y, fetch.x); }
__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch)
{
cuDoubleComplex c;
c.x = __hiloint2double (fetch.y, fetch.x);
c.y = __hiloint2double (fetch.w, fetch.z);
return c;
}
#endif
#if 0
// Texture cache management
texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
__device__ static VALUE_TYPE
fetchTex (int pointer)
{
TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
return CONCAT(readValue_,VALUE_TYPE) (fetch);
}
#endif
#if __CUDA_ARCH__ < 300
extern __shared__ int dynShrMem[];
#endif
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
#include "hell_spmm_base_template.cuh"
#undef GEN_SPGPU_HELL_NAME
#if 0
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch)
#define GEN_SPGPU_HELL_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch)
#undef USE_PREFETCHING
#define USE_PREFETCHING
#include "hell_spmm_base_template.cuh"
#define ENABLE_CACHE
#undef GEN_SPGPU_HELL_NAME
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch)
#define GEN_SPGPU_HELL_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch)
#include "hell_spmm_base_template.cuh"
#undef GEN_SPGPU_HELL_NAME
#undef USE_PREFETCHING
#endif
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache)
#define GEN_SPGPU_HELL_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache)
#include "hell_spmm_base_template.cuh"
#undef GEN_SPGPU_HELL_NAME
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm)
#if 0
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt, c1;
dim3 block (THREAD_BLOCK, 1);
// dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
int numMp=getGPUMultiProcessors();
int maxThMp=getGPUMaxThreadsPerMP();
int nmblksMp=maxThMp/THREAD_BLOCK;
int nmblk=nmblksMp*numMp;
dim3 grid (nmblk);
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
if (shrMemSize > maxShmemSz) {
fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz);
return;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > 2*MMBSZ) {
CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rows, px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt > MMBSZ) {
c1 = cnt/2;
CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rows, px, xPitch, beta, baseIndex);
cnt -= c1;
}
if (cnt > MMBSZ) {
fprintf(stderr,"Invalid residual count %d\n",cnt);
} else if (cnt > 0){
CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rows, px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on hell_spmm");
}
#endif
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt;
int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
// maxNForACall should be a multiple of hackSize
maxNForACall = (maxNForACall/hackSize)*hackSize;
int maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
//fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);
while (rows > maxNForACall) {//managing large vectors
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
maxNForACall,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt > 0) {
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
maxNForACall,
px, xPitch, beta, baseIndex);
}
y = y + maxNForACall;
z = z + maxNForACall;
hackOffsets = hackOffsets + maxNForACall/hackSize;
rS = rS + maxNForACall;
rows -= maxNForACall;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rIdx, rows,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt > 0) {
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
rows,
px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on hell_spmm");
}
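/* The routine above tiles the product twice: rows are swept in chunks of
   maxNForACall (rounded down to a multiple of hackSize so every chunk starts
   on a hack boundary), and within each chunk the right-hand sides are
   processed MMBSZ at a time to fit the kernel's shared-memory buffer; e.g.
   with count = 20 and MMBSZ = 8 the kernel is launched on 8, 8 and 4
   right-hand sides in turn. */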

@ -0,0 +1,194 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define IDX2
#define THREAD_BLOCK 128
#define MMBSZ 8
#if 0
__global__ void
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int hackSize, const int* hackOffsets, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int *rrP;
VALUE_TYPE *rcM;
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int gridSize = gridDim.x * blockDim.x;
while (i < rows) {
int j;
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
int hackOffset;
unsigned int laneId = threadIdx.x % 32;
if (laneId == 0)
hackOffset = hackOffsets[hackId];
//__syncthreads();
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;
rrP = (int *) rP + hackOffset;
rcM = (VALUE_TYPE *) cM + hackOffset;
int rowSize = rS[i];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rrP[0] - baseIndex;
rrP += hackSize;
value = rcM[0];
rcM += hackSize;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
} else {
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
i += gridSize;
}
}
#endif
__global__ void
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int hackSize, const int* hackOffsets, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
if (i < rows) {
int j;
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
int hackOffset;
unsigned int laneId = threadIdx.x % 32;
if (laneId == 0)
hackOffset = hackOffsets[hackId];
//__syncthreads();
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;
rP += hackOffset;
cM += hackOffset;
int rowSize = rS[i];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rP[0] - baseIndex;
rP += hackSize;
value = cM[0];
cM += hackSize;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
else
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
}
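/* For reference, a hypothetical host-side sketch (not part of the library) of
   the hacked-ELLPACK layout walked by the kernel above, for one right-hand
   side and without the alpha/beta scaling: row i lives in hack i/hackSize at
   lane i%hackSize, and its j-th entry sits hackSize elements apart in both rP
   and cM. */
#if 0
static void hellReferenceSpmv(int rows, int hackSize, const int *hackOffsets,
                              const int *rS, const int *rP, const double *cM,
                              const double *x, double *y, int baseIndex)
{
  for (int i = 0; i < rows; i++) {
    int off = hackOffsets[i / hackSize] + i % hackSize;
    double t = 0.0;
    for (int j = 0; j < rS[i]; j++) {
      t += cM[off] * x[rP[off] - baseIndex]; /* j-th coefficient of row i */
      off += hackSize;                       /* columns are hackSize apart */
    }
    y[i] = t;
  }
}
#endif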
void
CONCAT(_,GEN_SPGPU_HELL_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int hackSize, const int* hackOffsets,
const int* rS, const __device int* rIdx, int rows,
const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
{
dim3 block (THREAD_BLOCK, 1);
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
if (shrMemSize > maxShmemSz) {
  fprintf(stderr,"Fatal error: SHMEM size too large %d %d\n",shrMemSize,maxShmemSz);
  return;
}
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch,y, yPitch,
alpha, cM, rP, hackSize, hackOffsets, rS, rows,
x, xPitch, beta, baseIndex);
}

@ -14,7 +14,6 @@
* GNU General Public License for more details.
*/
#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)
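/* Two-stage expansion example: with VALUE_TYPE defined as double,
   CONCAT(zero_, VALUE_TYPE)() expands to zero_double(); the PRE_CONCAT
   indirection is what lets VALUE_TYPE be macro-expanded before the tokens
   are pasted with ##. */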
@ -156,4 +155,3 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
cudaCheckError("CUDA error on hell_spmv");
}

@ -0,0 +1,34 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE float
#define TYPE_SYMBOL S
#define TEX_FETCH_TYPE float
#include "hell_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuDoubleComplex
#define TYPE_SYMBOL Z
#define TEX_FETCH_TYPE int4
#include "hell_spmm_base.cuh"

@ -38,8 +38,10 @@
#include "core.h"
//new
MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count, unsigned int size,
unsigned int elementType)
MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count,
unsigned int size,
unsigned int elementType,
unsigned int storage2d)
{
struct MultiVectorDeviceParams params;
@ -75,6 +77,7 @@ MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count, unsigned
params.count = count;
params.size = size;
params.storage2d = storage2d;
return params;
@ -87,8 +90,9 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams *
struct MultiVectDevice *tmp = (struct MultiVectDevice *)malloc(sizeof(struct MultiVectDevice));
*remoteMultiVec = (void *)tmp;
tmp->size_ = params->size;
tmp->count_ = params->count;
tmp->size_ = params->size;
tmp->count_ = params->count;
tmp->storage2d_ = params->storage2d;
if (params->elementType == SPGPU_TYPE_INT)
{
@ -163,9 +167,9 @@ int FallocMultiVecDevice(void** deviceMultiVec, unsigned int count,
{ int i;
struct MultiVectorDeviceParams p;
p = getMultiVectorDeviceParams(count, size, elementType);
p = getMultiVectorDeviceParams(count, size, elementType, PSB_VECT_STOR_BY_COLS);
i = allocMultiVecDevice(deviceMultiVec, &p);
//fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size);
//fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.count);
//cudaSync();
if (i != 0) {
fprintf(stderr,"From routine : %s : %d, %d %d \n","FallocMultiVecDevice",i, count, size);
@ -194,3 +198,9 @@ int getMultiVecDevicePitch(void* deviceVec)
return(i);
}
int getMultiVecDeviceStorage2d(void* deviceVec)
{ int i;
struct MultiVectDevice *dev = (struct MultiVectDevice *) deviceVec;
i = dev->storage2d_;
return(i);
}

@ -37,8 +37,14 @@
#include "cintrf.h"
#include <complex.h>
#define PSB_VECT_STOR_BY_COLS 0
#define PSB_VECT_STOR_BY_ROWS 1
struct MultiVectDevice
{
// storage of 2d data
int storage2d_;
// number of vectors
int count_;
@ -54,17 +60,19 @@ struct MultiVectDevice
typedef struct MultiVectorDeviceParams
{
// number of vectors
unsigned int count; //1 for a simple vector
// The resulting allocation will be pitch*count*(size of the elementType)
unsigned int elementType;
// Pitch (in number of elements)
unsigned int pitch;
// Size of a single vector (in number of elements).
unsigned int size;
// storage of 2d data
unsigned int storage2d;
// number of vectors
unsigned int count; //1 for a simple vector
// The resulting allocation will be pitch*count*(size of the elementType)
unsigned int elementType;
// Pitch (in number of elements)
unsigned int pitch;
// Size of a single vector (in number of elements).
unsigned int size;
} MultiVectorDeviceParams;
@ -76,7 +84,8 @@ int unregisterMapped(void *);
MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count,
unsigned int size,
unsigned int elementType);
unsigned int elementType,
unsigned int storage2d);
int FallocMultiVecDevice(void** deviceMultiVec, unsigned count,
unsigned int size, unsigned int elementType);
@ -85,3 +94,4 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams *
int getMultiVecDeviceSize(void* deviceVec);
int getMultiVecDeviceCount(void* deviceVec);
int getMultiVecDevicePitch(void* deviceVec);
int getMultiVecDeviceStorage2d(void* deviceVec);
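/* Layout note (an assumption, not stated by this interface): with
   PSB_VECT_STOR_BY_COLS each of the count vectors is expected to be stored
   contiguously with leading dimension pitch, i.e. entry i of vector k at
   v[k*pitch + i]; PSB_VECT_STOR_BY_ROWS would interleave the vectors
   row by row instead. */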

@ -7,16 +7,16 @@ HERE=..
OBJS=psb_s_prec_type_impl.o psb_d_prec_type_impl.o \
psb_c_prec_type_impl.o psb_z_prec_type_impl.o \
psb_d_diagprec_impl.o psb_d_bjacprec_impl.o psb_d_nullprec_impl.o \
psb_dilu_fct.o psb_d_ilu0_fact.o psb_d_iluk_fact.o psb_d_ilut_fact.o \
psb_d_ilu0_fact.o psb_d_iluk_fact.o psb_d_ilut_fact.o \
psb_dprecbld.o psb_dprecinit.o \
psb_s_diagprec_impl.o psb_s_bjacprec_impl.o psb_s_nullprec_impl.o \
psb_silu_fct.o psb_s_ilu0_fact.o psb_s_iluk_fact.o psb_s_ilut_fact.o \
psb_s_ilu0_fact.o psb_s_iluk_fact.o psb_s_ilut_fact.o \
psb_sprecbld.o psb_sprecinit.o \
psb_c_diagprec_impl.o psb_c_bjacprec_impl.o psb_c_nullprec_impl.o \
psb_cilu_fct.o psb_c_ilu0_fact.o psb_c_iluk_fact.o psb_c_ilut_fact.o \
psb_c_ilu0_fact.o psb_c_iluk_fact.o psb_c_ilut_fact.o \
psb_cprecbld.o psb_cprecinit.o \
psb_z_diagprec_impl.o psb_z_bjacprec_impl.o psb_z_nullprec_impl.o \
psb_zilu_fct.o psb_z_ilu0_fact.o psb_z_iluk_fact.o psb_z_ilut_fact.o \
psb_z_ilu0_fact.o psb_z_iluk_fact.o psb_z_ilut_fact.o \
psb_zprecbld.o psb_zprecinit.o \
psb_c_sparsify.o psb_d_sparsify.o psb_s_sparsify.o psb_z_sparsify.o \
psb_crwclip.o psb_drwclip.o psb_srwclip.o psb_zrwclip.o \

@ -1,438 +0,0 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
subroutine psb_cilu_fct(a,l,u,d,info,blck)
!
! This routine copies and factors "on the fly" from A and BLCK
! into L/D/U.
!
!
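! The algorithm is the classical row-wise (IKJ) ILU(0): for each row i the
! multiplier l(i,k) is obtained as a(i,k)*d(k) (d holds the inverses of the
! pivots), row i is then updated with row k of U, and finally row i of U is
! scaled by the inverted pivot so that both factors apply with unit diagonal.
!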
use psb_base_mod
implicit none
! .. Scalar Arguments ..
integer(psb_ipk_), intent(out) :: info
! .. Array Arguments ..
type(psb_cspmat_type),intent(in) :: a
type(psb_c_csr_sparse_mat),intent(inout) :: l,u
type(psb_cspmat_type),intent(in), optional, target :: blck
complex(psb_spk_), intent(inout) :: d(:)
! .. Local Scalars ..
integer(psb_ipk_) :: l1, l2,m,err_act
type(psb_cspmat_type), pointer :: blck_
character(len=20) :: name, ch_err
name='psb_ilu_fct'
info = psb_success_
call psb_erractionsave(err_act)
! .. Executable Statements ..
!
if (present(blck)) then
blck_ => blck
else
allocate(blck_,stat=info)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate')
goto 9999
end if
call blck_%csall(izero,izero,info,ione)
endif
call psb_cilu_fctint(m,a%get_nrows(),a,blck_%get_nrows(),blck_,&
& d,l%val,l%ja,l%irp,u%val,u%ja,u%irp,l1,l2,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_cilu_fctint'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
call l%set_triangle()
call l%set_lower()
call l%set_unit()
call u%set_triangle()
call u%set_upper()
call u%set_unit()
call l%set_nrows(m)
call l%set_ncols(m)
call u%set_nrows(m)
call u%set_ncols(m)
if (present(blck)) then
blck_ => null()
else
call blck_%free()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_free'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
deallocate(blck_)
endif
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
contains
subroutine psb_cilu_fctint(m,ma,a,mb,b,&
& d,laspk,lia1,lia2,uaspk,uia1,uia2,l1,l2,info)
implicit none
type(psb_cspmat_type) :: a,b
integer(psb_ipk_) :: m,ma,mb,l1,l2,info
integer(psb_ipk_), dimension(:) :: lia1,lia2,uia1,uia2
complex(psb_spk_), dimension(:) :: laspk,uaspk,d
integer(psb_ipk_) :: i,j,k,l,low1,low2,kk,jj,ll, irb, ktrw,err_act, nz
complex(psb_spk_) :: dia,temp
integer(psb_ipk_), parameter :: nrb=60
type(psb_c_coo_sparse_mat) :: trw
integer(psb_ipk_) :: int_err(5)
character(len=20) :: name, ch_err
name='psb_cilu_fctint'
if(psb_get_errstatus() /= 0) return
info=psb_success_
call psb_erractionsave(err_act)
call trw%allocate(izero,izero,ione)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_all'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
lia2(1) = 1
uia2(1) = 1
l1=0
l2=0
m = ma+mb
do i = 1, ma
d(i) = czero
!
!
select type(aa => a%a)
type is (psb_c_csr_sparse_mat)
do j = aa%irp(i), aa%irp(i+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i,i+irb-1,trw,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
nz = trw%get_nzeros()
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
!!$
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloop: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloop
end if
enddo
!
else if (j == i) then
! j=i update diagonal
! write(psb_err_unit,*)'updating dia',dia,'temp',temp,'jj',jj,'u%aspk',uaspk(jj)
dia = dia - temp*uaspk(jj)
! write(psb_err_unit,*)'dia',dia,'temp',temp,'jj',jj,'aspk',uaspk(jj)
cycle updateloop
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloop
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloop
enddo
!
!
! Non singularity
!
if (abs(dia) < s_epstol) then
!
! Pivot too small: unstable factorization
!
info = psb_err_pivot_too_small_
int_err(1) = i
write(ch_err,'(g20.10)') abs(dia)
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = cone/dia
end if
d(i) = dia
! write(psb_err_unit,*)'diag(',i,')=',d(i)
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
do i = ma+1, m
d(i) = czero
select type(aa => b%a)
type is (psb_c_csr_sparse_mat)
do j = aa%irp(i-ma), aa%irp(i-ma+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i-ma,i-ma+irb-1,trw,info)
nz = trw%get_nzeros()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloopb: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloopb
end if
enddo
!
else if (j == i) then
! j=i update diagonal
dia = dia - temp*uaspk(jj)
cycle updateloopb
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloopb
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloopb
enddo
!
!
! Non singularity
!
if (abs(dia) < s_epstol) then
!
! Pivot too small: unstable factorization
!
int_err(1) = i
write(ch_err,'(g20.10)') abs(dia)
info = psb_err_pivot_too_small_
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = cone/dia
end if
d(i) = dia
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
call trw%free()
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
end subroutine psb_cilu_fctint
end subroutine psb_cilu_fct

@ -1,441 +0,0 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
subroutine psb_dilu_fct(a,l,u,d,info,blck)
!
! This routine copies and factors "on the fly" from A and BLCK
! into L/D/U.
!
!
use psb_base_mod
implicit none
! .. Scalar Arguments ..
integer(psb_ipk_), intent(out) :: info
! .. Array Arguments ..
type(psb_dspmat_type),intent(in) :: a
type(psb_d_csr_sparse_mat),intent(inout) :: l,u
type(psb_dspmat_type),intent(in), optional, target :: blck
real(psb_dpk_), intent(inout) :: d(:)
! .. Local Scalars ..
integer(psb_ipk_) :: l1,l2,m,err_act
type(psb_dspmat_type), pointer :: blck_
character(len=20) :: name, ch_err
name='psb_ilu_fct'
info = psb_success_
call psb_erractionsave(err_act)
! .. Executable Statements ..
!
if (present(blck)) then
blck_ => blck
else
allocate(blck_,stat=info)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate')
goto 9999
end if
call blck_%csall(izero,izero,info,ione)
endif
call psb_dilu_fctint(m,a%get_nrows(),a,blck_%get_nrows(),blck_,&
& d,l%val,l%ja,l%irp,u%val,u%ja,u%irp,l1,l2,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_dilu_fctint'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
call l%set_triangle()
call l%set_lower()
call l%set_unit()
call u%set_triangle()
call u%set_upper()
call u%set_unit()
call l%set_nrows(m)
call l%set_ncols(m)
call u%set_nrows(m)
call u%set_ncols(m)
if (present(blck)) then
blck_ => null()
else
call blck_%free()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_free'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
deallocate(blck_)
endif
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
contains
subroutine psb_dilu_fctint(m,ma,a,mb,b,&
& d,laspk,lia1,lia2,uaspk,uia1,uia2,l1,l2,info)
use psb_mat_mod
implicit none
type(psb_dspmat_type), target :: a
type(psb_dspmat_type), target :: b
integer(psb_ipk_) :: m,ma,mb,l1,l2,info
integer(psb_ipk_), dimension(:) :: lia1,lia2,uia1,uia2
real(psb_dpk_), dimension(:) :: laspk,uaspk,d
integer(psb_ipk_) :: i,j,k,l,low1,low2,kk,jj,ll, irb, ktrw,err_act, nz
real(psb_dpk_) :: dia,temp
integer(psb_ipk_), parameter :: nrb=60
type(psb_d_coo_sparse_mat) :: trw
integer(psb_ipk_) :: int_err(5)
character(len=20) :: name, ch_err
name='psb_dilu_fctint'
if(psb_get_errstatus() /= 0) return
info=psb_success_
call psb_erractionsave(err_act)
call trw%allocate(izero,izero,ione)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_all'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
lia2(1) = 1
uia2(1) = 1
l1=0
l2=0
m = ma+mb
do i = 1, ma
d(i) = dzero
!
!
select type(aa => a%a)
type is (psb_d_csr_sparse_mat)
do j = aa%irp(i), aa%irp(i+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i,i+irb-1,trw,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
nz = trw%get_nzeros()
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
!!$
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloop: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloop
end if
enddo
!
else if (j == i) then
! j=i update diagonal
! write(psb_err_unit,*)'updating dia',dia,'temp',temp,'jj',jj,'u%aspk',uaspk(jj)
dia = dia - temp*uaspk(jj)
! write(psb_err_unit,*)'dia',dia,'temp',temp,'jj',jj,'aspk',uaspk(jj)
cycle updateloop
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloop
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloop
enddo
!
!
! Non singularity
!
if (dabs(dia) < d_epstol) then
!
! Pivot too small: unstable factorization
!
info = psb_err_pivot_too_small_
int_err(1) = i
write(ch_err,'(g20.10)') dia
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = done/dia
end if
d(i) = dia
! write(psb_err_unit,*)'diag(',i,')=',d(i)
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
do i = ma+1, m
d(i) = dzero
select type(aa => b%a)
type is (psb_d_csr_sparse_mat)
do j = aa%irp(i-ma), aa%irp(i-ma+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i-ma,i-ma+irb-1,trw,info)
nz = trw%get_nzeros()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloopb: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloopb
end if
enddo
!
else if (j == i) then
! j=i update diagonal
dia = dia - temp*uaspk(jj)
cycle updateloopb
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloopb
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloopb
enddo
!
!
! Non singularity
!
if (dabs(dia) < d_epstol) then
!
! Pivot too small: unstable factorization
!
int_err(1) = i
write(ch_err,'(g20.10)') dia
info = psb_err_pivot_too_small_
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = done/dia
end if
d(i) = dia
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
call trw%free()
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
end subroutine psb_dilu_fctint
end subroutine psb_dilu_fct

@ -1,440 +0,0 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
subroutine psb_silu_fct(a,l,u,d,info,blck)
!
! This routine copies and factors "on the fly" from A and BLCK
! into L/D/U.
!
!
use psb_base_mod
implicit none
! .. Scalar Arguments ..
integer(psb_ipk_), intent(out) :: info
! .. Array Arguments ..
type(psb_sspmat_type),intent(in) :: a
type(psb_s_csr_sparse_mat),intent(inout) :: l,u
type(psb_sspmat_type),intent(in), optional, target :: blck
real(psb_spk_), intent(inout) :: d(:)
! .. Local Scalars ..
integer(psb_ipk_) :: l1,l2,m,err_act
type(psb_sspmat_type), pointer :: blck_
character(len=20) :: name, ch_err
name='psb_ilu_fct'
info = psb_success_
call psb_erractionsave(err_act)
! .. Executable Statements ..
!
if (present(blck)) then
blck_ => blck
else
allocate(blck_,stat=info)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate')
goto 9999
end if
call blck_%csall(izero,izero,info,ione)
endif
call psb_silu_fctint(m,a%get_nrows(),a,blck_%get_nrows(),blck_,&
& d,l%val,l%ja,l%irp,u%val,u%ja,u%irp,l1,l2,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_silu_fctint'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
call l%set_triangle()
call l%set_lower()
call l%set_unit()
call u%set_triangle()
call u%set_upper()
call u%set_unit()
call l%set_nrows(m)
call l%set_ncols(m)
call u%set_nrows(m)
call u%set_ncols(m)
if (present(blck)) then
blck_ => null()
else
call blck_%free()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_free'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
deallocate(blck_)
endif
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
contains
subroutine psb_silu_fctint(m,ma,a,mb,b,&
& d,laspk,lia1,lia2,uaspk,uia1,uia2,l1,l2,info)
use psb_mat_mod
implicit none
type(psb_sspmat_type) :: a
type(psb_sspmat_type) :: b
integer(psb_ipk_) :: m,ma,mb,l1,l2,info
integer(psb_ipk_), dimension(:) :: lia1,lia2,uia1,uia2
real(psb_spk_), dimension(:) :: laspk,uaspk,d
integer(psb_ipk_) :: i,j,k,l,low1,low2,kk,jj,ll, irb, ktrw,err_act, nz
real(psb_spk_) :: dia,temp
integer(psb_ipk_), parameter :: nrb=60
type(psb_s_coo_sparse_mat) :: trw
integer(psb_ipk_) :: int_err(5)
character(len=20) :: name, ch_err
name='psb_silu_fctint'
if(psb_get_errstatus() /= 0) return
info=psb_success_
call psb_erractionsave(err_act)
call trw%allocate(izero,izero,ione)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_all'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
lia2(1) = 1
uia2(1) = 1
l1=0
l2=0
m = ma+mb
do i = 1, ma
d(i) = szero
!
!
select type(aa => a%a)
type is (psb_s_csr_sparse_mat)
do j = aa%irp(i), aa%irp(i+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i,i+irb-1,trw,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
nz = trw%get_nzeros()
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
!!$
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloop: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloop
end if
enddo
!
else if (j == i) then
! j=i update diagonal
! write(psb_err_unit,*)'updating dia',dia,'temp',temp,'jj',jj,'u%aspk',uaspk(jj)
dia = dia - temp*uaspk(jj)
! write(psb_err_unit,*)'dia',dia,'temp',temp,'jj',jj,'aspk',uaspk(jj)
cycle updateloop
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloop
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloop
enddo
!
!
! Non singularity
!
if (abs(dia) < s_epstol) then
!
! Pivot too small: unstable factorization
!
info = psb_err_pivot_too_small_
int_err(1) = i
write(ch_err,'(g20.10)') dia
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = sone/dia
end if
d(i) = dia
! write(psb_err_unit,*)'diag(',i,')=',d(i)
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
do i = ma+1, m
d(i) = szero
select type(aa => b%a)
type is (psb_s_csr_sparse_mat)
do j = aa%irp(i-ma), aa%irp(i-ma+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i-ma,i-ma+irb-1,trw,info)
nz = trw%get_nzeros()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloopb: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloopb
end if
enddo
!
else if (j == i) then
! j=i update diagonal
dia = dia - temp*uaspk(jj)
cycle updateloopb
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloopb
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloopb
enddo
!
!
! Non singularity
!
if (abs(dia) < s_epstol) then
!
! Pivot too small: unstable factorization
!
int_err(1) = i
write(ch_err,'(g20.10)') dia
info = psb_err_pivot_too_small_
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = sone/dia
end if
d(i) = dia
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
call trw%free()
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
end subroutine psb_silu_fctint
end subroutine psb_silu_fct

@ -1,438 +0,0 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
subroutine psb_zilu_fct(a,l,u,d,info,blck)
!
! This routine copies and factors "on the fly" from A and BLCK
! into L/D/U.
!
!
use psb_base_mod
implicit none
! .. Scalar Arguments ..
integer(psb_ipk_), intent(out) :: info
! .. Array Arguments ..
type(psb_zspmat_type),intent(in) :: a
type(psb_z_csr_sparse_mat),intent(inout) :: l,u
type(psb_zspmat_type),intent(in), optional, target :: blck
complex(psb_dpk_), intent(inout) :: d(:)
! .. Local Scalars ..
integer(psb_ipk_) :: l1, l2,m,err_act
type(psb_zspmat_type), pointer :: blck_
character(len=20) :: name, ch_err
name='psb_ilu_fct'
info = psb_success_
call psb_erractionsave(err_act)
! .. Executable Statements ..
!
if (present(blck)) then
blck_ => blck
else
allocate(blck_,stat=info)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate')
goto 9999
end if
call blck_%csall(izero,izero,info,ione)
endif
call psb_zilu_fctint(m,a%get_nrows(),a,blck_%get_nrows(),blck_,&
& d,l%val,l%ja,l%irp,u%val,u%ja,u%irp,l1,l2,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_zilu_fctint'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
call l%set_triangle()
call l%set_lower()
call l%set_unit()
call u%set_triangle()
call u%set_upper()
call u%set_unit()
call l%set_nrows(m)
call l%set_ncols(m)
call u%set_nrows(m)
call u%set_ncols(m)
if (present(blck)) then
blck_ => null()
else
call blck_%free()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_free'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
deallocate(blck_)
endif
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
contains
subroutine psb_zilu_fctint(m,ma,a,mb,b,&
& d,laspk,lia1,lia2,uaspk,uia1,uia2,l1,l2,info)
implicit none
type(psb_zspmat_type) :: a,b
integer(psb_ipk_) :: m,ma,mb,l1,l2,info
integer(psb_ipk_), dimension(:) :: lia1,lia2,uia1,uia2
complex(psb_dpk_), dimension(:) :: laspk,uaspk,d
integer(psb_ipk_) :: i,j,k,l,low1,low2,kk,jj,ll, irb, ktrw,err_act, nz
complex(psb_dpk_) :: dia,temp
integer(psb_ipk_), parameter :: nrb=60
type(psb_z_coo_sparse_mat) :: trw
integer(psb_ipk_) :: int_err(5)
character(len=20) :: name, ch_err
name='psb_zilu_fctint'
if(psb_get_errstatus() /= 0) return
info=psb_success_
call psb_erractionsave(err_act)
call trw%allocate(izero,izero,ione)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_all'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
lia2(1) = 1
uia2(1) = 1
l1=0
l2=0
m = ma+mb
do i = 1, ma
d(i) = zzero
!
!
select type(aa => a%a)
type is (psb_z_csr_sparse_mat)
do j = aa%irp(i), aa%irp(i+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i,i+irb-1,trw,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
nz = trw%get_nzeros()
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
!!$
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloop: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloop
end if
enddo
!
else if (j == i) then
! j=i update diagonal
! write(psb_err_unit,*)'updating dia',dia,'temp',temp,'jj',jj,'u%aspk',uaspk(jj)
dia = dia - temp*uaspk(jj)
! write(psb_err_unit,*)'dia',dia,'temp',temp,'jj',jj,'aspk',uaspk(jj)
cycle updateloop
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloop
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloop
enddo
!
!
! Non singularity
!
if (abs(dia) < d_epstol) then
!
! Pivot too small: unstable factorization
!
info = psb_err_pivot_too_small_
int_err(1) = i
write(ch_err,'(g20.10)') abs(dia)
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = zone/dia
end if
d(i) = dia
! write(psb_err_unit,*)'diag(',i,')=',d(i)
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
do i = ma+1, m
d(i) = zzero
select type(aa => b%a)
type is (psb_z_csr_sparse_mat)
do j = aa%irp(i-ma), aa%irp(i-ma+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i-ma,i-ma+irb-1,trw,info)
nz = trw%get_nzeros()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloopb: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloopb
end if
enddo
!
else if (j == i) then
! j=i update diagonal
dia = dia - temp*uaspk(jj)
cycle updateloopb
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloopb
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloopb
enddo
!
!
! Non singularity
!
if (abs(dia) < d_epstol) then
!
! Pivot too small: unstable factorization
!
int_err(1) = i
write(ch_err,'(g20.10)') abs(dia)
info = psb_err_pivot_too_small_
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = zone/dia
end if
d(i) = dia
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
call trw%free()
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
end subroutine psb_zilu_fctint
end subroutine psb_zilu_fct

@ -596,7 +596,7 @@ program pdegenmm
! solver parameters
integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes
real(psb_dpk_) :: err, eps
integer, parameter :: ntests=50, ngpu=50, ncnv=20
integer, parameter :: ntests=50, ngpu=10, ncnv=20
type(psb_d_coo_sparse_mat), target :: acoo
type(psb_d_csr_sparse_mat), target :: acsr
type(psb_d_ell_sparse_mat), target :: aell
@ -613,6 +613,7 @@ program pdegenmm
type(psb_d_cuda_hybg_sparse_mat), target :: ahybg
#endif
type(psb_d_cuda_hlg_sparse_mat), target :: ahlg
! TODO: HDIAG and DNSG do not even have CSMM
type(psb_d_cuda_hdiag_sparse_mat), target :: ahdiag
type(psb_d_cuda_dnsg_sparse_mat), target :: adnsg
#endif
@ -658,12 +659,12 @@ program pdegenmm
!
! get parameters
!
!call get_parms(ctxt,nrhs,acfmt,agfmt,idim,tnd)
nrhs=2
acfmt='CSR'
agfmt='CSRG'
idim=2
tnd=.false.
call get_parms(ctxt,nrhs,acfmt,agfmt,idim,tnd)
!nrhs=8
!acfmt='CSR'
!agfmt='CSRG'
!idim=100
!tnd=.false.
call psb_init_timers()
!
! allocate and fill in the coefficient matrix and initial vectors
@ -803,6 +804,40 @@ program pdegenmm
x1 = b_mv%get_vect()
x2 = b_mv_g%get_vect()
! ! TODO test AXPBY
! call psb_geall(xg,desc_a,info)
! call psb_geasb(xg,desc_a,info,mold=tmold)
! call xg%set(done)
! call xg%sync()
! call psb_geall(bg,desc_a,info)
! call psb_geasb(bg,desc_a,info,mold=tmold)
! !call bg%set(done+done)
! ! TODO: spgpuDaxpby (axpbyMultiVecDeviceDouble) does not work
! call psb_geaxpby(done,xg,dzero,bg,desc_a,info)
! call psb_cuda_DeviceSync()
! write(*,*) 'BG ', bg%is_dev(), bg%is_host(), bg%is_sync()
! call bg%sync()
! write(*,*) 'BG ', bg%is_dev(), bg%is_host(), bg%is_sync()
! do i=1,8
! write(*,*) bg%v%v(i)
! end do
! return
! call x_mv_g%set(done)
! call x_mv_g%sync()
! call psb_geaxpby(done,x_mv_g,dzero,b_mv_g,desc_a,info)
! call b_mv_g%sync()
! do i=1,size(b_mv_g%v%v,1)
! write(*,*) b_mv_g%v%v(i,:)
! end do
! return
call psb_barrier(ctxt)
tt1 = psb_wtime()
do i=1,ntests
@ -876,7 +911,7 @@ program pdegenmm
write(psb_out_unit,'("Size of matrix: ",i20)') nr
write(psb_out_unit,'("Number of nonzeros: ",i20)') annz
write(psb_out_unit,'("Memory occupation: ",i20)') amatsize
flops = ntests*(2.d0*annz)
flops = ntests*(2.d0*annz)*nrhs
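! each sweep costs 2*annz flops per right-hand side, hence the extra
! factor nrhs on top of the ntests repetitions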
tflops = flops
gflops = flops * ngpu
write(psb_out_unit,'("Storage type for A: ",a)') a%get_fmt()
@ -935,7 +970,7 @@ program pdegenmm
write(psb_out_unit,*)
write(psb_out_unit,'("MBYTES/S sust. effective bandwidth (CPU) : ",F20.3)') bdwdth
#ifdef HAVE_CUDA
bdwdth = ngpu*ntests*nbytes/(gt2*1.d6)
bdwdth = nrhs*ngpu*ntests*nbytes/(gt2*1.d6)
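! the estimate assumes the matrix and vector data are streamed once
! per right-hand side, hence the extra factor nrhs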
write(psb_out_unit,'("MBYTES/S sust. effective bandwidth (GPU) : ",F20.3)') bdwdth
bdwdth = psb_cuda_MemoryPeakBandwidth()
write(psb_out_unit,'("MBYTES/S peak bandwidth (GPU) : ",F20.3)') bdwdth
@ -1018,4 +1053,4 @@ contains
end subroutine get_parms
end program pdegenmm
