ELG SpMM (not compiling)

6 months ago · 6b8199f84b
parent 9daa04c3dc
commit 6b8199f84b
23 changed files with 1076 additions and 31 deletions
--- a/cuda/elldev.c
+++ b/cuda/elldev.c
@ -148,6 +148,9 @@ int FallocEllDevice(void** deviceMat,unsigned int rows, unsigned int maxRowSize,
  return(i);
 }
 //
 // Single Precision Float
 //
 void sspmdmm_gpu(float *z,int s, int vPitch, float *y, float alpha, float* cM, int* rP, int* rS, 
 		 int avgRowSize, int maxRowSize, int rows, int pitch, float *x, float beta, int firstIndex)
 {
@ -168,7 +171,7 @@ void sspmdmm_gpu(float *z,int s, int vPitch, float *y, float alpha, float* cM, i
      x += vPitch;		
    }
 }
-//new
+
 int spmvEllDeviceFloat(void *deviceMat, float alpha, void* deviceX, 
 		       float beta, void* deviceY)
 { int i=SPGPU_SUCCESS;
@ -191,7 +194,31 @@ int spmvEllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
  return(i);
 }
 /*
  * spmmEllDeviceFloat: single-precision sparse matrix times multivector
  * product for a matrix held in a struct EllDevice.
  *
  *   deviceMat    pointer to a struct EllDevice
  *   alpha, beta  scalars forwarded unchanged to spgpuSellspmm
  *   deviceX      pointer to a struct MultiVectDevice used as the x operand
  *   deviceY      pointer to a struct MultiVectDevice, handed to the kernel
  *                wrapper as both its y input and its z output
  *
  * The number of vectors is taken from y->count_. Nothing is validated here:
  * matching x/y counts, x->size_ >= columns and y->size_ >= rows are the
  * caller's responsibility.
  *
  * Always returns SPGPU_SUCCESS.
  */
 int spmmEllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
 			float beta, void* deviceY)
 {
   struct EllDevice       *ell = (struct EllDevice *) deviceMat;
   struct MultiVectDevice *in  = (struct MultiVectDevice *) deviceX;
   struct MultiVectDevice *out = (struct MultiVectDevice *) deviceY;
   spgpuHandle_t ctx = psb_cudaGetHandle();
   float *outVals = (float *) out->v_;
   float *inVals  = (float *) in->v_;

   spgpuSellspmm(ctx, out->count_,
                 outVals, out->pitch_,        /* z: result               */
                 outVals, out->pitch_,        /* y: same buffer as z     */
                 alpha,
                 (float *) ell->cM, ell->rP,
                 ell->cMPitch, ell->rPPitch,
                 ell->rS, NULL,               /* no row reordering array */
                 ell->avgRowSize, ell->maxRowSize, ell->rows,
                 inVals, in->pitch_,
                 beta, ell->baseIndex);

   return SPGPU_SUCCESS;
 }
 //
 // Double Precision
 //
 void
 dspmdmm_gpu (double *z,int s, int vPitch, double *y, double alpha, double* cM, int* rP,
 	     int* rS, int avgRowSize, int maxRowSize, int rows, int pitch, 
@ -237,6 +264,31 @@ int spmvEllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
  return SPGPU_SUCCESS;
 }
 /*
  * spmmEllDeviceDouble: double-precision sparse matrix times multivector
  * product for a matrix held in a struct EllDevice.
  *
  *   deviceMat    pointer to a struct EllDevice
  *   alpha, beta  scalars forwarded unchanged to spgpuDellspmm
  *   deviceX      pointer to a struct MultiVectDevice used as the x operand
  *   deviceY      pointer to a struct MultiVectDevice, handed to the kernel
  *                wrapper as both its y input and its z output
  *
  * The number of vectors is taken from y->count_. Nothing is validated here:
  * matching x/y counts, x->size_ >= columns and y->size_ >= rows are the
  * caller's responsibility.
  *
  * Always returns SPGPU_SUCCESS.
  */
 int spmmEllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
 			double beta, void* deviceY)
 {
   struct EllDevice       *ell = (struct EllDevice *) deviceMat;
   struct MultiVectDevice *in  = (struct MultiVectDevice *) deviceX;
   struct MultiVectDevice *out = (struct MultiVectDevice *) deviceY;
   spgpuHandle_t ctx = psb_cudaGetHandle();
   double *outVals = (double *) out->v_;
   double *inVals  = (double *) in->v_;

   spgpuDellspmm(ctx, out->count_,
                 outVals, out->pitch_,        /* z: result               */
                 outVals, out->pitch_,        /* y: same buffer as z     */
                 alpha,
                 (double *) ell->cM, ell->rP,
                 ell->cMPitch, ell->rPPitch,
                 ell->rS, NULL,               /* no row reordering array */
                 ell->avgRowSize, ell->maxRowSize, ell->rows,
                 inVals, in->pitch_,
                 beta, ell->baseIndex);

   return SPGPU_SUCCESS;
 }
 //
 // Single Precision Float Complex
 //
 void
 cspmdmm_gpu (cuFloatComplex *z, int s, int vPitch, cuFloatComplex *y,  
 	     cuFloatComplex alpha, cuFloatComplex* cM,
@ -276,6 +328,31 @@ int spmvEllDeviceFloatComplex(void *deviceMat, float complex alpha, void* device
  return SPGPU_SUCCESS;
 }
 /*
  * spmmEllDeviceFloatComplex: single-precision complex sparse matrix times
  * multivector product for a matrix held in a struct EllDevice.
  *
  *   deviceMat    pointer to a struct EllDevice
  *   alpha, beta  scalars forwarded unchanged to spgpuCellspmm
  *   deviceX      pointer to a struct MultiVectDevice used as the x operand
  *   deviceY      pointer to a struct MultiVectDevice, handed to the kernel
  *                wrapper as both its y input and its z output
  *
  * The number of vectors is taken from y->count_. Nothing is validated here:
  * matching x/y counts, x->size_ >= columns and y->size_ >= rows are the
  * caller's responsibility.
  *
  * Always returns SPGPU_SUCCESS.
  */
 int spmmEllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX,
 			cuFloatComplex beta, void* deviceY)
 {
   struct EllDevice       *ell = (struct EllDevice *) deviceMat;
   struct MultiVectDevice *in  = (struct MultiVectDevice *) deviceX;
   struct MultiVectDevice *out = (struct MultiVectDevice *) deviceY;
   spgpuHandle_t ctx = psb_cudaGetHandle();
   cuFloatComplex *outVals = (cuFloatComplex *) out->v_;
   cuFloatComplex *inVals  = (cuFloatComplex *) in->v_;

   spgpuCellspmm(ctx, out->count_,
                 outVals, out->pitch_,        /* z: result               */
                 outVals, out->pitch_,        /* y: same buffer as z     */
                 alpha,
                 (cuFloatComplex *) ell->cM, ell->rP,
                 ell->cMPitch, ell->rPPitch,
                 ell->rS, NULL,               /* no row reordering array */
                 ell->avgRowSize, ell->maxRowSize, ell->rows,
                 inVals, in->pitch_,
                 beta, ell->baseIndex);

   return SPGPU_SUCCESS;
 }
 //
 // Double Precision Complex
 //
 void
 zspmdmm_gpu (cuDoubleComplex *z, int s, int vPitch, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* cM,
 	     int* rP, int* rS, int avgRowSize, int maxRowSize, int rows, int pitch,
@ -314,6 +391,28 @@ int spmvEllDeviceDoubleComplex(void *deviceMat, double complex alpha, void* devi
  return SPGPU_SUCCESS;
 }
 /*
  * spmmEllDeviceDoubleComplex: double-precision complex sparse matrix times
  * multivector product for a matrix held in a struct EllDevice.
  *
  *   deviceMat    pointer to a struct EllDevice
  *   alpha, beta  scalars forwarded unchanged to spgpuZellspmm
  *   deviceX      pointer to a struct MultiVectDevice used as the x operand
  *   deviceY      pointer to a struct MultiVectDevice, handed to the kernel
  *                wrapper as both its y input and its z output
  *
  * The number of vectors is taken from y->count_. Nothing is validated here:
  * matching x/y counts, x->size_ >= columns and y->size_ >= rows are the
  * caller's responsibility.
  *
  * Always returns SPGPU_SUCCESS.
  */
 int spmmEllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX,
 			cuDoubleComplex beta, void* deviceY)
 {
   struct EllDevice       *ell = (struct EllDevice *) deviceMat;
   struct MultiVectDevice *in  = (struct MultiVectDevice *) deviceX;
   struct MultiVectDevice *out = (struct MultiVectDevice *) deviceY;
   spgpuHandle_t ctx = psb_cudaGetHandle();
   cuDoubleComplex *outVals = (cuDoubleComplex *) out->v_;
   cuDoubleComplex *inVals  = (cuDoubleComplex *) in->v_;

   spgpuZellspmm(ctx, out->count_,
                 outVals, out->pitch_,        /* z: result               */
                 outVals, out->pitch_,        /* y: same buffer as z     */
                 alpha,
                 (cuDoubleComplex *) ell->cM, ell->rP,
                 ell->cMPitch, ell->rPPitch,
                 ell->rS, NULL,               /* no row reordering array */
                 ell->avgRowSize, ell->maxRowSize, ell->rows,
                 inVals, in->pitch_,
                 beta, ell->baseIndex);

   return SPGPU_SUCCESS;
 }
 int writeEllDeviceFloat(void* deviceMat, float* val, int* ja, int ldj, int* irn, int *idiag)
 { int i;
  struct EllDevice *devMat = (struct EllDevice *) deviceMat;
--- a/cuda/elldev.h
+++ b/cuda/elldev.h
@ -113,16 +113,6 @@ int readEllDeviceDouble(void* deviceMat, double* val, int* ja, int ldj, int* irn
 int readEllDeviceFloatComplex(void* deviceMat, float complex* val, int* ja, int ldj, int* irn, int *idiag);
 int readEllDeviceDoubleComplex(void* deviceMat, double complex* val, int* ja, int ldj, int* irn, int *idiag);
 int spmvEllDeviceFloat(void *deviceMat, float alpha, void* deviceX, 
 		       float beta, void* deviceY);
 int spmvEllDeviceDouble(void *deviceMat, double alpha, void* deviceX, 
 			double beta, void* deviceY);
 int spmvEllDeviceFloatComplex(void *deviceMat, float complex alpha, void* deviceX,
 			      float complex beta, void* deviceY);
 int spmvEllDeviceDoubleComplex(void *deviceMat, double complex alpha, void* deviceX,
 			       double complex beta, void* deviceY);
 int psiCopyCooToElgFloat(int nr, int nc, int nza, int hacksz, int ldv, int nzm, int *irn,
 			  int *idisp, int *ja, float *val, void *deviceMat);
--- a/cuda/elldev_mod.F90
+++ b/cuda/elldev_mod.F90
@ -318,4 +318,35 @@ module elldev_mod
    end function spmvEllDeviceDoubleComplex
  end interface
  ! Generic interface over the C entry points spmmEllDevice{Float,Double,
  ! FloatComplex,DoubleComplex}, selected by the kind of alpha/beta.
  ! deviceMat, x and y are opaque C pointers passed by value; the integer
  ! result is the C function's return code.
  !
  ! NOTE(review): the C definitions of the two complex variants take
  ! cuFloatComplex / cuDoubleComplex by value, while these bindings pass
  ! complex(c_float_complex) / complex(c_double_complex) by value -- confirm
  ! the two are passed compatibly on every supported platform.
  interface spmmEllDevice
    ! Single precision real.
    function spmmEllDeviceFloat(deviceMat,alpha,x,beta,y) &
         & result(res) bind(c,name='spmmEllDeviceFloat')
      use iso_c_binding
      integer(c_int)		:: res
      type(c_ptr), value 	:: deviceMat, x, y
      real(c_float),value     	:: alpha, beta
    end function spmmEllDeviceFloat
    ! Double precision real.
    function spmmEllDeviceDouble(deviceMat,alpha,x,beta,y) &
         & result(res) bind(c,name='spmmEllDeviceDouble')
      use iso_c_binding
      integer(c_int)		:: res
      type(c_ptr), value	:: deviceMat, x, y 
      real(c_double),value     	:: alpha,  beta
    end function spmmEllDeviceDouble
    ! Single precision complex.
    function spmmEllDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
         & result(res) bind(c,name='spmmEllDeviceFloatComplex')
      use iso_c_binding
      integer(c_int)		     :: res
      type(c_ptr), value	     :: deviceMat, x, y 
      complex(c_float_complex),value :: alpha,  beta
    end function spmmEllDeviceFloatComplex
    ! Double precision complex.
    function spmmEllDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
         & result(res) bind(c,name='spmmEllDeviceDoubleComplex')
      use iso_c_binding
      integer(c_int)		      :: res
      type(c_ptr), value	      :: deviceMat, x, y 
      complex(c_double_complex),value :: alpha,  beta
    end function spmmEllDeviceDoubleComplex
  end interface
 end module elldev_mod
--- a/cuda/hlldev.c
+++ b/cuda/hlldev.c
@ -264,7 +264,7 @@ int spmvHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* devic
  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
-  spgpuShellspmv (handle, (cuFloatComplex *)y->v_, (cuFloatComplex *)y->v_, alpha,
+  spgpuChellspmv (handle, (cuFloatComplex *)y->v_, (cuFloatComplex *)y->v_, alpha,
          (cuFloatComplex *)devMat->cM, devMat->rP,
          devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
 		  devMat->avgNzr, devMat->rows, (cuFloatComplex *)x->v_, beta, devMat->baseIndex);
@ -285,7 +285,7 @@ int spmmHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* devic
  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
-  spgpuShellspmm(handle, y->count_, (cuFloatComplex *)y->v_, y->pitch_,
+  spgpuChellspmm(handle, y->count_, (cuFloatComplex *)y->v_, y->pitch_,
          (cuFloatComplex*)y->v_, y->pitch_, alpha, (cuFloatComplex*)devMat->cM,
          devMat->rP, devMat->hackSize, devMat->hackOffs,
          devMat->rS, NULL, devMat->rows, (cuFloatComplex*)x->v_,
@ -310,7 +310,7 @@ int spmvHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* dev
  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
-  spgpuShellspmv (handle, (cuDoubleComplex *)y->v_, (cuDoubleComplex *)y->v_, alpha,
+  spgpuZhellspmv (handle, (cuDoubleComplex *)y->v_, (cuDoubleComplex *)y->v_, alpha,
          (cuDoubleComplex *)devMat->cM, devMat->rP,
          devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
 		  devMat->avgNzr, devMat->rows, (cuDoubleComplex *)x->v_, beta, devMat->baseIndex);
@ -331,7 +331,7 @@ int spmmHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* dev
  /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
  /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
-  spgpuShellspmm(handle, y->count_, (cuDoubleComplex *)y->v_, y->pitch_,
+  spgpuZhellspmm(handle, y->count_, (cuDoubleComplex *)y->v_, y->pitch_,
          (cuDoubleComplex*)y->v_, y->pitch_, alpha, (cuDoubleComplex*)devMat->cM,
          devMat->rP, devMat->hackSize, devMat->hackOffs,
          devMat->rS, NULL, devMat->rows, (cuDoubleComplex*)x->v_,
--- a/cuda/impl/psb_c_cuda_elg_csmm.F90
+++ b/cuda/impl/psb_c_cuda_elg_csmm.F90
@ -98,16 +98,16 @@ subroutine psb_c_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
    if (info == 0) &
         & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_float)
    if (info == 0) &
-         & info = writeMultiVecDevice(gpX,x,nxy)
+         & info = writeMultiVecDevice(gpX,x,size(x,1))
    if (info == 0) &
         & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_float)
    if (info == 0) &
-         & info = writeMultiVecDevice(gpY,y,nxy)
+         & info = writeMultiVecDevice(gpY,y,size(y,1))
    if (info == 0)  &
-         & info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
+         & info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
    if (info == 0) &
-         & info = readMultiVecDevice(gpY,y,nxy)
+         & info = readMultiVecDevice(gpY,y,size(y,1))
    if (info /= 0) goto 9999
    call freeMultiVecDevice(gpX)
    call freeMultiVecDevice(gpY)
--- a/cuda/impl/psb_c_cuda_elg_vect_mv.F90
+++ b/cuda/impl/psb_c_cuda_elg_vect_mv.F90
@ -119,3 +119,94 @@ subroutine psb_c_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans)
  return
 end subroutine psb_c_cuda_elg_vect_mv
 !
 ! psb_c_cuda_elg_multivect_mv
 !
 ! Product of an ELG matrix with x, accumulated into y, dispatched on where
 ! the data currently live:
 !   - trans = 'T' or 'C': matrix and vectors are synced as needed and the
 !     parent psb_c_ell_sparse_mat%spmm is used; y is then flagged with
 !     set_host().
 !   - otherwise, when both x and y are psb_c_vect_cuda, spmmEllDevice is
 !     called on the device handles and y is flagged with set_dev().
 !   - any other vector type: a%spmm is applied to copies obtained with
 !     get_vect() and y is rebuilt from the result with bld().
 !
 ! Arguments
 !   alpha, beta  scalars forwarded to the product routines
 !   a            the ELG matrix; must be assembled (a%is_asb())
 !   x            input vector
 !   y            input/output vector
 !   info         psb_success_ on success; psb_err_invalid_mat_state_ when a
 !                is not assembled; psb_err_from_subroutine_ai_ when
 !                spmmEllDevice returns nonzero; otherwise whatever the
 !                called spmm reports
 !   trans        optional, defaults to 'N'
 !
 ! NOTE(review): the interface for this routine in psb_c_cuda_elg_mat_mod
 ! declares x and y as class(psb_c_base_multivect_type), whereas this body
 ! declares class(psb_c_base_vect_type), uses rank-1 rx/ry and selects on
 ! psb_c_vect_cuda. Confirm which set of types is intended and make the two
 ! agree.
 !
 subroutine psb_c_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans) 
  use psb_base_mod
  use elldev_mod
  use psb_vectordev_mod
  use psb_c_cuda_elg_mat_mod, psb_protect_name => psb_c_cuda_elg_multivect_mv
  use psb_c_cuda_vect_mod
  implicit none 
  class(psb_c_cuda_elg_sparse_mat), intent(in) :: a
  complex(psb_spk_), intent(in)       :: alpha, beta
  class(psb_c_base_vect_type), intent(inout) :: x
  class(psb_c_base_vect_type), intent(inout) :: y
  integer(psb_ipk_), intent(out)             :: info
  character, optional, intent(in)  :: trans
  complex(psb_spk_), allocatable      :: rx(:), ry(:)
  logical           :: tra
  character         :: trans_
  Integer(Psb_ipk_) :: err_act
  ! The literal is 23 characters long; with len=20 the trailing '_mv' was
  ! silently truncated in every error message.
  character(len=23) :: name='c_cuda_elg_multivect_mv'
  call psb_erractionsave(err_act)
  info = psb_success_
  if (present(trans)) then
    trans_ = trans
  else
    trans_ = 'N'
  end if
  if (.not.a%is_asb()) then 
    info = psb_err_invalid_mat_state_
    call psb_errpush(info,name)
    goto 9999
  endif
  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
  if (tra) then 
    ! Transposed products go through the parent ELL implementation.
    if (a%is_dev()) call a%sync()
    if (.not.x%is_host()) call x%sync()
    ! y is only synced when beta is nonzero -- presumably because its
    ! previous contents do not contribute otherwise.
    if (beta /= czero) then 
      if (.not.y%is_host()) call y%sync()
    end if
    call a%psb_c_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans) 
    call y%set_host()
  else
    if (a%is_host()) call a%sync()    
    select type (xx => x) 
    type is (psb_c_vect_cuda)
      select type(yy => y) 
      type is (psb_c_vect_cuda)
        ! Both operands are CUDA vectors: run the product on the device.
        if (a%is_host()) call a%sync()
        if (xx%is_host()) call xx%sync()
        if (beta /= czero) then 
          if (yy%is_host()) call yy%sync()
        end if
        info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
             & beta,yy%deviceVect)
        if (info /= 0) then 
          call psb_errpush(psb_err_from_subroutine_ai_,name,&
               & a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/))
          info = psb_err_from_subroutine_ai_
          goto 9999
        end if
        call yy%set_dev()
      class default
        ! x is a CUDA vector but y is not: work on plain array copies.
        if (a%is_dev()) call a%sync()
        rx = xx%get_vect()
        ry = y%get_vect()
        call a%spmm(alpha,rx,beta,ry,info)
        call y%bld(ry)
      end select
    class default
      ! x is not a CUDA vector: work on plain array copies.
      if (a%is_dev()) call a%sync()
      rx = x%get_vect()
      ry = y%get_vect()
      call a%spmm(alpha,rx,beta,ry,info)
      call y%bld(ry)
    end select
  end if
  if (info /= 0) goto 9999
  call psb_erractionrestore(err_act)
  return
 9999 call psb_error_handler(err_act)
  return
 end subroutine psb_c_cuda_elg_multivect_mv
--- a/cuda/impl/psb_d_cuda_elg_csmm.F90
+++ b/cuda/impl/psb_d_cuda_elg_csmm.F90
@ -105,7 +105,7 @@ subroutine psb_d_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
         & info = writeMultiVecDevice(gpY,y,size(y,1))
    if (info == 0)  &
-         & info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
+         & info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
    if (info == 0) &
         & info = readMultiVecDevice(gpY,y,size(y,1))
    if (info /= 0) goto 9999
--- a/cuda/impl/psb_d_cuda_elg_vect_mv.F90
+++ b/cuda/impl/psb_d_cuda_elg_vect_mv.F90
@ -176,7 +176,7 @@ subroutine psb_d_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
          if (beta /= dzero) then 
            if (yy%is_host()) call yy%sync()
          end if
-          info = spmvEllDevice(a%deviceMat,alpha,xx%deviceVect,&
+          info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
               & beta,yy%deviceVect)
          if (info /= 0) then 
            call psb_errpush(psb_err_from_subroutine_ai_,name,&
--- a/cuda/impl/psb_s_cuda_elg_csmm.F90
+++ b/cuda/impl/psb_s_cuda_elg_csmm.F90
@ -98,16 +98,16 @@ subroutine psb_s_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
    if (info == 0) &
         & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_float)
    if (info == 0) &
-         & info = writeMultiVecDevice(gpX,x,nxy)
+         & info = writeMultiVecDevice(gpX,x,size(x,1))
    if (info == 0) &
         & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_float)
    if (info == 0) &
-         & info = writeMultiVecDevice(gpY,y,nxy)
+         & info = writeMultiVecDevice(gpY,y,size(y,1))
    if (info == 0)  &
-         & info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
+         & info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
    if (info == 0) &
-         & info = readMultiVecDevice(gpY,y,nxy)
+         & info = readMultiVecDevice(gpY,y,size(y,1))
    if (info /= 0) goto 9999
    call freeMultiVecDevice(gpX)
    call freeMultiVecDevice(gpY)
--- a/cuda/impl/psb_s_cuda_elg_vect_mv.F90
+++ b/cuda/impl/psb_s_cuda_elg_vect_mv.F90
@ -119,3 +119,94 @@ subroutine psb_s_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans)
  return
 end subroutine psb_s_cuda_elg_vect_mv
 !
 ! psb_s_cuda_elg_multivect_mv
 !
 ! Product of an ELG matrix with x, accumulated into y, dispatched on where
 ! the data currently live:
 !   - trans = 'T' or 'C': matrix and vectors are synced as needed and the
 !     parent psb_s_ell_sparse_mat%spmm is used; y is then flagged with
 !     set_host().
 !   - otherwise, when both x and y are psb_s_vect_cuda, spmmEllDevice is
 !     called on the device handles and y is flagged with set_dev().
 !   - any other vector type: a%spmm is applied to copies obtained with
 !     get_vect() and y is rebuilt from the result with bld().
 !
 ! Arguments
 !   alpha, beta  scalars forwarded to the product routines
 !   a            the ELG matrix; must be assembled (a%is_asb())
 !   x            input vector
 !   y            input/output vector
 !   info         psb_success_ on success; psb_err_invalid_mat_state_ when a
 !                is not assembled; psb_err_from_subroutine_ai_ when
 !                spmmEllDevice returns nonzero; otherwise whatever the
 !                called spmm reports
 !   trans        optional, defaults to 'N'
 !
 ! NOTE(review): the interface for this routine in psb_s_cuda_elg_mat_mod
 ! declares x and y as class(psb_s_base_multivect_type), whereas this body
 ! declares class(psb_s_base_vect_type), uses rank-1 rx/ry and selects on
 ! psb_s_vect_cuda. Confirm which set of types is intended and make the two
 ! agree.
 !
 subroutine psb_s_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans) 
  use psb_base_mod
  use elldev_mod
  use psb_vectordev_mod
  use psb_s_cuda_elg_mat_mod, psb_protect_name => psb_s_cuda_elg_multivect_mv
  use psb_s_cuda_vect_mod
  implicit none 
  class(psb_s_cuda_elg_sparse_mat), intent(in) :: a
  real(psb_spk_), intent(in)       :: alpha, beta
  class(psb_s_base_vect_type), intent(inout) :: x
  class(psb_s_base_vect_type), intent(inout) :: y
  integer(psb_ipk_), intent(out)             :: info
  character, optional, intent(in)  :: trans
  real(psb_spk_), allocatable      :: rx(:), ry(:)
  logical           :: tra
  character         :: trans_
  Integer(Psb_ipk_) :: err_act
  ! The literal is 23 characters long; with len=20 the trailing '_mv' was
  ! silently truncated in every error message.
  character(len=23) :: name='s_cuda_elg_multivect_mv'
  call psb_erractionsave(err_act)
  info = psb_success_
  if (present(trans)) then
    trans_ = trans
  else
    trans_ = 'N'
  end if
  if (.not.a%is_asb()) then 
    info = psb_err_invalid_mat_state_
    call psb_errpush(info,name)
    goto 9999
  endif
  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
  if (tra) then 
    ! Transposed products go through the parent ELL implementation.
    if (a%is_dev()) call a%sync()
    if (.not.x%is_host()) call x%sync()
    ! y is only synced when beta is nonzero -- presumably because its
    ! previous contents do not contribute otherwise.
    if (beta /= szero) then 
      if (.not.y%is_host()) call y%sync()
    end if
    call a%psb_s_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans) 
    call y%set_host()
  else
    if (a%is_host()) call a%sync()    
    select type (xx => x) 
    type is (psb_s_vect_cuda)
      select type(yy => y) 
      type is (psb_s_vect_cuda)
        ! Both operands are CUDA vectors: run the product on the device.
        if (a%is_host()) call a%sync()
        if (xx%is_host()) call xx%sync()
        if (beta /= szero) then 
          if (yy%is_host()) call yy%sync()
        end if
        info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
             & beta,yy%deviceVect)
        if (info /= 0) then 
          call psb_errpush(psb_err_from_subroutine_ai_,name,&
               & a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/))
          info = psb_err_from_subroutine_ai_
          goto 9999
        end if
        call yy%set_dev()
      class default
        ! x is a CUDA vector but y is not: work on plain array copies.
        if (a%is_dev()) call a%sync()
        rx = xx%get_vect()
        ry = y%get_vect()
        call a%spmm(alpha,rx,beta,ry,info)
        call y%bld(ry)
      end select
    class default
      ! x is not a CUDA vector: work on plain array copies.
      if (a%is_dev()) call a%sync()
      rx = x%get_vect()
      ry = y%get_vect()
      call a%spmm(alpha,rx,beta,ry,info)
      call y%bld(ry)
    end select
  end if
  if (info /= 0) goto 9999
  call psb_erractionrestore(err_act)
  return
 9999 call psb_error_handler(err_act)
  return
 end subroutine psb_s_cuda_elg_multivect_mv
--- a/cuda/impl/psb_z_cuda_elg_csmm.F90
+++ b/cuda/impl/psb_z_cuda_elg_csmm.F90
@ -98,16 +98,16 @@ subroutine psb_z_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
    if (info == 0) &
         & info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_double)
    if (info == 0) &
-         & info = writeMultiVecDevice(gpX,x,nxy)
+         & info = writeMultiVecDevice(gpX,x,size(x,1))
    if (info == 0) &
         & info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_double)
    if (info == 0) &
-         & info = writeMultiVecDevice(gpY,y,nxy)
+         & info = writeMultiVecDevice(gpY,y,size(y,1))
    if (info == 0)  &
-         & info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
+         & info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
    if (info == 0) &
-         & info = readMultiVecDevice(gpY,y,nxy)
+         & info = readMultiVecDevice(gpY,y,size(y,1))
    if (info /= 0) goto 9999
    call freeMultiVecDevice(gpX)
    call freeMultiVecDevice(gpY)
--- a/cuda/impl/psb_z_cuda_elg_vect_mv.F90
+++ b/cuda/impl/psb_z_cuda_elg_vect_mv.F90
@ -119,3 +119,94 @@ subroutine psb_z_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans)
  return
 end subroutine psb_z_cuda_elg_vect_mv
 !
 ! psb_z_cuda_elg_multivect_mv
 !
 ! Product of an ELG matrix with x, accumulated into y, dispatched on where
 ! the data currently live:
 !   - trans = 'T' or 'C': matrix and vectors are synced as needed and the
 !     parent psb_z_ell_sparse_mat%spmm is used; y is then flagged with
 !     set_host().
 !   - otherwise, when both x and y are psb_z_vect_cuda, spmmEllDevice is
 !     called on the device handles and y is flagged with set_dev().
 !   - any other vector type: a%spmm is applied to copies obtained with
 !     get_vect() and y is rebuilt from the result with bld().
 !
 ! Arguments
 !   alpha, beta  scalars forwarded to the product routines
 !   a            the ELG matrix; must be assembled (a%is_asb())
 !   x            input vector
 !   y            input/output vector
 !   info         psb_success_ on success; psb_err_invalid_mat_state_ when a
 !                is not assembled; psb_err_from_subroutine_ai_ when
 !                spmmEllDevice returns nonzero; otherwise whatever the
 !                called spmm reports
 !   trans        optional, defaults to 'N'
 !
 ! NOTE(review): the interface for this routine in psb_z_cuda_elg_mat_mod
 ! declares x and y as class(psb_z_base_multivect_type), whereas this body
 ! declares class(psb_z_base_vect_type), uses rank-1 rx/ry and selects on
 ! psb_z_vect_cuda. Confirm which set of types is intended and make the two
 ! agree.
 !
 subroutine psb_z_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans) 
  use psb_base_mod
  use elldev_mod
  use psb_vectordev_mod
  use psb_z_cuda_elg_mat_mod, psb_protect_name => psb_z_cuda_elg_multivect_mv
  use psb_z_cuda_vect_mod
  implicit none 
  class(psb_z_cuda_elg_sparse_mat), intent(in) :: a
  complex(psb_dpk_), intent(in)       :: alpha, beta
  class(psb_z_base_vect_type), intent(inout) :: x
  class(psb_z_base_vect_type), intent(inout) :: y
  integer(psb_ipk_), intent(out)             :: info
  character, optional, intent(in)  :: trans
  complex(psb_dpk_), allocatable      :: rx(:), ry(:)
  logical           :: tra
  character         :: trans_
  Integer(Psb_ipk_) :: err_act
  ! The literal is 23 characters long; with len=20 the trailing '_mv' was
  ! silently truncated in every error message.
  character(len=23) :: name='z_cuda_elg_multivect_mv'
  call psb_erractionsave(err_act)
  info = psb_success_
  if (present(trans)) then
    trans_ = trans
  else
    trans_ = 'N'
  end if
  if (.not.a%is_asb()) then 
    info = psb_err_invalid_mat_state_
    call psb_errpush(info,name)
    goto 9999
  endif
  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
  if (tra) then 
    ! Transposed products go through the parent ELL implementation.
    if (a%is_dev()) call a%sync()
    if (.not.x%is_host()) call x%sync()
    ! y is only synced when beta is nonzero -- presumably because its
    ! previous contents do not contribute otherwise.
    if (beta /= zzero) then 
      if (.not.y%is_host()) call y%sync()
    end if
    call a%psb_z_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans) 
    call y%set_host()
  else
    if (a%is_host()) call a%sync()    
    select type (xx => x) 
    type is (psb_z_vect_cuda)
      select type(yy => y) 
      type is (psb_z_vect_cuda)
        ! Both operands are CUDA vectors: run the product on the device.
        if (a%is_host()) call a%sync()
        if (xx%is_host()) call xx%sync()
        if (beta /= zzero) then 
          if (yy%is_host()) call yy%sync()
        end if
        info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
             & beta,yy%deviceVect)
        if (info /= 0) then 
          call psb_errpush(psb_err_from_subroutine_ai_,name,&
               & a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/))
          info = psb_err_from_subroutine_ai_
          goto 9999
        end if
        call yy%set_dev()
      class default
        ! x is a CUDA vector but y is not: work on plain array copies.
        if (a%is_dev()) call a%sync()
        rx = xx%get_vect()
        ry = y%get_vect()
        call a%spmm(alpha,rx,beta,ry,info)
        call y%bld(ry)
      end select
    class default
      ! x is not a CUDA vector: work on plain array copies.
      if (a%is_dev()) call a%sync()
      rx = x%get_vect()
      ry = y%get_vect()
      call a%spmm(alpha,rx,beta,ry,info)
      call y%bld(ry)
    end select
  end if
  if (info /= 0) goto 9999
  call psb_erractionrestore(err_act)
  return
 9999 call psb_error_handler(err_act)
  return
 end subroutine psb_z_cuda_elg_multivect_mv
--- a/cuda/psb_c_cuda_elg_mat_mod.F90
+++ b/cuda/psb_c_cuda_elg_mat_mod.F90
@ -56,6 +56,7 @@ module psb_c_cuda_elg_mat_mod
    procedure, nopass  :: get_fmt       => c_cuda_elg_get_fmt
    procedure, pass(a) :: sizeof        => c_cuda_elg_sizeof
    procedure, pass(a) :: vect_mv       => psb_c_cuda_elg_vect_mv
    procedure, pass(a) :: multivect_mv  => psb_c_cuda_elg_multivect_mv
    procedure, pass(a) :: csmm          => psb_c_cuda_elg_csmm
    procedure, pass(a) :: csmv          => psb_c_cuda_elg_csmv
    procedure, pass(a) :: in_vect_sv    => psb_c_cuda_elg_inner_vect_sv
@ -101,6 +102,15 @@ module psb_c_cuda_elg_mat_mod
      integer(psb_ipk_), intent(out)             :: info
      character, optional, intent(in)  :: trans
    end subroutine psb_c_cuda_elg_vect_mv
    ! Explicit interface for the multivector product bound as multivect_mv.
    ! x and y are declared here as psb_c_base_multivect_type.
    ! NOTE(review): the implementation must declare the same dummy types --
    ! confirm against the body in impl/psb_c_cuda_elg_vect_mv.F90.
    subroutine psb_c_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans) 
      import :: psb_c_cuda_elg_sparse_mat, psb_spk_, psb_c_base_multivect_type, psb_ipk_
      class(psb_c_cuda_elg_sparse_mat), intent(in) :: a
      complex(psb_spk_), intent(in)       :: alpha, beta
      class(psb_c_base_multivect_type), intent(inout) :: x
      class(psb_c_base_multivect_type), intent(inout) :: y
      integer(psb_ipk_), intent(out)             :: info
      character, optional, intent(in)  :: trans
    end subroutine psb_c_cuda_elg_multivect_mv
  end interface
  interface 
--- a/cuda/psb_s_cuda_elg_mat_mod.F90
+++ b/cuda/psb_s_cuda_elg_mat_mod.F90
@ -56,6 +56,7 @@ module psb_s_cuda_elg_mat_mod
    procedure, nopass  :: get_fmt       => s_cuda_elg_get_fmt
    procedure, pass(a) :: sizeof        => s_cuda_elg_sizeof
    procedure, pass(a) :: vect_mv       => psb_s_cuda_elg_vect_mv
    procedure, pass(a) :: multivect_mv  => psb_s_cuda_elg_multivect_mv
    procedure, pass(a) :: csmm          => psb_s_cuda_elg_csmm
    procedure, pass(a) :: csmv          => psb_s_cuda_elg_csmv
    procedure, pass(a) :: in_vect_sv    => psb_s_cuda_elg_inner_vect_sv
@ -101,6 +102,15 @@ module psb_s_cuda_elg_mat_mod
      integer(psb_ipk_), intent(out)             :: info
      character, optional, intent(in)  :: trans
    end subroutine psb_s_cuda_elg_vect_mv
    ! Explicit interface for the multivector product bound as multivect_mv.
    ! x and y are declared here as psb_s_base_multivect_type.
    ! NOTE(review): the implementation must declare the same dummy types --
    ! confirm against the body in impl/psb_s_cuda_elg_vect_mv.F90.
    subroutine psb_s_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans) 
      import :: psb_s_cuda_elg_sparse_mat, psb_spk_, psb_s_base_multivect_type, psb_ipk_
      class(psb_s_cuda_elg_sparse_mat), intent(in) :: a
      real(psb_spk_), intent(in)       :: alpha, beta
      class(psb_s_base_multivect_type), intent(inout) :: x
      class(psb_s_base_multivect_type), intent(inout) :: y
      integer(psb_ipk_), intent(out)             :: info
      character, optional, intent(in)  :: trans
    end subroutine psb_s_cuda_elg_multivect_mv
  end interface
  interface 
--- a/cuda/psb_z_cuda_elg_mat_mod.F90
+++ b/cuda/psb_z_cuda_elg_mat_mod.F90
@ -56,6 +56,7 @@ module psb_z_cuda_elg_mat_mod
    procedure, nopass  :: get_fmt       => z_cuda_elg_get_fmt
    procedure, pass(a) :: sizeof        => z_cuda_elg_sizeof
    procedure, pass(a) :: vect_mv       => psb_z_cuda_elg_vect_mv
    procedure, pass(a) :: multivect_mv  => psb_z_cuda_elg_multivect_mv
    procedure, pass(a) :: csmm          => psb_z_cuda_elg_csmm
    procedure, pass(a) :: csmv          => psb_z_cuda_elg_csmv
    procedure, pass(a) :: in_vect_sv    => psb_z_cuda_elg_inner_vect_sv
@ -101,6 +102,15 @@ module psb_z_cuda_elg_mat_mod
      integer(psb_ipk_), intent(out)             :: info
      character, optional, intent(in)  :: trans
    end subroutine psb_z_cuda_elg_vect_mv
    ! Explicit interface for the multivector product bound as multivect_mv.
    ! x and y are declared here as psb_z_base_multivect_type.
    ! NOTE(review): the implementation must declare the same dummy types --
    ! confirm against the body in impl/psb_z_cuda_elg_vect_mv.F90.
    subroutine psb_z_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans) 
      import :: psb_z_cuda_elg_sparse_mat, psb_dpk_, psb_z_base_multivect_type, psb_ipk_
      class(psb_z_cuda_elg_sparse_mat), intent(in) :: a
      complex(psb_dpk_), intent(in)       :: alpha, beta
      class(psb_z_base_multivect_type), intent(inout) :: x
      class(psb_z_base_multivect_type), intent(inout) :: y
      integer(psb_ipk_), intent(out)             :: info
      character, optional, intent(in)  :: trans
    end subroutine psb_z_cuda_elg_multivect_mv
  end interface
  interface 
--- a/cuda/spgpu/ell.h
+++ b/cuda/spgpu/ell.h
@ -70,6 +70,51 @@ void spgpuSellspmv (spgpuHandle_t handle,
 	float beta,
 	int baseIndex);
 /** 
 * \fn void spgpuSellspmm (spgpuHandle_t handle,int count,__device float *z,int zpitch,const __device float *y,int ypitch,float alpha, const __device float* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device float *x,int xpitch,float beta,int baseIndex)
 * Computes single precision z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
 * \param handle The spgpu handle used to call this routine
 * \param count The number of columns (i.e. vectors) in the x, y and z operands
 * \param z The output of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
 * \param zpitch The pitch of the output z
 * \param y The y input
 * \param ypitch The pitch of the y input
 * \param alpha The alpha scalar
 * \param cM The ELL non zero values allocation pointer
 * \param rP The ELL column indices allocation pointer
 * \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
 * \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices
 * \param rS the array containing the row sizes (in non zero elements)
 * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the ELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the ELL format).
 * \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
 * \param maxNnzPerRow Maximum number of non zeroes per row.
 * \param rows the rows count
 * \param x the x input
 * \param xpitch The pitch of the x input
 * \param beta the beta scalar
 * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
 */
 void spgpuSellspmm(spgpuHandle_t handle,
 	int count,
 	__device float *z,
 	int zpitch,
 	const __device float *y,
 	int ypitch,
 	float alpha, 
 	const __device float* cM, 
 	const __device int* rP,
    int cMPitch,
    int rPPitch,
 	const __device int* rS,
 	const __device int* rIdx, 
    int avgNnzPerRow,
    int maxNnzPerRow,
 	int rows, 
 	const __device float *x,
 	int xpitch,
 	float beta,
 	int baseIndex);
 /** 
 * \fn void spgpuDellspmv (spgpuHandle_t handle,__device double *z,const __device double *y, double alpha, const __device double* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device double *x, double beta,int baseIndex)
 * Computes double precision z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
@ -107,6 +152,50 @@ void spgpuDellspmv (spgpuHandle_t handle,
 	double beta,
 	int baseIndex);
 /** 
 * \fn void spgpuDellspmm (spgpuHandle_t handle, int count, __device double *z, int zpitch, const __device double *y, int ypitch, double alpha, const __device double* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device double *x, int xpitch, double beta, int baseIndex)
 * Computes double precision z = alpha*A*x + beta*y for each of the count columns of x, y and z, with A stored in ELLpack Format on GPU.
 * \param handle The spgpu handle used to call this routine
 * \param count The number of columns of x, y and z
 * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
 * \param zpitch The pitch (in number of elements) between consecutive columns of the output vector
 * \param y The y input vector
 * \param ypitch The pitch (in number of elements) between consecutive columns of the y input vector
 * \param alpha The alpha scalar
 * \param cM The ELL non zero values allocation pointer
 * \param rP The ELL column indices allocation pointer
 * \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
 * \param rPPitch  the pitch (in number of elements) of the allocation containing the matrix non zero column indices
 * \param rS the array containing the row sizes (in non zero elements)
 * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format). Note: the implementation in ell_spmm_base.cuh accepts this argument but does not pass it to its kernel.
 * \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
 * \param maxNnzPerRow Maximum number of non zeroes per row.
 * \param rows the rows count
 * \param x the x vector
 * \param xpitch The pitch (in number of elements) between consecutive columns of the x input vector
 * \param beta the beta scalar
 * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
 */
 void spgpuDellspmm(spgpuHandle_t handle,
 	int count,
 	__device double *z,
 	int zpitch,
 	const __device double *y,
 	int ypitch,
 	double alpha, 
 	const __device double* cM, 
 	const __device int* rP,
    int cMPitch,
    int rPPitch,
 	const __device int* rS,
 	const __device int* rIdx, 
    int avgNnzPerRow,
    int maxNnzPerRow,
 	int rows, 
 	const __device double *x,
 	int xpitch,
 	double beta,
 	int baseIndex);
 /** 
 * \fn void spgpuCellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex)
@ -145,6 +234,51 @@ void spgpuCellspmv (spgpuHandle_t handle,
 	cuFloatComplex beta,
 	int baseIndex);
 /** 
 * \fn void spgpuCellspmm (spgpuHandle_t handle,int count,__device cuFloatComplex *z,int zpitch,const __device cuFloatComplex *y,int ypitch,cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device cuFloatComplex *x,int xpitch,cuFloatComplex beta,int baseIndex)
 * Computes single precision complex z = alpha*A*x + beta*y for each of the count columns of x, y and z, with A stored in ELLpack Format on GPU.
 * \param handle The spgpu handle used to call this routine
 * \param count The number of columns of x, y and z
 * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
 * \param zpitch The pitch (in number of elements) between consecutive columns of the output vector
 * \param y The y input vector
 * \param ypitch The pitch (in number of elements) between consecutive columns of the y input vector
 * \param alpha The alpha scalar
 * \param cM The ELL non zero values allocation pointer
 * \param rP The ELL column indices allocation pointer
 * \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
 * \param rPPitch  the pitch (in number of elements) of the allocation containing the matrix non zero column indices
 * \param rS the array containing the row sizes (in non zero elements)
 * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format). Note: the implementation in ell_spmm_base.cuh accepts this argument but does not pass it to its kernel.
 * \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
 * \param maxNnzPerRow Maximum number of non zeroes per row.
 * \param rows the rows count
 * \param x the x vector
 * \param xpitch The pitch (in number of elements) between consecutive columns of the x input vector
 * \param beta the beta scalar
 * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
 */
 void spgpuCellspmm(spgpuHandle_t handle,
 	int count,
 	__device cuFloatComplex *z,
 	int zpitch,
 	const __device cuFloatComplex *y,
 	int ypitch,
 	cuFloatComplex alpha, 
 	const __device cuFloatComplex* cM, 
 	const __device int* rP,
    int cMPitch,
    int rPPitch,
 	const __device int* rS,
 	const __device int* rIdx, 
    int avgNnzPerRow,
    int maxNnzPerRow,
 	int rows, 
 	const __device cuFloatComplex *x,
 	int xpitch,
 	cuFloatComplex beta,
 	int baseIndex);
 /** 
 * \fn void spgpuZellspmv (spgpuHandle_t handle,__device cuDoubleComplex *z,const __device cuDoubleComplex *y, cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device cuDoubleComplex *x, cuDoubleComplex beta, int baseIndex)
 * Computes double precision complex z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
@ -182,6 +316,50 @@ void spgpuZellspmv (spgpuHandle_t handle,
 	cuDoubleComplex beta,
 	int baseIndex);
 /** 
 * \fn void spgpuZellspmm (spgpuHandle_t handle,int count,__device cuDoubleComplex *z,int zpitch,const __device cuDoubleComplex *y,int ypitch,cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device cuDoubleComplex *x,int xpitch,cuDoubleComplex beta,int baseIndex)
 * Computes double precision complex z = alpha*A*x + beta*y for each of the count columns of x, y and z, with A stored in ELLpack Format on GPU.
 * \param handle The spgpu handle used to call this routine
 * \param count The number of columns of x, y and z
 * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
 * \param zpitch The pitch (in number of elements) between consecutive columns of the output vector
 * \param y The y input vector
 * \param ypitch The pitch (in number of elements) between consecutive columns of the y input vector
 * \param alpha The alpha scalar
 * \param cM The ELL non zero values allocation pointer
 * \param rP The ELL column indices allocation pointer
 * \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
 * \param rPPitch  the pitch (in number of elements) of the allocation containing the matrix non zero column indices
 * \param rS the array containing the row sizes (in non zero elements)
 * \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format). Note: the implementation in ell_spmm_base.cuh accepts this argument but does not pass it to its kernel.
 * \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
 * \param maxNnzPerRow Maximum number of non zeroes per row.
 * \param rows the rows count
 * \param x the x vector
 * \param xpitch The pitch (in number of elements) between consecutive columns of the x input vector
 * \param beta the beta scalar
 * \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
 */
 void spgpuZellspmm(spgpuHandle_t handle,
 	int count,
 	__device cuDoubleComplex *z,
 	int zpitch,
 	const __device cuDoubleComplex *y,
 	int ypitch,
 	cuDoubleComplex alpha, 
 	const __device cuDoubleComplex* cM, 
 	const __device int* rP,
    int cMPitch,
    int rPPitch,
 	const __device int* rS,
 	const __device int* rIdx, 
    int avgNnzPerRow,
    int maxNnzPerRow,
 	int rows, 
 	const __device cuDoubleComplex *x,
 	int xpitch,
 	cuDoubleComplex beta,
 	int baseIndex);
 /** 
 * \fn void spgpuSellcsput (spgpuHandle_t handle, float alpha, __device float *cM, __device const int* rP, int cMPitch, int rPPitch, __device const int* rS, int nnz, __device int *aI, __device int *aJ, __device float *aVal, int baseIndex)
--- a/cuda/spgpu/kernels/Makefile
+++ b/cuda/spgpu/kernels/Makefile
@ -14,8 +14,9 @@ OBJS=cabs.o  camax.o  casum.o  caxpby.o  caxy.o  cdot.o  cgath.o \
  cnrm2.o cscal.o cscat.o csetscal.o cabgdxyz.o\
  dabs.o damax.o dasum.o daxpby.o daxy.o ddot.o dgath.o dabgdxyz.o\
  dia_cspmv.o dia_dspmv.o dia_sspmv.o dia_zspmv.o dnrm2.o \
-  dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o \
+  dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o ell_dcsput.o \
-  ell_dcsput.o ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \
+  ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \
  ell_cspmm.o ell_dspmm.o ell_sspmm.o ell_zspmm.o \
  hdia_cspmv.o hdia_dspmv.o hdia_sspmv.o hdia_zspmv.o hell_cspmv.o hell_dspmv.o \
  hell_sspmv.o hell_zspmv.o hell_cspmm.o hell_dspmm.o hell_sspmm.o hell_zspmm.o \
  igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
--- a/cuda/spgpu/kernels/ell_cspmm.cu
+++ b/cuda/spgpu/kernels/ell_cspmm.cu
@ -0,0 +1,35 @@
 /*
 * spGPU - Sparse matrices on GPU library.
 * 
 * Copyright (C) 2010 - 2014
 *     Davide Barbieri - University of Rome Tor Vergata
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 3 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
 // Single precision complex instantiation of the generic ELL SpMM code.
 #include "cudadebug.h"
 #include "cudalang.h"
 #include "cuComplex.h"
 extern "C"
 {
 #include "core.h"
 #include "ell.h"
  int getGPUSharedMemPerBlock();
  int getGPUMultiProcessors();
  int getGPUMaxThreadsPerMP();
 }
 #include "debug.h"
 // Parameters read by ell_spmm_base.cuh: VALUE_TYPE is the element type and
 // TYPE_SYMBOL the letter pasted into the generated names (spgpuCellspmm).
 #define VALUE_TYPE cuFloatComplex
 #define TYPE_SYMBOL C
 // TEX_FETCH_TYPE is only referenced by the texture code that
 // ell_spmm_base.cuh keeps disabled under "#if 0".
 #define TEX_FETCH_TYPE cuFloatComplex
 #include "ell_spmm_base.cuh"
--- a/cuda/spgpu/kernels/ell_dspmm.cu
+++ b/cuda/spgpu/kernels/ell_dspmm.cu
@ -0,0 +1,35 @@
 /*
 * spGPU - Sparse matrices on GPU library.
 * 
 * Copyright (C) 2010 - 2014
 *     Davide Barbieri - University of Rome Tor Vergata
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 3 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
 // Double precision instantiation of the generic ELL SpMM code.
 #include "cudadebug.h"
 #include "cudalang.h"
 #include <stdio.h>
 extern "C"
 {
 #include "core.h"
 #include "hell.h"
 // ell.h is included by the complex variants (ell_cspmm.cu / ell_zspmm.cu).
 // spgpuDellspmm is defined in ell_spmm_base.cuh outside any extern "C"
 // block, so its prototype has to be seen here for the definition to get
 // C linkage. NOTE(review): hell.h is kept as-is; drop it if nothing in
 // this translation unit needs it.
 #include "ell.h"
  int getGPUSharedMemPerBlock();
  int getGPUMultiProcessors();
  int getGPUMaxThreadsPerMP();
 }
 #include "debug.h"
 // Parameters read by ell_spmm_base.cuh: VALUE_TYPE is the element type and
 // TYPE_SYMBOL the letter pasted into the generated names (spgpuDellspmm).
 #define VALUE_TYPE double
 #define TYPE_SYMBOL D
 // TEX_FETCH_TYPE is only referenced by the texture code that
 // ell_spmm_base.cuh keeps disabled under "#if 0".
 #define TEX_FETCH_TYPE int2
 #include "ell_spmm_base.cuh"
--- a/cuda/spgpu/kernels/ell_spmm_base.cuh
+++ b/cuda/spgpu/kernels/ell_spmm_base.cuh
@ -0,0 +1,200 @@
 /*
 * spGPU - Sparse matrices on GPU library.
 * 
 * Copyright (C) 2010 - 2015
 *     Davide Barbieri - University of Rome Tor Vergata
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 3 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
 // Two-level token pasting so that macro arguments are expanded before ##.
 #define PRE_CONCAT(A, B) A ## B
 #define CONCAT(A, B) PRE_CONCAT(A, B)
 #undef GEN_SPGPU_ELL_NAME
 #undef X_TEX
 #define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
 // ---- Single precision helpers, selected through CONCAT(..., VALUE_TYPE) ----
 // Additive identity for float.
 __device__ __host__ static float zero_float()
 {
 	return 0.0f;
 }
 // Additive identity for cuFloatComplex (both parts zero).
 __device__ __host__ static cuFloatComplex zero_cuFloatComplex()
 {
 	return make_cuFloatComplex(0.0, 0.0);
 }
 // True when x compares different from zero.
 __device__ __host__ static bool float_isNotZero(float x)
 {
 	return !(x == 0.0f);
 }
 // a*b + c, built from the PREC_FMUL / PREC_FADD macros (defined outside this file).
 __device__ static float float_fma(float a, float b, float c)
 {
 	return PREC_FADD(PREC_FMUL (a, b), c);
 }
 // a + b through PREC_FADD.
 __device__ static float float_add(float a, float b)
 {
 	return PREC_FADD (a, b);
 }
 // a * b through PREC_FMUL.
 __device__ static float float_mul(float a, float b)
 {
 	return PREC_FMUL (a, b);
 }
 // Complex a*b + c, delegated to cuCfmaf.
 __device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c)
 {
 	return cuCfmaf(a, b, c);
 }
 // Complex a + b, delegated to cuCaddf.
 __device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b)
 {
 	return cuCaddf(a, b);
 }
 // Complex a * b, delegated to cuCmulf.
 __device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b)
 {
 	return cuCmulf(a, b);
 }
 // Converts a fetched float into a float value (returned unchanged).
 __device__ static float readValue_float(float fetch)
 {
 	return fetch;
 }
 // Converts a fetched cuFloatComplex into a value (returned unchanged).
 __device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch)
 {
 	return fetch;
 }
 // ---- Double precision helpers: compiled for the host pass or for c.c. >= 1.3 ----
 #if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)
 // Additive identity for double.
 __device__ __host__ static double zero_double()
 {
 	return 0.0;
 }
 // Additive identity for cuDoubleComplex (both parts zero).
 __device__ __host__ static cuDoubleComplex zero_cuDoubleComplex()
 {
 	return make_cuDoubleComplex(0.0, 0.0);
 }
 // True when x compares different from zero.
 __device__ __host__ static bool double_isNotZero(double x)
 {
 	return !(x == 0.0);
 }
 // a*b + c, built from the PREC_DMUL / PREC_DADD macros (defined outside this file).
 __device__ static double double_fma(double a, double b, double c)
 {
 	return PREC_DADD(PREC_DMUL (a, b), c);
 }
 // a + b through PREC_DADD.
 __device__ static double double_add(double a, double b)
 {
 	return PREC_DADD (a, b);
 }
 // a * b through PREC_DMUL.
 __device__ static double double_mul(double a, double b)
 {
 	return PREC_DMUL (a, b);
 }
 // Complex a*b + c, delegated to cuCfma.
 __device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c)
 {
 	return cuCfma(a, b, c);
 }
 // Complex a + b, delegated to cuCadd.
 __device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b)
 {
 	return cuCadd(a, b);
 }
 // Complex a * b, delegated to cuCmul.
 __device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b)
 {
 	return cuCmul(a, b);
 }
 // Rebuilds a double from an int2 fetch: .y is passed as the high word, .x as the low word.
 __device__ static double readValue_double(int2 fetch)
 {
 	return __hiloint2double (fetch.y, fetch.x);
 }
 // Rebuilds a cuDoubleComplex from an int4 fetch: (.y,.x) give the real part,
 // (.w,.z) the imaginary part, each as (high, low) words.
 __device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch) 
 {
 	const double realPart = __hiloint2double (fetch.y, fetch.x);
 	const double imagPart = __hiloint2double (fetch.w, fetch.z);
 	return make_cuDoubleComplex(realPart, imagPart);
 }
 #endif
 // Disabled: texture-based read path for x (texture reference, bind/unbind
 // macros and a fetch helper built on readValue_<VALUE_TYPE>).
 #if 0
 // Texture cache management
 texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
 #define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
 #define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
 __device__ static VALUE_TYPE 
 fetchTex (int pointer)
 {
 	TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
 	return CONCAT(readValue_,VALUE_TYPE) (fetch);
 }
 #endif
 // Instantiate the template once under the "_vanilla" name: with TYPE_SYMBOL
 // pasted in, GEN_SPGPU_ELL_NAME(x) expands to spgpu<x>ellspmm_vanilla while
 // ell_spmm_base_template.cuh is being included.
 // GEN_SPGPU_ELL_NAME_VANILLA keeps that name available after
 // GEN_SPGPU_ELL_NAME is redefined below.
 #define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_vanilla)
 #define GEN_SPGPU_ELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),ellspmm_vanilla)
 #include "ell_spmm_base_template.cuh"
 // Disabled: additional instantiations of the same template under the
 // _prefetch, _texcache_prefetch and _texcache names.
 #if 0
 #undef GEN_SPGPU_ELL_NAME
 #define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_prefetch)
 #define GEN_SPGPU_ELL_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),ellspmm_prefetch)
 #undef USE_PREFETCHING
 #define USE_PREFETCHING
 #include "ell_spmm_base_template.cuh"
 #define ENABLE_CACHE
 #undef GEN_SPGPU_ELL_NAME
 #define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache_prefetch)
 #define GEN_SPGPU_ELL_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache_prefetch)
 #include "ell_spmm_base_template.cuh"
 #undef GEN_SPGPU_ELL_NAME
 #undef USE_PREFETCHING
 #define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache)
 #define GEN_SPGPU_ELL_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache)
 #include "ell_spmm_base_template.cuh"
 #endif
 // From here on GEN_SPGPU_ELL_NAME(x) expands to the public name spgpu<x>ellspmm.
 #undef GEN_SPGPU_ELL_NAME
 #define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm)
 /*
  * Host entry point spgpu<TYPE_SYMBOL>ellspmm: z = alpha*A*x + beta*y for the
  * `count` columns of x, y and z, with A stored in ELL format.
  *
  * The work is tiled in two directions before calling the "_vanilla" launcher:
  *   - rows:    at most maxNForACall rows per launch; after each row chunk the
  *              row-indexed arrays (y, z, cM, rP, rS) are advanced by the
  *              number of rows just processed. x is not advanced: it is
  *              indexed by the column indices stored in rP.
  *   - columns: at most MMBSZ columns per launch (the size of the kernel's
  *              accumulator); x, y and z are advanced by pitch * columns.
  *
  * rIdx, avgNnzPerRow and maxNnzPerRow are forwarded to the launcher unchanged.
  * Launch errors are reported once, at the end, through cudaCheckError.
  */
 void
 GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
 (spgpuHandle_t handle,
 	int count,
 	VALUE_TYPE* z,
 	int zPitch,
 	const VALUE_TYPE *y,
 	int yPitch,
 	VALUE_TYPE alpha, 
 	const VALUE_TYPE* cM, 
 	const int* rP,
    int cMPitch,
    int rPPitch,
 	const __device int* rS,
 	const __device int* rIdx, 
    int avgNnzPerRow,
    int maxNnzPerRow,
 	int rows, 
 	const VALUE_TYPE *x,
 	int xPitch,
 	VALUE_TYPE beta,
 	int baseIndex)
 {
  // Upper bound on the rows given to a single launch.
  // NOTE(review): the max() presumably protects against the product
  // wrapping around for very large maxGridSizeX -- confirm.
  int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
  while (rows > 0) {
    // Rows handled by this pass: a full chunk, or whatever is left.
    const int chunkRows = (rows > maxNForACall) ? maxNForACall : rows;
    const VALUE_TYPE *px = x;
    const VALUE_TYPE *py = y;
    VALUE_TYPE *pz = z;
    int cnt = count;
    while (cnt > 0) {
      // Columns handled by this launch: never more than MMBSZ.
      const int blockCols = (cnt > MMBSZ) ? MMBSZ : cnt;
      CONCAT(_,GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, blockCols, pz, zPitch,
 							  py, yPitch,
 							  alpha, cM, rP,
 							  cMPitch, rPPitch,
 							  rS, rIdx, avgNnzPerRow,
 							  maxNnzPerRow, chunkRows,
 							  px, xPitch, beta, baseIndex);
      px += xPitch*blockCols;
      py += yPitch*blockCols;
      pz += zPitch*blockCols;
      cnt -= blockCols;
    }
    // Move every row-indexed array past the rows just processed.
    y = y + chunkRows;
    z = z + chunkRows;
    cM = cM + chunkRows;
    rP = rP + chunkRows;
    rS = rS + chunkRows;
    rows -= chunkRows;
  }
  cudaCheckError("CUDA error on ell_spmm");
 }
--- a/cuda/spgpu/kernels/ell_spmm_base_template.cuh
+++ b/cuda/spgpu/kernels/ell_spmm_base_template.cuh
@ -0,0 +1,104 @@
 /* 
 * spGPU - Sparse matrices on GPU library.
 * 
 * Copyright (C) 2010 - 2015
 *     Davide Barbieri - University of Rome Tor Vergata
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 3 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
 // NOTE(review): IDX2 is not referenced by the code in this template.
 #define IDX2
 // Threads per block; the kernel maps one thread to one matrix row.
 #define THREAD_BLOCK 128
 // Maximum number of x/y/z columns one launch can process (first dimension
 // of the kernel's accumulator array).
 #define MMBSZ 8
 /*
  * ELL SpMM kernel: z = alpha*A*x + beta*y for `count` columns.
  *
  * Launch layout: 1D grid of 1D blocks of THREAD_BLOCK threads; thread
  * i = threadIdx.x + blockIdx.x*THREAD_BLOCK handles row i and does nothing
  * when i >= rows.
  * Shared memory: a static MMBSZ x THREAD_BLOCK array of VALUE_TYPE; no
  * dynamic shared memory is used.
  * Precondition: count <= MMBSZ (larger values would index past the
  * accumulator array).
  *
  * Matrix layout as used here: entry j of row i is read from cM[i + j*cMPitch]
  * and its column index from rP[i + j*rPPitch]; rS[i] is the number of
  * entries of row i; baseIndex is subtracted from every column index.
  * Column k of x, y and z starts at x + k*xPitch, y + k*yPitch, z + k*zPitch.
  */
 __global__ void
 CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
  (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
   int cMPitch, int rPPitch, const int* rS, int rows,
   const VALUE_TYPE *x, int xPitch, 
   VALUE_TYPE beta, int baseIndex)
 {
  // One accumulator per column and per thread. Thread t only ever touches
  // temp[k][t], so no __syncthreads() is required.
  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
  const int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
  if (i < rows) {
    // Position on the first stored entry of row i; the following entries of
    // the same row are one pitch apart.
    rP += i;
    cM += i;
    const int rowSize = rS[i];
    for (int k=0; k<count; k++) {
      temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
    }
    for (int j = 0; j < rowSize; j++) {
      const int pointer = rP[0] - baseIndex;
      const VALUE_TYPE value = cM[0];
      rP += rPPitch;
      cM += cMPitch;
      // The same matrix entry is applied to every column of x.
      const VALUE_TYPE *px = x;
      for (int k=0; k<count; k++) {
        temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, px[pointer], temp[k][threadIdx.x]);
        px += xPitch;
      }
    }
    // Since z and y are accessed with the same offset by the same thread,
    // and the write to z follows the y read, y and z can share the same base address (in-place computing).
    const VALUE_TYPE *py = y;
    VALUE_TYPE *pz = z;
    if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
      for (int k=0; k<count; k++) {
        const VALUE_TYPE yVal = py[i];
        pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]));
        py += yPitch;
        pz += zPitch;
      }
    } else {
      // beta == 0: y is never read.
      for (int k=0; k<count; k++) {
        pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]);
        pz += zPitch;
      }
    }
  }
 }
 /*
  * Host launcher for the ELL SpMM kernel generated above.
  *
  * Launches ceil(rows / THREAD_BLOCK) blocks of THREAD_BLOCK threads on
  * handle->currentStream. Nothing is launched when rows <= 0 or count <= 0
  * (a zero-block grid is not a valid launch configuration, and with
  * count <= 0 the kernel would not write anything).
  * Callers must pass count <= MMBSZ.
  * rIdx, avgNnzPerRow and maxNnzPerRow are accepted but not passed to the
  * kernel. No error check is done here; the caller is expected to check.
  */
 void
 CONCAT(_,GEN_SPGPU_ELL_NAME(TYPE_SYMBOL))
  (spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch,
   const int* rS,  const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows,
   const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
 {
  if (rows <= 0 || count <= 0)
    return;
  dim3 block (THREAD_BLOCK, 1);
  // Ceiling division written so that it cannot overflow for large row counts.
  dim3 grid (rows / THREAD_BLOCK + ((rows % THREAD_BLOCK) != 0 ? 1 : 0));
  // Should we generalize the code to 1/2/4/8 threads per row?
  // And maybe adjust THREAD_BLOCK size? 
  // The kernel's accumulator is a statically sized __shared__ array, so no
  // dynamic shared memory is requested.
  CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn) 
    <<< grid, block, 0, handle->currentStream >>> (count, z, zPitch, y, yPitch,
 						   alpha, cM, rP, cMPitch, rPPitch, rS, rows,
 						   x, xPitch, beta, baseIndex);
 }
--- a/cuda/spgpu/kernels/ell_sspmm.cu
+++ b/cuda/spgpu/kernels/ell_sspmm.cu
@ -0,0 +1,34 @@
 /*
 * spGPU - Sparse matrices on GPU library.
 * 
 * Copyright (C) 2010 - 2014
 *     Davide Barbieri - University of Rome Tor Vergata
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 3 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
 // Single precision instantiation of the generic ELL SpMM code.
 #include "cudadebug.h"
 #include "cudalang.h"
 extern "C"
 {
 #include "core.h"
 #include "hell.h"
 // ell.h is included by the complex variants (ell_cspmm.cu / ell_zspmm.cu).
 // spgpuSellspmm is defined in ell_spmm_base.cuh outside any extern "C"
 // block, so its prototype has to be seen here for the definition to get
 // C linkage. NOTE(review): hell.h is kept as-is; drop it if nothing in
 // this translation unit needs it.
 #include "ell.h"
  int getGPUSharedMemPerBlock();
  int getGPUMultiProcessors();
  int getGPUMaxThreadsPerMP();
 }
 #include "debug.h"
 // Parameters read by ell_spmm_base.cuh: VALUE_TYPE is the element type and
 // TYPE_SYMBOL the letter pasted into the generated names (spgpuSellspmm).
 #define VALUE_TYPE float
 #define TYPE_SYMBOL S
 // TEX_FETCH_TYPE is only referenced by the texture code that
 // ell_spmm_base.cuh keeps disabled under "#if 0".
 #define TEX_FETCH_TYPE float
 #include "ell_spmm_base.cuh"
--- a/cuda/spgpu/kernels/ell_zspmm.cu
+++ b/cuda/spgpu/kernels/ell_zspmm.cu
@ -0,0 +1,35 @@
 /*
 * spGPU - Sparse matrices on GPU library.
 * 
 * Copyright (C) 2010 - 2014
 *     Davide Barbieri - University of Rome Tor Vergata
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 3 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
 // Double precision complex instantiation of the generic ELL SpMM code.
 #include "cudadebug.h"
 #include "cudalang.h"
 #include "cuComplex.h"
 extern "C"
 {
 #include "core.h"
 #include "ell.h"
  int getGPUSharedMemPerBlock();
  int getGPUMultiProcessors();
  int getGPUMaxThreadsPerMP();
 }
 #include "debug.h"
 // Parameters read by ell_spmm_base.cuh: VALUE_TYPE is the element type and
 // TYPE_SYMBOL the letter pasted into the generated names (spgpuZellspmm).
 #define VALUE_TYPE cuDoubleComplex
 #define TYPE_SYMBOL Z
 // TEX_FETCH_TYPE is only referenced by the texture code that
 // ell_spmm_base.cuh keeps disabled under "#if 0".
 #define TEX_FETCH_TYPE int4
 #include "ell_spmm_base.cuh"