Read/Write multivect fixed (SpMM bug)

10 months ago · ee140bc8dd
parent a624b7098b
commit ee140bc8dd
12 changed files with 252 additions and 74 deletions
--- a/base/modules/serial/psb_d_base_vect_mod.F90
+++ b/base/modules/serial/psb_d_base_vect_mod.F90
@ -3400,7 +3400,7 @@ contains
    implicit none
    class(psb_d_base_multivect_type), intent(inout) :: x
    integer(psb_ipk_), intent(in) :: nr
-    real(psb_dpk_), allocatable :: res
+    real(psb_dpk_)    :: res
    integer(psb_ipk_) :: j, nc

    if (x%is_dev()) call x%sync()
--- a/cuda/cuda_util.c
+++ b/cuda/cuda_util.c
@ -181,6 +181,20 @@ int writeRemoteBuffer(void* hostSrc, void* buffer, int count)
  }
 }

+// TODO
+int writeRemoteBufferR2(void* hostSrc, void* buffer, int count, int pitch, int size)
+{
+  cudaError_t err = cudaMemcpy2D(buffer, pitch, hostSrc, count, count, size, cudaMemcpyHostToDevice);
+
+  if (err == cudaSuccess)
+    return SPGPU_SUCCESS;	
+  else {
+    fprintf(stderr,"CUDA Error writeRemoteBuffer: %s  %p %p %d %d %d\n", 
+	    cudaGetErrorString(err),buffer, hostSrc, count, pitch, size);
+    return SPGPU_UNSPECIFIED;
+  }
+}
+
 int readRemoteBuffer(void* hostDest, void* buffer, int count)
 {

@ -206,6 +220,20 @@ int readRemoteBuffer(void* hostDest, void* buffer, int count)
  }
 }

+// TODO sistemare pitch e size (si possono gestire senza realloc su fortran)
+int readRemoteBufferR2(void* hostDest, void* buffer, int count, int pitch, int size)
+{
+  cudaError_t err = cudaMemcpy2D(hostDest, count, buffer, pitch, count, size, cudaMemcpyDeviceToHost);
+
+  if (err == cudaSuccess)
+    return SPGPU_SUCCESS;	
+  else {
+    fprintf(stderr,"CUDA Error readRemoteBuffer: %s %p  %p %d %d\n", 
+	    cudaGetErrorString(err),hostDest,buffer,count,err);
+    return SPGPU_UNSPECIFIED;
+  }
+}
+
 int freeRemoteBuffer(void* buffer)
 {
  cudaError_t err = cudaFree(buffer);
--- a/cuda/cuda_util.h
+++ b/cuda/cuda_util.h
@ -49,7 +49,9 @@ int allocMappedMemory(void **buffer, void **dp, int size);
 int registerMappedMemory(void *buffer, void **dp, int size);
 int unregisterMappedMemory(void *buffer);
 int writeRemoteBuffer(void* hostSrc, void* buffer, int count);
+int writeRemoteBufferR2(void* hostSrc, void* buffer, int count, int pitch, int size);
 int readRemoteBuffer(void* hostDest, void* buffer, int count);
+int readRemoteBufferR2(void* hostDest, void* buffer, int count, int pitch, int size);
 int freeRemoteBuffer(void* buffer);
 int gpuInit(int dev);
 int getDeviceCount();
--- a/cuda/dvectordev.c
+++ b/cuda/dvectordev.c
@ -55,7 +55,10 @@ int writeMultiVecDeviceDouble(void* deviceVec, double* hostVec)

 int writeMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld)
 { int i;
-  i = writeMultiVecDeviceDouble(deviceVec, (void *) hostVec);
+  struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec;
+  i = writeRemoteBufferR2((void*) hostVec, (void *)devVec->v_, devVec->count_*sizeof(double), devVec->pitch_, devVec->size_);
+//   i = writeMultiVecDeviceDouble(deviceVec, (void *) hostVec);
+  fprintf(stderr,"From routine : %s : %p %p\n","writeMultiVecDeviceDoubleR2",devVec->v_,devVec->v_+devVec->pitch_);
  if (i != 0) {
    fprintf(stderr,"From routine : %s : %d \n","writeMultiVecDeviceDoubleR2",i);
  }
@ -75,7 +78,10 @@ int readMultiVecDeviceDouble(void* deviceVec, double* hostVec)

 int readMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld)
 { int i;
-  i = readMultiVecDeviceDouble(deviceVec, hostVec);
+  struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec;
+  i = readRemoteBufferR2((void *) hostVec, (void *)devVec->v_, devVec->count_*sizeof(double), devVec->pitch_, devVec->size_);
+//   i = readMultiVecDeviceDouble(deviceVec, hostVec);
+  fprintf(stderr,"From routine : %s : %p \n","readMultiVecDeviceDoubleR2",devVec->v_);
  if (i != 0) {
    fprintf(stderr,"From routine : %s : %d \n","readMultiVecDeviceDoubleR2",i);
  }
@ -236,6 +242,7 @@ int axpbyMultiVecDeviceDouble(int n,double alpha, void* devMultiVecX,
    return SPGPU_UNSUPPORTED;

  for(j=0;j<devVecY->count_;j++)
+    fprintf(stderr,"CUDA ENTERED %d %d %d %d \n",j, n, pitch, devVecY->size_);
    spgpuDaxpby(handle,(double*)devVecY->v_+pitch*j, n, beta, 
 		(double*)devVecY->v_+pitch*j, alpha,(double*) devVecX->v_+pitch*j);
  return(i);
--- a/cuda/fcusparse_fct.h
+++ b/cuda/fcusparse_fct.h
@ -137,8 +137,10 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
  struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
  struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; 
  void *vX, *vY;
-  int r,n;
+  int j,r,n;
+  int pitch;
  cusparseHandle_t *my_handle=getHandle();
+  pitch = y->pitch_;
  TYPE   ealpha=alpha, ebeta=beta;
 #if CUDA_SHORT_VERSION <= 10
  /* getAddrMultiVecDevice(deviceX, &vX); */
@ -196,37 +198,69 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
 				 CUSPARSE_BASE_TYPE, (void *) cMat->mvbuffer));

 #else
-  cusparseDnVecDescr_t vecX, vecY;
+//   cusparseDnVecDescr_t vecX, vecY;
+  cusparseDnMatDescr_t vecX, vecY;
  size_t bfsz;

  if (T_CSRGIsNullMvDescr(cMat)) {
    cMat->spmvDescr = (cusparseSpMatDescr_t  *) malloc(sizeof(cusparseSpMatDescr_t  *));
  }
  T_CSRGCreateSpMVDescr(cMat);
-  vX=x->v_;
-  vY=y->v_;
-  CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, cMat->m, vY, CUSPARSE_BASE_TYPE) );
-  CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, cMat->n, vX, CUSPARSE_BASE_TYPE) );
-  CHECK_CUSPARSE(cusparseSpMV_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
-					 &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,
-					 CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT,
-					 &bfsz));
-  if (bfsz > cMat->mvbsize) {
-    if (cMat->mvbuffer != NULL) {
-      CHECK_CUDA(cudaFree(cMat->mvbuffer));
-      cMat->mvbuffer = NULL;
+//   vX=x->v_;
+//   vY=y->v_;
+//   fprintf(stderr,"CUDA ENTERED %p %d %d %d %d %d\n", vX, pitch, y->size_, x->count_, alpha, beta);
+//   CHECK_CUSPARSE(cusparseCreateDnMat(&vecX, cMat->n, x->count_, pitch, vX, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
+//   CHECK_CUSPARSE(cusparseCreateDnMat(&vecY, cMat->m, y->count_, pitch, vY, CUSPARSE_BASE_TYPE, CUSPARSE_ORDER_COL));
+//   CHECK_CUSPARSE(cusparseSpMM_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
+//                         CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,
+//                         (*(cMat->spmvDescr)),vecX,&beta,vecY,
+//                         CUSPARSE_BASE_TYPE,CUSPARSE_SPMM_ALG_DEFAULT,
+//                         &bfsz));
+//     if (bfsz > cMat->mvbsize) {
+//         if (cMat->mvbuffer != NULL) {
+//         CHECK_CUDA(cudaFree(cMat->mvbuffer));
+//         cMat->mvbuffer = NULL;
+//         }
+//         //CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz));
+//         allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz);
+        
+//         cMat->mvbsize = bfsz;
+//     }
+
+//     CHECK_CUSPARSE(cusparseSpMM(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
+//                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+//                         &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,CUSPARSE_BASE_TYPE,
+//                         CUSPARSE_SPMM_ALG_DEFAULT,cMat->mvbuffer));
+//     CHECK_CUSPARSE(cusparseDestroyDnMat(vecX));
+//     CHECK_CUSPARSE(cusparseDestroyDnMat(vecY));
+  for(j=0;j<y->count_;j++) {
+    vX=x->v_+pitch*j;
+    vY=y->v_+pitch*j;
+    fprintf(stderr,"CUDA ENTERED 1 %d %p %p %d %d %d %d\n",j, vX, vY, pitch, y->size_, cMat->m, cMat->n);
+    CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, cMat->m, vY, CUSPARSE_BASE_TYPE) );
+    CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, cMat->n, vX, CUSPARSE_BASE_TYPE) );
+    CHECK_CUSPARSE(cusparseSpMV_bufferSize(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
+                        &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,
+                        CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT,
+                        &bfsz));
+    if (bfsz > cMat->mvbsize) {
+        if (cMat->mvbuffer != NULL) {
+        CHECK_CUDA(cudaFree(cMat->mvbuffer));
+        cMat->mvbuffer = NULL;
+        }
+        //CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz));
+        allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz);
+        
+        cMat->mvbsize = bfsz;
    }
-    //CHECK_CUDA(cudaMalloc((void **) &(cMat->mvbuffer), bfsz));
-    allocRemoteBuffer((void **) &(cMat->mvbuffer), bfsz);
-    
-    cMat->mvbsize = bfsz;
+    CHECK_CUSPARSE(cusparseSpMV(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
+                    &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,
+                    CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT,
+                    cMat->mvbuffer));
+    fprintf(stderr,"CUDA ENTERED 2 %d %p %p %d %d %d %d\n",j, vX, vY, *((double*)vX), *((double*)vY), pitch, y->size_);
+    CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) );
+    CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) );
  }
-  CHECK_CUSPARSE(cusparseSpMV(*my_handle,CUSPARSE_OPERATION_NON_TRANSPOSE,
-			      &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY,
-			      CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT,
-			      cMat->mvbuffer));
-  CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) );
-  CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) );
  CHECK_CUSPARSE(cusparseDestroySpMat(*(cMat->spmvDescr)));
 #endif
 }
--- a/cuda/impl/psb_d_cuda_csrg_vect_mv.F90
+++ b/cuda/impl/psb_d_cuda_csrg_vect_mv.F90
@ -115,3 +115,92 @@ subroutine psb_d_cuda_csrg_vect_mv(alpha,a,x,beta,y,info,trans)

  return
 end subroutine psb_d_cuda_csrg_vect_mv
+
+
+subroutine psb_d_cuda_csrg_multivect_mv(alpha,a,x,beta,y,info,trans) 
+  
+  use psb_base_mod
+  use cusparse_mod
+  use elldev_mod
+  use psb_vectordev_mod
+  use psb_d_cuda_csrg_mat_mod, psb_protect_name => psb_d_cuda_csrg_multivect_mv
+  use psb_d_cuda_multivect_mod
+  implicit none 
+  class(psb_d_cuda_csrg_sparse_mat), intent(in) :: a
+  real(psb_dpk_), intent(in)       :: alpha, beta
+  class(psb_d_base_multivect_type), intent(inout) :: x
+  class(psb_d_base_multivect_type), intent(inout) :: y
+  integer(psb_ipk_), intent(out)             :: info
+  character, optional, intent(in)            :: trans
+
+  real(psb_dpk_), allocatable :: rx(:,:), ry(:,:)
+  logical           :: tra
+  character         :: trans_
+  Integer(Psb_ipk_) :: err_act
+  character(len=20) :: name='d_cuda_csrg_multivect_mv'
+
+  call psb_erractionsave(err_act)
+  info = psb_success_
+  if (present(trans)) then
+    trans_ = trans
+  else
+    trans_ = 'N'
+  end if
+
+  if (.not.a%is_asb()) then 
+    info = psb_err_invalid_mat_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
+
+  if (tra) then 
+    if (.not.x%is_host()) call x%sync()
+    if (beta /= dzero) then 
+      if (.not.y%is_host()) call y%sync()
+    end if
+    call a%psb_d_csr_sparse_mat%spmm(alpha,x,beta,y,info,trans) 
+    call y%set_host()
+  else
+    if (a%is_host()) call a%sync()    
+    select type (xx => x) 
+    type is (psb_d_multivect_cuda)
+      select type(yy => y) 
+      type is (psb_d_multivect_cuda)
+        if (xx%is_host()) call xx%sync()
+        if (beta /= dzero) then 
+          if (yy%is_host()) call yy%sync()
+        end if
+        ! TODO
+        write(*,*) 'AAAAAAAAA'
+        info = spmvCSRGDevice(a%deviceMat,alpha,xx%deviceVect,&
+             & beta,yy%deviceVect)
+        if (info /= 0) then 
+          call psb_errpush(psb_err_from_subroutine_ai_,name,&
+               & a_err='spmvCSRGDevice',i_err=(/info,izero,izero,izero,izero/))
+          info = psb_err_from_subroutine_ai_
+          goto 9999
+        end if
+        call yy%set_dev()
+      class default
+        rx = xx%get_vect()
+        ry = y%get_vect()
+        call a%psb_d_csr_sparse_mat%spmm(alpha,rx,beta,ry,info)
+        call y%bld(ry)
+      end select
+    class default
+      rx = x%get_vect()
+      ry = y%get_vect()
+      call a%psb_d_csr_sparse_mat%spmm(alpha,rx,beta,ry,info)
+      call y%bld(ry)
+    end select
+  end if
+  if (info /= 0) goto 9999
+  call psb_erractionrestore(err_act)
+  return
+
+9999 call psb_error_handler(err_act)
+
+  return
+end subroutine psb_d_cuda_csrg_multivect_mv
--- a/cuda/psb_d_cuda_csrg_mat_mod.F90
+++ b/cuda/psb_d_cuda_csrg_mat_mod.F90
@ -55,6 +55,7 @@ module psb_d_cuda_csrg_mat_mod
    procedure, nopass  :: get_fmt       => d_cuda_csrg_get_fmt
    procedure, pass(a) :: sizeof        => d_cuda_csrg_sizeof
    procedure, pass(a) :: vect_mv       => psb_d_cuda_csrg_vect_mv
+    procedure, pass(a) :: multivect_mv  => psb_d_cuda_csrg_multivect_mv
    procedure, pass(a) :: in_vect_sv    => psb_d_cuda_csrg_inner_vect_sv
    procedure, pass(a) :: csmm          => psb_d_cuda_csrg_csmm
    procedure, pass(a) :: csmv          => psb_d_cuda_csrg_csmv
@ -109,6 +110,15 @@ module psb_d_cuda_csrg_mat_mod
      integer(psb_ipk_), intent(out)             :: info
      character, optional, intent(in)  :: trans
    end subroutine psb_d_cuda_csrg_vect_mv
+    subroutine psb_d_cuda_csrg_multivect_mv(alpha,a,x,beta,y,info,trans) 
+      import :: psb_d_cuda_csrg_sparse_mat, psb_dpk_, psb_d_base_multivect_type, psb_ipk_
+      class(psb_d_cuda_csrg_sparse_mat), intent(in) :: a
+      real(psb_dpk_), intent(in)       :: alpha, beta
+      class(psb_d_base_multivect_type), intent(inout) :: x
+      class(psb_d_base_multivect_type), intent(inout) :: y
+      integer(psb_ipk_), intent(out)             :: info
+      character, optional, intent(in)  :: trans
+    end subroutine psb_d_cuda_csrg_multivect_mv
  end interface

  interface
--- a/cuda/psb_d_cuda_vect_mod.F90
+++ b/cuda/psb_d_cuda_vect_mod.F90
@ -1357,9 +1357,8 @@ module psb_d_cuda_multivect_mod
    procedure, pass(x) :: get_nrows => d_cuda_multi_get_nrows
    procedure, pass(x) :: get_ncols => d_cuda_multi_get_ncols
    procedure, nopass  :: get_fmt   => d_cuda_multi_get_fmt
-    ! TODO
-!!$    procedure, pass(x) :: dot_v    => d_cuda_multi_dot_v
-!!$    procedure, pass(x) :: dot_a    => d_cuda_multi_dot_a
+    procedure, pass(x) :: dot_v    => d_cuda_multi_dot_v
+    procedure, pass(x) :: dot_a    => d_cuda_multi_dot_a
    procedure, pass(y) :: axpby_v  => d_cuda_multi_axpby_v
    procedure, pass(y) :: axpby_a  => d_cuda_multi_axpby_a
 !!$    procedure, pass(y) :: mlt_v    => d_cuda_multi_mlt_v
@ -1369,7 +1368,7 @@ module psb_d_cuda_multivect_mod
 !!$    procedure, pass(x) :: scal     => d_cuda_multi_scal
    procedure, pass(x) :: nrm2     => d_cuda_multi_nrm2
    procedure, pass(x) :: amax     => d_cuda_multi_amax
-!!$    procedure, pass(x) :: asum     => d_cuda_multi_asum
+    procedure, pass(x) :: asum     => d_cuda_multi_asum
    procedure, pass(x) :: all      => d_cuda_multi_all
    procedure, pass(x) :: zero     => d_cuda_multi_zero
    procedure, pass(x) :: asb      => d_cuda_multi_asb
@ -1608,11 +1607,11 @@ contains
    res = 'dGPU'
  end function d_cuda_multi_get_fmt

-  function d_cuda_multi_dot_v(n,x,y) result(res)
+  function d_cuda_multi_dot_v(nr,x,y) result(res)
    implicit none 
    class(psb_d_multivect_cuda), intent(inout)       :: x
    class(psb_d_base_multivect_type), intent(inout) :: y
-    integer(psb_ipk_), intent(in)              :: n
+    integer(psb_ipk_), intent(in)              :: nr
    real(psb_dpk_), allocatable   :: res(:,:)
    real(psb_dpk_), external      :: ddot
    integer(psb_ipk_) :: info
@ -1623,38 +1622,41 @@ contains
    !  When we get here, we are sure that X is of
    !  TYPE psb_d_vect
    !
-    ! TODO tra
    select type(yy => y)
    type is (psb_d_multivect_cuda)
      if (x%is_host()) call x%sync()
      if (yy%is_host()) call yy%sync()
-      info = dotMultiVecDevice(res,n,x%deviceVect,yy%deviceVect,x%get_ncols())
+      info = dotMultiVecDevice(res,nr,x%deviceVect,yy%deviceVect,x%get_ncols())
      if (info /= 0) then 
        info = psb_err_internal_error_
        call psb_errpush(info,'d_cuda_multi_dot_v')
      end if
-
-    ! TODO
    class default
      ! y%sync is done in dot_a
      call x%sync()      
-      res = y%dot(n,x%v)
+      res = y%dot(nr,x%v)
    end select

  end function d_cuda_multi_dot_v

-  ! TODO
-  function d_cuda_multi_dot_a(n,x,y) result(res)
+  function d_cuda_multi_dot_a(nr,x,y) result(res)
    implicit none 
    class(psb_d_multivect_cuda), intent(inout) :: x
-    real(psb_dpk_), intent(in)           :: y(:)
-    integer(psb_ipk_), intent(in)        :: n
+    real(psb_dpk_), intent(in)           :: y(:,:)
+    integer(psb_ipk_), intent(in)        :: nr
    real(psb_dpk_), allocatable   :: res(:,:)
    real(psb_dpk_), external      :: ddot
+    integer(psb_ipk_)             :: i, j, n_x, n_y

    if (x%is_dev()) call x%sync()
-    allocate(res(2,2))
-    res = ddot(n,y,1,x%v,1)
+    n_x = size(x%v,2)
+    n_y = size(y,2_psb_ipk_)
+    allocate(res(n_x,n_y))
+    do i=1,n_x
+      do j=1,n_y
+        res(i,j) = ddot(nr,x%v(1:nr,i),1,y(1:nr,j),1)
+      end do
+    end do

  end function d_cuda_multi_dot_a
   
@ -1980,7 +1982,7 @@ contains
        nh=0
      end if
      if (c_associated(x%deviceVect)) then 
-        md  = getMultiVecDevicePitch(x%deviceVect)
+        md  = getMultiVecDeviceSize(x%deviceVect)
        nd  = getMultiVecDeviceCount(x%deviceVect)
        if ((md < mh).or.(nd<nh)) then 
          call freeMultiVecDevice(x%deviceVect)
@ -1991,7 +1993,7 @@ contains
      if (.not.c_associated(x%deviceVect)) then 
        info = FallocMultiVecDevice(x%deviceVect,nh,mh,spgpu_type_double)
        if (info == 0) &
-             & call psb_realloc(getMultiVecDevicePitch(x%deviceVect),&
+             & call psb_realloc(getMultiVecDeviceSize(x%deviceVect),&
             & getMultiVecDeviceCount(x%deviceVect),x%v,info,pad=dzero)
        if  (info /= 0) then 
 !!$          write(0,*) 'Error from FallocMultiVecDevice',info,n
@ -2010,10 +2012,10 @@ contains
        mh=0
        nh=0
      end if
-      md  = getMultiVecDevicePitch(x%deviceVect)
+      md  = getMultiVecDeviceSize(x%deviceVect)
      nd  = getMultiVecDeviceCount(x%deviceVect)
      if ((mh /= md).or.(nh /= nd)) then 
-        call psb_realloc(getMultiVecDevicePitch(x%deviceVect),&
+        call psb_realloc(getMultiVecDeviceSize(x%deviceVect),&
             & getMultiVecDeviceCount(x%deviceVect),x%v,info,pad=dzero)
      end if
      
--- a/cuda/vectordev.c
+++ b/cuda/vectordev.c
@ -165,6 +165,7 @@ int FallocMultiVecDevice(void** deviceMultiVec, unsigned int count,

  p = getMultiVectorDeviceParams(count, size, elementType);
  i = allocMultiVecDevice(deviceMultiVec, &p);
+  fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size);
  //cudaSync();
  if (i != 0) {
    fprintf(stderr,"From routine : %s : %d, %d %d \n","FallocMultiVecDevice",i, count, size);
--- a/krylov/psb_dbgmres.f90
+++ b/krylov/psb_dbgmres.f90
@ -226,10 +226,6 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,

   ! BGMRES algorithm

-   ! TODO Con tanti ITRS e tanti NRHS si ottengono NaN, deflazione e restart dopo aver trovato una colonna, difficile...
-
-   ! TODO Provare a compilare su GPU remota (Vedere REC)
-
   ! STEP 1: Compute R(0) = B - A*X(0)

   ! Store B in V(1)
@ -284,7 +280,7 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
         idx_i = (i-1)*nrhs+1

         ! STEP 6: Compute H(i,j) = (V(i)**T)*W
-         h(idx_i:idx_i+n_add,idx_j:idx_j+n_add) = psb_geprod(v(i),w,desc_a,info,trans=.true.)
+         h(idx_i:idx_i+n_add,idx_j:idx_j+n_add) = psb_gedot(v(i),w,desc_a,info)
         if (info /= psb_success_) then
            info=psb_err_from_subroutine_non_
            call psb_errpush(info,name)
@ -366,13 +362,13 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
         errnum = rmn2
         errden = r0n2

-         do col=1,nrhs
-            write(*,*) rmn2(col), r0n2(col)
-         end do
+        !  do col=1,nrhs
+        !     write(*,*) rmn2(col), r0n2(col)
+        !  end do
      end if

-      ! TODO Check convergence (max o media)
-      if (all(errnum.le.(eps*errden))) then
+      ! Check convergence (max o media)
+      if (maxval(errnum).le.(eps*maxval(errden))) then

         ! Exit algorithm
         exit outer
@ -394,7 +390,6 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,

   ! END algorithm

-   ! TODO Versione finale che stampa errore massimo (si può usare log_conv con questo)
   if (itrace_ > 0) call log_conv(methdname,me,itx,ione,maxval(errnum),maxval(errden),deps)

   call log_end(methdname,me,itx,itrace_,maxval(errnum),maxval(errden),deps,err=derr,iter=iter)
@ -445,8 +440,6 @@ contains
      beta_e1 = dzero
      beta_e1(1:nrhs,1:nrhs) = beta

-      ! TODO DGELS ha anche i residui (con i residui fai come MATLAB e poi si prova se esce uguale)
-
      ! Compute min Frobenius norm
      call dgels('N',m_h,n_h,nrhs,h_temp(1:m_h,1:n_h),m_h,beta_e1,m_h,work,lwork,info)

--- a/test/block_krylov/Makefile
+++ b/test/block_krylov/Makefile
@ -37,4 +37,3 @@ lib:
 	(cd ../../; make library)
 verycleanlib:
 	(cd ../../; make veryclean)
-
--- a/test/block_krylov/kernel/dpdegenmm.F90
+++ b/test/block_krylov/kernel/dpdegenmm.F90
@ -624,7 +624,9 @@ program pdegenmm
  character(len=20)  :: name,ch_err
  character(len=40)  :: fname

-  real(psb_dpk_), allocatable :: test(:,:)
+  real(psb_dpk_), allocatable :: test(:,:), test1(:,:), test2(:,:)
+
+  type(c_ptr) :: gpx, gpy

  info=psb_success_

@ -787,7 +789,7 @@ program pdegenmm
 #endif
  end do

-  call x_mv%set(x0)
+  call x_mv%set(done)
  call psb_barrier(ctxt)
  t1 = psb_wtime()
  do i=1,ntests 
@ -803,7 +805,11 @@ program pdegenmm
  ! FIXME: cache flush needed here
  x1 = b_mv%get_vect()
  x2 = b_mv_g%get_vect()
+  do i=1,8
+    write(*,*) x1(i,:)
+  end do

+  ! TODO test AXPBY e SPMM
 !   call psb_geall(xg,desc_a,info)
 !   call psb_geasb(xg,desc_a,info,mold=tmold)
 !   call xg%set(done)
@ -838,21 +844,26 @@ program pdegenmm
 !     write(*,*) test(i)
 !   end do

+  ! TODO SpMM da fare con vettori GPU su mod csrg
+  ! TODO SpMM da fare a parte dopo
+  ! TODO Da cambiare WRITE READ R2 devono usare Memcopy 2D
  ! TODO Test DDOT
-  call x_mv_g%set(done)
-  call b_mv_g%set(done+done)
-  test = b_mv_g%dot(8,x_mv_g)
-  write(*,*) 'SIZE ', size(test,1), size(test,2)
-  do i=1,nrhs
-    write(*,*) test(i,:)
-  end do
+!   call x_mv_g%set(done)
+!   call b_mv_g%set(done+done)
+!   test = psb_gedot(b_mv_g,test2,desc_a,info)
+!   write(*,*) 'SIZE ', size(test,1), size(test,2)
+!   do i=1,size(test,1)
+!     write(*,*) test(i,:)
+!   end do

-  return
+  write(*,*) 'TEST'
+  call x_mv_g%set(done)
+  call x_mv_g%sync()

  call psb_barrier(ctxt)
  tt1 = psb_wtime()
  do i=1,ntests 
-    call psb_spmm(done,agpu,x_mv,dzero,b_mv_g,desc_a,info)
+    call psb_spmm(done,agpu,x_mv_g,dzero,b_mv_g,desc_a,info)
    if ((info /= 0).or.(psb_get_errstatus()/=0)) then 
      write(0,*) 'From 1 spmm',info,i,ntests
      call psb_error()
@ -903,6 +914,8 @@ program pdegenmm
  call psb_geaxpby(-done,b_mv_g,+done,b_mv,desc_a,info)
  eps = psb_geamax(b_mv,desc_a,info)

+  return
+
  call psb_amx(ctxt,t2)
  eps = maxval(abs(x1(1:nr,1:nrhs)-x2(1:nr,1:nrhs)))
  call psb_amx(ctxt,eps)