Multivector Product GPU

8 months ago · f7d44a70d6
parent 34a2a7ddbc
commit f7d44a70d6
11 changed files with 351 additions and 348 deletions
--- a/base/modules/psblas/psb_d_psblas_mod.F90
+++ b/base/modules/psblas/psb_d_psblas_mod.F90
@ -98,37 +98,23 @@ module psb_d_psblas_mod
  end interface

  interface psb_geprod
-    function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
+    subroutine psb_dprod_multivect(x,y,res,desc_a,info,global)
      import :: psb_desc_type, psb_dpk_, psb_ipk_, &
           & psb_d_multivect_type, psb_dspmat_type
-      real(psb_dpk_), allocatable               :: res(:,:)
-      type(psb_d_multivect_type), intent(inout) :: x, y
+      type(psb_d_multivect_type), intent(inout) :: x, y, res
      type(psb_desc_type), intent(in)           :: desc_a
      integer(psb_ipk_), intent(out)            :: info
-      logical, intent(in), optional             :: trans
      logical, intent(in), optional             :: global
-    end function psb_dprod_multivect
-    function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
+    end subroutine psb_dprod_multivect
+    subroutine psb_dprod_multivect_a(x,y,res,desc_a,info,global)
      import :: psb_desc_type, psb_dpk_, psb_ipk_, &
           & psb_d_multivect_type, psb_dspmat_type
-      real(psb_dpk_), allocatable               :: res(:,:)
-      type(psb_d_multivect_type), intent(inout) :: x
+      type(psb_d_multivect_type), intent(inout) :: x, res
      real(psb_dpk_), intent(in)                :: y(:,:)
      type(psb_desc_type), intent(in)           :: desc_a
      integer(psb_ipk_), intent(out)            :: info
-      logical, intent(in), optional             :: trans
-      logical, intent(in), optional             :: global
-    end function psb_dprod_multivect_a
-    function psb_dprod_m(x,y,desc_a,info,trans,global) result(res)
-      import :: psb_desc_type, psb_dpk_, psb_ipk_, &
-           & psb_d_multivect_type, psb_dspmat_type
-      real(psb_dpk_), allocatable               :: res(:,:)
-      real(psb_dpk_), intent(in)                :: x(:,:), y(:,:)
-      type(psb_desc_type), intent(in)           :: desc_a
-      integer(psb_ipk_), intent(out)            :: info
-      logical, intent(in), optional             :: trans
      logical, intent(in), optional             :: global
-    end function psb_dprod_m
+    end subroutine psb_dprod_multivect_a
  end interface

  interface psb_geaxpby
--- a/base/modules/serial/psb_d_base_vect_mod.F90
+++ b/base/modules/serial/psb_d_base_vect_mod.F90
@ -2593,8 +2593,12 @@ contains
    implicit none
    class(psb_d_base_multivect_type), intent(inout)    :: x

-    if (allocated(x%v)) x%v=dzero
-
+    if (allocated(x%v)) then
+      !$omp workshare
+      x%v(:,:)=dzero
+      !$omp end workshare
+    end if
+    call x%set_host()
  end subroutine d_base_mlv_zero


@ -2839,13 +2843,37 @@ contains
  !! \brief  Set all entries
  !! \param val   The value to set
  !!
-  subroutine d_base_mlv_set_scal(x,val)
+  subroutine d_base_mlv_set_scal(x,val,first_row,last_row,first_col,last_col)
    implicit none
    class(psb_d_base_multivect_type), intent(inout)  :: x
    real(psb_dpk_), intent(in) :: val
+    integer(psb_ipk_), optional :: first_row, last_row, first_col, last_col
+    integer(psb_ipk_) :: first_row_, last_row_, first_col_, last_col_, i, j

-    integer(psb_ipk_) :: info
-    x%v = val
+    first_row_=1
+    last_row_=size(x%v,1)
+
+    first_col_=1
+    last_col_=size(x%v,2)
+
+    if (present(first_row)) first_row_ = max(first_row,first_row_)
+    if (present(last_row))  last_row_  = min(last_row,last_row_)
+
+    if (present(first_col)) first_col_ = max(first_col,first_col_)
+    if (present(last_col))  last_col_  = min(last_col,last_col_)
+
+    if (x%is_dev()) call x%sync()
+#if defined(OPENMP)
+    !$omp parallel do private (i,j)
+    do i = first_row_, last_row_
+      do j = first_col_, last_col_
+        x%v(i,j) = val
+      end do
+    end do
+#else
+    x%v(first_row_:last_row_,first_col_:last_col_) = val
+#endif
+    call x%set_host()

  end subroutine d_base_mlv_set_scal

@ -2855,22 +2883,42 @@ contains
  !! \brief  Set all entries
  !! \param val(:)  The vector to be copied in
  !!
-  subroutine d_base_mlv_set_vect(x,val)
+  subroutine d_base_mlv_set_vect(x,val,first_row,last_row,first_col,last_col)
    implicit none
-    class(psb_d_base_multivect_type), intent(inout)  :: x
+    class(psb_d_base_multivect_type), intent(inout) :: x
    real(psb_dpk_), intent(in) :: val(:,:)
-    integer(psb_ipk_) :: nr, nc
-    integer(psb_ipk_) :: info
-
-    if (allocated(x%v)) then
-      nr = min(size(x%v,1),size(val,1))
-      nc = min(size(x%v,2),size(val,2))
+    integer(psb_ipk_), optional :: first_row, last_row, first_col, last_col
+    integer(psb_ipk_) :: first_row_, last_row_, first_col_, last_col_, i, j, info

-      x%v(1:nr,1:nc) = val(1:nr,1:nc)
-    else
-      x%v = val
+    if (.not.allocated(x%v)) then
+      call psb_realloc(size(val,1),size(val,2),x%v,info)
    end if

+    first_row_=1
+    last_row_=size(x%v,1)
+
+    first_col_=1
+    last_col_=size(x%v,2)
+
+    if (present(first_row)) first_row_ = max(first_row,first_row_)
+    if (present(last_row))  last_row_  = min(last_row,last_row_)
+
+    if (present(first_col)) first_col_ = max(first_col,first_col_)
+    if (present(last_col))  last_col_  = min(last_col,last_col_)
+
+    if (x%is_dev()) call x%sync()
+#if defined(OPENMP)
+    !$omp parallel do private(i,j)
+    do i = first_row_, last_row_
+      do j = first_col_, last_col_
+        x%v(i,j) = val(i-first_row_+1,j-first_col_+1)
+      end do
+    end do
+#else
+    x%v(first_row_:last_row_,first_col_:last_col_) = val(1:last_row_-first_row_+1,1:last_col_-first_col_+1)
+#endif
+    call x%set_host()
+
  end subroutine d_base_mlv_set_vect

  !
@ -2882,18 +2930,18 @@ contains
  !! \brief       Product by a mlv
  !! \param nr    Number of rows to be considered
  !! \param y     The other (base_mlv_vect) to be multiplied by
-  !! \param trans If true, x is transposed
  !! \param res   Result vector
  !!
-  function d_base_mlv_prod_v(nr,x,y,trans) result(res)
+  subroutine d_base_mlv_prod_v(nr,x,y,res)
    implicit none
-    class(psb_d_base_multivect_type), intent(inout) :: x, y
+    class(psb_d_base_multivect_type), intent(inout) :: x, y, res
    integer(psb_ipk_), intent(in)                   :: nr
-    logical, optional, intent(in)                   :: trans
-    real(psb_dpk_), allocatable                     :: res(:,:)
    external                                        :: dgemm
    integer(psb_ipk_)                               :: x_n, y_n, lda, ldb

+    write(*,*) 'CPU PROD'
+
+
    if (x%is_dev()) call x%sync()
    if (y%is_dev()) call y%sync()
    select type(yy => y)
@ -2901,22 +2949,12 @@ contains
      x_n = x%get_ncols()
      y_n = y%get_ncols()
      lda = x%get_nrows()
-      if (trans) then
-        allocate(res(x_n,y_n))
-        res = dzero
-        ldb = y%get_nrows()
-        call dgemm('T','N',x_n,y_n,nr,done,x%v,lda,y%v,ldb,dzero,res,x_n)
-      else
-        allocate(res(x%get_nrows(),y_n))
-        res = dzero
-        ldb = y_n
-        call dgemm('N','N',nr,y_n,x_n,done,x%v,lda,y%v,ldb,dzero,res,lda)
-      end if
+      ldb = y_n
+      call dgemm('N','N',nr,y_n,x_n,done,x%v,lda,y%v,ldb,dzero,res%v,lda)
    class default
-      res = x%prod(nr,y%v,trans)
+      call x%prod(nr,y%v,res)
    end select
-
-  end function d_base_mlv_prod_v
+  end subroutine d_base_mlv_prod_v

  !
  ! Multivectors product base
@ -2930,33 +2968,24 @@ contains
  !! \param trans If true, x is transposed
  !! \param res   Result vector
  !!
-  function d_base_mlv_prod_a(nr,x,y,trans) result(res)
+  subroutine d_base_mlv_prod_a(nr,x,y,res)
    implicit none
-    class(psb_d_base_multivect_type), intent(inout) :: x
+    class(psb_d_base_multivect_type), intent(inout) :: x, res
    real(psb_dpk_), intent(in)                      :: y(:,:)
    integer(psb_ipk_), intent(in)                   :: nr
-    logical, optional, intent(in)                   :: trans
-    real(psb_dpk_), allocatable                     :: res(:,:)
    external                                        :: dgemm
    integer(psb_ipk_)                               :: x_n, y_n, lda, ldb

+    write(*,*) 'CPU PROD'
+
+
    if (x%is_dev()) call x%sync()
    x_n = x%get_ncols()
    y_n = size(y,dim=2)
    lda = x%get_nrows()
-    if (trans) then
-      allocate(res(x_n,y_n))
-      res = dzero
-      ldb = size(y,dim=1)
-      call dgemm('T','N',x_n,y_n,nr,done,x%v,lda,y,ldb,dzero,res,x_n)
-    else
-      allocate(res(x%get_nrows(),y_n))
-      res = dzero
-      ldb = x_n
-      call dgemm('N','N',nr,y_n,x_n,done,x%v,lda,y,ldb,dzero,res,lda)
-    end if
-
-  end function d_base_mlv_prod_a
+    ldb = x_n
+    call dgemm('N','N',nr,y_n,x_n,done,x%v,lda,y,ldb,dzero,res%v,lda)
+  end subroutine d_base_mlv_prod_a

  !
  ! Dot products
--- a/base/modules/serial/psb_d_vect_mod.F90
+++ b/base/modules/serial/psb_d_vect_mod.F90
@ -1627,21 +1627,23 @@ contains
    end if
  end function d_vect_get_vect

-  subroutine d_vect_set_scal(x,val)
+  subroutine d_vect_set_scal(x,val,first_row,last_row,first_col,last_col)
    class(psb_d_multivect_type), intent(inout)  :: x
    real(psb_dpk_), intent(in) :: val
-
+    integer(psb_ipk_), optional :: first_row, last_row, first_col, last_col
    integer(psb_ipk_) :: info
-    if (allocated(x%v)) call x%v%set(val)
+
+    if (allocated(x%v)) call x%v%set(val,first_row,last_row,first_col,last_col)

  end subroutine d_vect_set_scal

-  subroutine d_vect_set_vect(x,val)
+  subroutine d_vect_set_vect(x,val,first_row,last_row,first_col,last_col)
    class(psb_d_multivect_type), intent(inout) :: x
    real(psb_dpk_), intent(in)         :: val(:,:)
-
+    integer(psb_ipk_), optional :: first_row, last_row, first_col, last_col
    integer(psb_ipk_) :: info
-    if (allocated(x%v)) call x%v%set(val)
+    
+    if (allocated(x%v)) call x%v%set(val,first_row,last_row,first_col,last_col)

  end subroutine d_vect_set_vect

@ -1893,30 +1895,26 @@ contains
    call move_alloc(tmp,x%v)
  end subroutine d_vect_cnv

-  function d_vect_prod_v(nr,x,y,trans) result(res)
+  subroutine d_vect_prod_v(nr,x,y,res)
    implicit none
-    class(psb_d_multivect_type), intent(inout) :: x, y
+    class(psb_d_multivect_type), intent(inout) :: x, y, res
    integer(psb_ipk_), intent(in)              :: nr
-    logical, optional, intent(in)              :: trans
-    real(psb_dpk_), allocatable                :: res(:,:)

    if (allocated(x%v).and.allocated(y%v)) &
-         & res = x%v%prod(nr,y%v,trans)
+         & call x%v%prod(nr,y%v,res%v)

-  end function d_vect_prod_v
+  end subroutine d_vect_prod_v

-  function d_vect_prod_a(nr,x,y,trans) result(res)
+  subroutine d_vect_prod_a(nr,x,y,res)
    implicit none
-    class(psb_d_multivect_type), intent(inout) :: x
+    class(psb_d_multivect_type), intent(inout) :: x, res
    real(psb_dpk_), intent(in)                 :: y(:,:)
    integer(psb_ipk_), intent(in)              :: nr
-    logical, optional, intent(in)              :: trans
-    real(psb_dpk_), allocatable                :: res(:,:)

    if (allocated(x%v)) &
-         & res = x%v%prod(nr,y,trans)
+         & call x%v%prod(nr,y,res%v)

-  end function d_vect_prod_a
+  end subroutine d_vect_prod_a

  function d_vect_dot_v(nr,x,y) result(res)
    implicit none
--- a/base/psblas/psb_dprod.f90
+++ b/base/psblas/psb_dprod.f90
@ -43,14 +43,13 @@
 !    y      -  type(psb_d_multivect_type) The input vector containing the entries of sub( Y ).
 !    desc_a -  type(psb_desc_type).       The communication descriptor.
 !    info   -  integer.                   Return code
-!    trans  -  logical(optional)          Whether multivector X is transposed, default: .false.
 !    global -  logical(optional)          Whether to perform the global reduce, default: .true.
 !
 !  Note: from a functional point of view, X and Y are input, but here
 !        they are declared INOUT because of the sync() methods.
 !
 !
-function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
+subroutine psb_dprod_multivect(x,y,res,desc_a,info,global)
   use psb_desc_mod
   use psb_d_base_mat_mod
   use psb_check_mod
@ -59,11 +58,9 @@ function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
   use psb_d_vect_mod
   use psb_d_psblas_mod, psb_protect_name => psb_dprod_multivect
   implicit none
-   real(psb_dpk_), allocatable               :: res(:,:)
-   type(psb_d_multivect_type), intent(inout) :: x, y
+   type(psb_d_multivect_type), intent(inout) :: x, y, res
   type(psb_desc_type), intent(in)           :: desc_a
   integer(psb_ipk_), intent(out)            :: info
-   logical, intent(in), optional             :: trans
   logical, intent(in), optional             :: global

   ! locals
@ -71,7 +68,7 @@ function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
   integer(psb_ipk_) :: np, me, idx, ndm,&
   & err_act, iix, jjx, iiy, jjy, i, j, nr
   integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n
-   logical :: global_, trans_
+   logical :: global_
   character(len=20)      :: name, ch_err

   name='psb_dprod_multivect'
@ -99,12 +96,6 @@ function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
      goto 9999
   endif

-   if (present(trans)) then
-      trans_ = trans
-   else
-      trans_ = .false.
-   end if
-
   if (present(global)) then
      global_ = global
   else
@ -140,25 +131,26 @@ function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)

   nr = desc_a%get_local_rows()
   if (nr > 0) then
-      res = x%prod(nr,y,trans_)
+      call x%prod(nr,y,res)
      ! adjust dot_local because overlapped elements are computed more than once
      if (size(desc_a%ovrlap_elem,1)>0) then
         if (x%v%is_dev()) call x%sync()
         if (y%v%is_dev()) call y%sync()
+         if (res%v%is_dev()) call res%sync()
         do j=1,x%get_ncols()
            do i=1,size(desc_a%ovrlap_elem,1)
               idx = desc_a%ovrlap_elem(i,1)
               ndm = desc_a%ovrlap_elem(i,2)
-               res(j,:) = res(j,:) - (real(ndm-1)/real(ndm))*(x%v%v(idx,:)*y%v%v(idx,:))
+               res%v%v(j,:) = res%v%v(j,:) - (real(ndm-1)/real(ndm))*(x%v%v(idx,:)*y%v%v(idx,:))
            end do
         end do
      end if
   else
-      res = dzero
+      call res%zero()
   end if

   ! compute global sum
-   if (global_) call psb_sum(ctxt, res)
+   if (global_) call psb_sum(ctxt, res%v%v)

   call psb_erractionrestore(err_act)
   return
@ -167,7 +159,7 @@ function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)

   return

-end function psb_dprod_multivect
+end subroutine psb_dprod_multivect
 !
 ! Function: psb_dprod_multivect_a
 !    psb_dprod computes the product of two distributed multivectors,
@ -181,14 +173,13 @@ end function psb_dprod_multivect
 !    y      -  real(:,:)                  The input vector containing the entries of sub( Y ).
 !    desc_a -  type(psb_desc_type).       The communication descriptor.
 !    info   -  integer.                   Return code
-!    trans  -  logical(optional)          Whether multivector X is transposed, default: .false.
 !    global -  logical(optional)          Whether to perform the global reduce, default: .true.
 !
 !  Note: from a functional point of view, X and Y are input, but here
 !        they are declared INOUT because of the sync() methods.
 !
 !
-function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
+subroutine psb_dprod_multivect_a(x,y,res,desc_a,info,global)
   use psb_desc_mod
   use psb_d_base_mat_mod
   use psb_check_mod
@ -197,12 +188,10 @@ function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
   use psb_d_vect_mod
   use psb_d_psblas_mod, psb_protect_name => psb_dprod_multivect_a
   implicit none
-   real(psb_dpk_), allocatable               :: res(:,:)
-   type(psb_d_multivect_type), intent(inout) :: x
+   type(psb_d_multivect_type), intent(inout) :: x, res
   real(psb_dpk_), intent(in)                :: y(:,:)
   type(psb_desc_type), intent(in)           :: desc_a
   integer(psb_ipk_), intent(out)            :: info
-   logical, intent(in), optional             :: trans
   logical, intent(in), optional             :: global

   ! locals
@ -210,7 +199,7 @@ function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
   integer(psb_ipk_) :: np, me, idx, ndm,&
   & err_act, iix, jjx, iiy, jjy, i, j, nr
   integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n
-   logical :: global_, trans_
+   logical :: global_
   character(len=20)      :: name, ch_err

   name='psb_dprod_multivect'
@ -233,12 +222,6 @@ function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
      goto 9999
   endif

-   if (present(trans)) then
-      trans_ = trans
-   else
-      trans_ = .false.
-   end if
-
   if (present(global)) then
      global_ = global
   else
@ -268,164 +251,25 @@ function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)

   nr = desc_a%get_local_rows()
   if (nr > 0) then
-      res = x%prod(nr,y,trans_)
+      call x%prod(nr,y,res)
      ! adjust dot_local because overlapped elements are computed more than once
      if (size(desc_a%ovrlap_elem,1)>0) then
         if (x%v%is_dev()) call x%sync()
+         if (res%v%is_dev()) call res%sync()
         do j=1,x%get_ncols()
            do i=1,size(desc_a%ovrlap_elem,1)
               idx = desc_a%ovrlap_elem(i,1)
               ndm = desc_a%ovrlap_elem(i,2)
-               res(j,:) = res(j,:) - (real(ndm-1)/real(ndm))*(x%v%v(idx,:)*y(idx,:))
+               res%v%v(j,:) = res%v%v(j,:) - (real(ndm-1)/real(ndm))*(x%v%v(idx,:)*y(idx,:))
            end do
         end do
      end if
   else
-      res = dzero
-   end if
-
-   ! compute global sum
-   if (global_) call psb_sum(ctxt, res)
-
-   call psb_erractionrestore(err_act)
-   return
-
-9999 call psb_error_handler(ctxt,err_act)
-
-   return
-
-end function psb_dprod_multivect_a
-!
-! Function: psb_dprod_m
-!    psb_dprod computes the product of two distributed multivectors,
-!
-!    prod := ( X ) * ( Y ) or
-!    prod := ( X )**C * ( Y )
-!
-!
-! Arguments:
-!    x      -  real(:,:)                  The input vector containing the entries of sub( X ).
-!    y      -  real(:,:)                  The input vector containing the entries of sub( Y ).
-!    desc_a -  type(psb_desc_type).       The communication descriptor.
-!    info   -  integer.                   Return code
-!    trans  -  logical(optional)          Whether multivector X is transposed, default: .false.
-!    global -  logical(optional)          Whether to perform the global reduce, default: .true.
-!
-!  Note: from a functional point of view, X and Y are input, but here
-!        they are declared INOUT because of the sync() methods.
-!
-!
-function psb_dprod_m(x,y,desc_a,info,trans,global) result(res)
-   use psb_desc_mod
-   use psb_d_base_mat_mod
-   use psb_check_mod
-   use psb_error_mod
-   use psb_penv_mod
-   use psb_d_vect_mod
-   use psb_d_psblas_mod, psb_protect_name => psb_dprod_m
-   implicit none
-   real(psb_dpk_), allocatable               :: res(:,:)
-   real(psb_dpk_), intent(in)                :: x(:,:), y(:,:)
-   type(psb_desc_type), intent(in)           :: desc_a
-   integer(psb_ipk_), intent(out)            :: info
-   logical, intent(in), optional             :: trans
-   logical, intent(in), optional             :: global
-
-   ! locals
-   type(psb_ctxt_type) :: ctxt
-   integer(psb_ipk_) :: np, me, idx, ndm,&
-   & err_act, iix, jjx, iiy, jjy, i, j, nr, x_n, y_n, lda, ldb
-   integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n
-   logical :: global_, trans_
-   character(len=20)      :: name, ch_err
-
-   name='psb_dprod_multivect'
-   info=psb_success_
-   call psb_erractionsave(err_act)
-   if (psb_errstatus_fatal()) then
-      info = psb_err_internal_error_ ;    goto 9999
-   end if
-
-   ctxt=desc_a%get_context()
-   call psb_info(ctxt, me, np)
-   if (np == -ione) then
-      info = psb_err_context_error_
-      call psb_errpush(info,name)
-      goto 9999
-   endif
-
-   if (present(trans)) then
-      trans_ = trans
-   else
-      trans_ = .false.
-   end if
-
-   if (present(global)) then
-      global_ = global
-   else
-      global_ = .false.
-   end if
-
-   ix = ione
-   ijx = ione
-
-   m = desc_a%get_global_rows()
-
-   ! check vector correctness
-   n = size(x,2)
-   call psb_chkvect(m,n,size(x,1),ix,ijx,desc_a,info,iix,jjx)
-   if(info /= psb_success_) then
-      info=psb_err_from_subroutine_
-      ch_err='psb_chkvect'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-   end if
-
-   if ((iix /= ione)) then
-      info=psb_err_ix_n1_iy_n1_unsupported_
-      call psb_errpush(info,name)
-      goto 9999
-   end if
-
-   nr = desc_a%get_local_rows()
-   x_n = size(x,2)
-   y_n = size(y,2)
-   lda = size(x,1)
-   if (nr > 0) then
-
-      if (trans_) then
-         allocate(res(x_n,y_n))
-         res = dzero
-         ldb = size(y,1)
-         call dgemm('T','N',x_n,y_n,nr,done,x,lda,y,ldb,dzero,res,x_n)
-      else
-         allocate(res(lda,y_n))
-         res = dzero
-         ldb = x_n
-         call dgemm('N','N',nr,y_n,x_n,done,x,lda,y,ldb,dzero,res,lda)
-      end if
-
-      ! adjust dot_local because overlapped elements are computed more than once
-      if (size(desc_a%ovrlap_elem,1)>0) then
-         do j=1,x_n
-            do i=1,size(desc_a%ovrlap_elem,1)
-               idx = desc_a%ovrlap_elem(i,1)
-               ndm = desc_a%ovrlap_elem(i,2)
-               res(j,:) = res(j,:) - (real(ndm-1)/real(ndm))*(x(idx,:)*y(idx,:))
-            end do
-         end do
-      end if
-   else
-      if (trans_) then
-         allocate(res(x_n,y_n))
-      else
-         allocate(res(lda,y_n))
-      end if
-      res = dzero
+      call res%zero()
   end if

   ! compute global sum
-   if (global_) call psb_sum(ctxt, res)
+   if (global_) call psb_sum(ctxt, res%v%v)

   call psb_erractionrestore(err_act)
   return
@ -434,4 +278,4 @@ function psb_dprod_m(x,y,desc_a,info,trans,global) result(res)

   return

-end function psb_dprod_m
+end subroutine psb_dprod_multivect_a
--- a/cuda/dvectordev.c
+++ b/cuda/dvectordev.c
@ -89,10 +89,37 @@ int readMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld)
  return(i);
 }

+int prodMultiVecDeviceDouble(char transa, int m, int n, int k,
+			double *alpha, void *deviceX, void* deviceY, 
+			double *beta, void* deviceZ)
+{
+  struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
+  struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
+  struct MultiVectDevice *z = (struct MultiVectDevice *) deviceZ;
+  int status;
+
+  cublasHandle_t handle=psb_cudaGetCublasHandle();
+  cublasOperation_t trans=((transa == 'N')? CUBLAS_OP_N:((transa=='T')? CUBLAS_OP_T:CUBLAS_OP_C));
+  /* Note: the M,N,K choices according to TRANS have already been handled in the caller */  
+  if (n == 1) {
+    status = cublasDgemv(handle, trans, m, k,
+			 alpha, x->v_, x->pitch_, y->v_, 1,
+			 beta, z->v_, 1);
+  } else {
+    status = cublasDgemm(handle, trans, CUBLAS_OP_N, m, n, k,
+			 alpha, x->v_, x->pitch_, y->v_, y->pitch_,
+			 beta, z->v_, z->pitch_);
+  }    
+  
+  if (status == CUBLAS_STATUS_SUCCESS)  
+    return SPGPU_SUCCESS;
+  else
+    return SPGPU_UNSUPPORTED;
+}
+
 int setscalMultiVecDeviceDouble(double val, int first, int last, 
 				int indexBase, void* devMultiVecX) 
 { int i=0;
-  int pitch = 0;
  struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX;
  spgpuHandle_t handle=psb_cudaGetHandle();

@ -101,6 +128,20 @@ int setscalMultiVecDeviceDouble(double val, int first, int last,
  return(i);
 }

+int setscalMultiVecDeviceDoubleR2(double val, int first_row, int last_row, 
+				int first_col, int last_col, int indexBase, void* devMultiVecX) 
+{ int i=0;
+  struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX;
+  spgpuHandle_t handle=psb_cudaGetHandle();
+  
+  int pitch = devVecX->pitch_;
+
+  for (int j=first_col; j<=last_col; j++) {
+    spgpuDsetscal(handle, first_row, last_row, indexBase, val, ((double *) devVecX->v_)+(j-1)*pitch);
+  }
+  
+  return(i);
+}

 int geinsMultiVecDeviceDouble(int n, void* devMultiVecIrl, void* devMultiVecVal, 
 			      int dupl, int indexBase, void* devMultiVecX)
--- a/cuda/dvectordev.h
+++ b/cuda/dvectordev.h
@ -42,8 +42,14 @@ int writeMultiVecDeviceDoubleR2(void* deviceMultiVec, double* hostMultiVec, int
 int readMultiVecDeviceDouble(void* deviceMultiVec, double* hostMultiVec);
 int readMultiVecDeviceDoubleR2(void* deviceMultiVec, double* hostMultiVec, int ld);

+int prodMultiVecDeviceDouble(char transa, int m, int n, int k,
+			double *alpha, void *deviceX, void* deviceY, 
+			double *beta, void* deviceZ);
+
 int setscalMultiVecDeviceDouble(double val, int first, int last, 
-				int indexBase, void* devVecX); 
+				int indexBase, void* devVecX);
+int setscalMultiVecDeviceDoubleR2(double val, int first_row, int last_row, 
+				int first_col, int last_col, int indexBase, void* devVecX);

 int geinsMultiVecDeviceDouble(int n, void* devVecIrl, void* devVecVal, 
 			      int dupl, int indexBase, void* devVecX); 
--- a/cuda/psb_d_cuda_vect_mod.F90
+++ b/cuda/psb_d_cuda_vect_mod.F90
@ -1354,41 +1354,82 @@ module psb_d_cuda_multivect_mod
    real(c_double), allocatable :: buffer(:,:)
    type(c_ptr) :: dt_buf = c_null_ptr
  contains
+    !
+    !  Constructors/allocators
+    !
+    procedure, pass(x) :: bld_x    => d_cuda_multi_bld_x
+    procedure, pass(x) :: bld_n    => d_cuda_multi_bld_n
+    procedure, pass(x) :: all      => d_cuda_multi_all
+    !
+    ! Insert/set. Assembly and free.
+    ! Assembly does almost nothing here, but is important
+    ! in derived classes.
+    !
+    procedure, pass(x) :: ins      => d_cuda_multi_ins
+    procedure, pass(x) :: zero     => d_cuda_multi_zero
+    procedure, pass(x) :: asb      => d_cuda_multi_asb
+    procedure, pass(x) :: free     => d_cuda_multi_free
+    !
+    ! Sync: centerpiece of handling of external storage.
+    ! Any derived class having extra storage upon sync
+    ! will guarantee that both fortran/host side and
+    ! external side contain the same data. The base
+    ! version is only a placeholder.
+    !
+    procedure, pass(x) :: sync       => d_cuda_multi_sync
+    procedure, pass(x) :: sync_space => d_cuda_multi_sync_space
+    procedure, pass(x) :: is_host    => d_cuda_multi_is_host
+    procedure, pass(x) :: is_dev     => d_cuda_multi_is_dev
+    procedure, pass(x) :: is_sync    => d_cuda_multi_is_sync
+    procedure, pass(x) :: set_host   => d_cuda_multi_set_host
+    procedure, pass(x) :: set_dev    => d_cuda_multi_set_dev
+    procedure, pass(x) :: set_sync   => d_cuda_multi_set_sync
+    !
+    ! Basic info
+    !
    procedure, pass(x) :: get_nrows => d_cuda_multi_get_nrows
    procedure, pass(x) :: get_ncols => d_cuda_multi_get_ncols
    procedure, nopass  :: get_fmt   => d_cuda_multi_get_fmt
+    !
+    ! Set/get data from/to an external array; also
+    ! overload assignment.
+    !
+    procedure, pass(x) :: set_scal => d_cuda_multi_set_scal
+    procedure, pass(x) :: set_vect => d_cuda_multi_set_vect
+    !
+    ! Product, dot-product (col-by-col) and AXPBY
+    !
+    procedure, pass(x) :: prod_v   => d_cuda_multi_prod_v
+    procedure, pass(x) :: prod_a   => d_cuda_multi_prod_a
    procedure, pass(x) :: dot_v    => d_cuda_multi_dot_v
    procedure, pass(x) :: dot_a    => d_cuda_multi_dot_a
    procedure, pass(y) :: axpby_v  => d_cuda_multi_axpby_v
    procedure, pass(y) :: axpby_a  => d_cuda_multi_axpby_a
+    !
+    ! MultiVector by vector/multivector multiplication. Need all variants
+    ! to handle multiple requirements from preconditioners
+    !
 !!$    procedure, pass(y) :: mlt_v    => d_cuda_multi_mlt_v
 !!$    procedure, pass(y) :: mlt_a    => d_cuda_multi_mlt_a
 !!$    procedure, pass(z) :: mlt_a_2  => d_cuda_multi_mlt_a_2
 !!$    procedure, pass(z) :: mlt_v_2  => d_cuda_multi_mlt_v_2
-!!$    procedure, pass(x) :: scal     => d_cuda_multi_scal
+    !
+    ! Scaling and norms
+    !
    procedure, pass(x) :: nrm2     => d_cuda_multi_nrm2
    procedure, pass(x) :: amax     => d_cuda_multi_amax
    procedure, pass(x) :: asum     => d_cuda_multi_asum
-    procedure, pass(x) :: all      => d_cuda_multi_all
-    procedure, pass(x) :: zero     => d_cuda_multi_zero
-    procedure, pass(x) :: asb      => d_cuda_multi_asb
-    procedure, pass(x) :: sync     => d_cuda_multi_sync
-    procedure, pass(x) :: sync_space => d_cuda_multi_sync_space
-    procedure, pass(x) :: bld_x    => d_cuda_multi_bld_x
-    procedure, pass(x) :: bld_n    => d_cuda_multi_bld_n
-    procedure, pass(x) :: free     => d_cuda_multi_free
-    procedure, pass(x) :: ins      => d_cuda_multi_ins
-    procedure, pass(x) :: is_host  => d_cuda_multi_is_host
-    procedure, pass(x) :: is_dev   => d_cuda_multi_is_dev
-    procedure, pass(x) :: is_sync  => d_cuda_multi_is_sync
-    procedure, pass(x) :: set_host => d_cuda_multi_set_host
-    procedure, pass(x) :: set_dev  => d_cuda_multi_set_dev
-    procedure, pass(x) :: set_sync => d_cuda_multi_set_sync
-    procedure, pass(x) :: set_scal => d_cuda_multi_set_scal
-    procedure, pass(x) :: set_vect => d_cuda_multi_set_vect
+!!$    procedure, pass(x) :: scal     => d_cuda_multi_scal
+    !
+    ! Gather/scatter. These are needed for MPI interfacing.
+    ! May have to be reworked.
+    !
 !!$    procedure, pass(x) :: gthzv_x  => d_cuda_multi_gthzv_x
 !!$    procedure, pass(y) :: sctb     => d_cuda_multi_sctb
 !!$    procedure, pass(y) :: sctb_x   => d_cuda_multi_sctb_x
+    !
+    ! Finalize
+    !
    final              :: d_cuda_multi_vect_finalize
  end type psb_d_multivect_cuda

@ -1607,13 +1648,63 @@ contains
    res = 'dGPU'
  end function d_cuda_multi_get_fmt

+  subroutine d_cuda_multi_prod_v(nr,x,y,res)
+    implicit none 
+    class(psb_d_multivect_cuda), intent(inout)      :: x
+    class(psb_d_base_multivect_type), intent(inout) :: y, res
+    integer(psb_ipk_), intent(in)                   :: nr
+    integer(psb_ipk_)                               :: info
+
+    select type(yy => y)
+    type is (psb_d_multivect_cuda)
+      select type(zz => res)
+      type is (psb_d_multivect_cuda)
+        if (x%is_host()) call x%sync()
+        if (yy%is_host()) call yy%sync()
+        if (zz%is_host()) call zz%sync()
+        if (info == 0) &
+            & info = prodMultiVecDevice('N',nr,yy%get_ncols(),x%get_ncols(),done,x%deviceVect,yy%deviceVect,done,zz%deviceVect)
+        call zz%set_dev()
+      class default
+        ! TODO
+      end select
+    class default
+      call x%prod(nr,y%v,res)
+    end select
+  end subroutine d_cuda_multi_prod_v
+
+  subroutine d_cuda_multi_prod_a(nr,x,y,res)
+    implicit none 
+    class(psb_d_multivect_cuda), intent(inout)      :: x
+    real(psb_dpk_), intent(in)                      :: y(:,:)
+    class(psb_d_base_multivect_type), intent(inout) :: res
+    integer(psb_ipk_), intent(in)                   :: nr
+    type(c_ptr)                                     :: gpY
+    integer(psb_ipk_)                               :: info, i
+
+    select type(zz => res)
+    type is (psb_d_multivect_cuda)
+      if (x%is_host()) call x%sync()
+      if (zz%is_host()) call zz%sync()
+      if (info == 0) &
+          & info = FallocMultiVecDevice(gpY,size(y,2),size(y,1),spgpu_type_double)
+      if (info == 0) &
+          & info = writeMultiVecDevice(gpY,y,size(y,1))
+      if (info == 0) &
+          & info = prodMultiVecDevice('N',nr,size(y,2),x%get_ncols(),done,x%deviceVect,gpY,dzero,zz%deviceVect)
+      call freeMultiVecDevice(gpY)
+      call zz%set_dev()
+    class default
+      ! TODO
+    end select
+  end subroutine d_cuda_multi_prod_a
+
  function d_cuda_multi_dot_v(nr,x,y) result(res)
    implicit none 
    class(psb_d_multivect_cuda), intent(inout)       :: x
    class(psb_d_base_multivect_type), intent(inout) :: y
    integer(psb_ipk_), intent(in)              :: nr
    real(psb_dpk_), allocatable   :: res(:,:)
-    real(psb_dpk_), external      :: ddot
    integer(psb_ipk_) :: info
    
    !
@ -1824,22 +1915,33 @@ contains
 !!$    end select
 !!$  end subroutine d_cuda_multi_mlt_v_2

-
-  subroutine d_cuda_multi_set_scal(x,val)
+  subroutine d_cuda_multi_set_scal(x,val,first_row,last_row,first_col,last_col)
    class(psb_d_multivect_cuda), intent(inout) :: x
-    real(psb_dpk_), intent(in)           :: val
-        
-    integer(psb_ipk_) :: info
+    real(psb_dpk_), intent(in)                 :: val
+    integer(psb_ipk_), optional                :: first_row, last_row, first_col, last_col
+    integer(psb_ipk_) :: info, first_row_, last_row_, first_col_, last_col_
+
+    first_row_=1
+    last_row_=size(x%v,1)
+
+    first_col_=1
+    last_col_=size(x%v,2)
+
+    if (present(first_row)) first_row_ = max(first_row,first_row_)
+    if (present(last_row))  last_row_  = min(last_row,last_row_)
+
+    if (present(first_col)) first_col_ = max(first_col,first_col_)
+    if (present(last_col))  last_col_  = min(last_col,last_col_)
+
+    info = setScalDevice(val,first_row_,last_row_,first_col_,last_col_,1,x%deviceVect)
+    call x%set_dev()

-    if (x%is_dev()) call x%sync()
-    call x%psb_d_base_multivect_type%set_scal(val)
-    call x%set_host()
  end subroutine d_cuda_multi_set_scal

-  subroutine d_cuda_multi_set_vect(x,val)
+  subroutine d_cuda_multi_set_vect(x,val,first_row,last_row,first_col,last_col)
    class(psb_d_multivect_cuda), intent(inout) :: x
    real(psb_dpk_), intent(in)           :: val(:,:)
-    integer(psb_ipk_) :: nr
+    integer(psb_ipk_), optional :: first_row, last_row, first_col, last_col
    integer(psb_ipk_) :: info

    if (x%is_dev()) call x%sync()
@ -1848,8 +1950,6 @@ contains

  end subroutine d_cuda_multi_set_vect

-
-
 !!$  subroutine d_cuda_multi_scal(alpha, x)
 !!$    implicit none 
 !!$    class(psb_d_multivect_cuda), intent(inout) :: x
@ -1870,7 +1970,6 @@ contains
    if (x%is_host()) call x%sync()
    allocate(res(x%get_ncols()))
    info = nrm2MultiVecDevice(res,nr,x%deviceVect)
-    
  end function d_cuda_multi_nrm2
 
  ! TODO CUDA
@ -1887,7 +1986,6 @@ contains
    do i=1,nr
        res = max(res,sum(abs(x%v(i,1:nc))))
    end do
-
  end function d_cuda_multi_amax

  ! TODO CUDA
@ -1903,7 +2001,6 @@ contains
    do j=1,x%get_ncols()
        res = max(res,sum(abs(x%v(1:nr,j))))
    end do
-
  end function d_cuda_multi_asum
  
  subroutine d_cuda_multi_all(m,n, x, info)
@ -1931,8 +2028,8 @@ contains
    implicit none 
    class(psb_d_multivect_cuda), intent(inout) :: x
    
-    if (allocated(x%v)) x%v=dzero
-    call x%set_host()
+    call x%set_dev()
+    call x%set_scal(dzero)
  end subroutine d_cuda_multi_zero

  subroutine d_cuda_multi_asb(m,n, x, info)
@ -1944,7 +2041,6 @@ contains
    integer(psb_ipk_), intent(out)       :: info
    integer(psb_ipk_) :: nd, nc

-
    x%m_nrows = m
    x%m_ncols = n
    if (x%is_host()) then 
@ -2035,7 +2131,7 @@ contains
    else if (x%is_dev()) then 
      info = readMultiVecDevice(x%deviceVect,x%v,size(x%v,1))
    end if
-    if (info == 0)  call x%set_sync()
+    if (info == 0) call x%set_sync()
    if (info /= 0) then
      info=psb_err_internal_error_
      call psb_errpush(info,'d_cuda_multi_sync')
--- a/cuda/psb_d_vectordev_mod.F90
+++ b/cuda/psb_d_vectordev_mod.F90
@ -80,6 +80,18 @@ module psb_d_vectordev_mod
    end function readMultiVecDeviceDoubleR2
  end interface 

+  interface prodMultiVecDevice
+    function prodMultiVecDeviceDouble(transa,m,n,k,alpha,deviceVecA, &
+      & deviceVecB,beta,deviceVecC) result(res) bind(c,name='prodMultiVecDeviceDouble')
+      use iso_c_binding
+      character(c_char), value  :: transa
+      integer(c_int), value     :: m, n, k
+      integer(c_int)		        :: res
+      type(c_ptr), value	      :: deviceVecA, deviceVecB, deviceVecC 
+      real(c_double)            :: alpha,  beta
+    end function prodMultiVecDeviceDouble
+  end interface
+
  interface allocateDouble
    function allocateDouble(didx,n) &
         & result(res) bind(c,name='allocateDouble') 
@ -159,7 +171,6 @@ module psb_d_vectordev_mod
    end subroutine freeDouble
  end interface
  
-
  interface setScalDevice
    function setScalMultiVecDeviceDouble(val, first, last, &
         & indexBase, deviceVecX) result(res) &
@ -170,6 +181,15 @@ module psb_d_vectordev_mod
      real(c_double), value :: val
      type(c_ptr),   value  :: deviceVecX
    end function setScalMultiVecDeviceDouble
+    function setScalMultiVecDeviceDoubleR2(val, first_row, last_row, &
+         & first_col, last_col, indexBase, deviceVecX) result(res) &
+         & bind(c,name='setscalMultiVecDeviceDoubleR2')
+      use iso_c_binding
+      integer(c_int)        :: res
+      integer(c_int), value :: first_row,last_row,first_col,last_col,indexbase
+      real(c_double), value :: val
+      type(c_ptr),   value  :: deviceVecX
+    end function setScalMultiVecDeviceDoubleR2
  end interface

  interface 
--- a/krylov/psb_dbgmres.f90
+++ b/krylov/psb_dbgmres.f90
@ -63,7 +63,7 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
   real(psb_dpk_), allocatable               :: aux(:), h(:,:), beta(:,:), y(:,:)

   type(psb_d_multivect_type), allocatable   :: v(:)
-   type(psb_d_multivect_type)                :: w, r0, rm, temp
+   type(psb_d_multivect_type)                :: w, r0, rm, pd

   integer(psb_ipk_)                         :: naux, itrace_, n_row, n_col, nrhs, nrep
   integer(psb_lpk_)                         :: mglob, n_add, ncv
@ -153,12 +153,12 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
   if (info == psb_success_) call psb_geall(w,desc_a,info,n=nrhs)
   if (info == psb_success_) call psb_geall(r0,desc_a,info,n=nrhs)
   if (info == psb_success_) call psb_geall(rm,desc_a,info,n=nrhs)
-   if (info == psb_success_) call psb_geall(temp,desc_a,info,n=nrhs)
+   if (info == psb_success_) call psb_geall(pd,desc_a,info,n=nrhs)
   if (info == psb_success_) call psb_geasb(v,desc_a,info,mold=x%v,n=nrhs)
   if (info == psb_success_) call psb_geasb(w,desc_a,info,mold=x%v,n=nrhs)
   if (info == psb_success_) call psb_geasb(r0,desc_a,info,mold=x%v,n=nrhs)
   if (info == psb_success_) call psb_geasb(rm,desc_a,info,mold=x%v,n=nrhs)
-   if (info == psb_success_) call psb_geasb(temp,desc_a,info,mold=x%v,n=nrhs)
+   if (info == psb_success_) call psb_geasb(pd,desc_a,info,mold=x%v,n=nrhs)
   if (info /= psb_success_) then
      info=psb_err_from_subroutine_non_
      call psb_errpush(info,name)
@ -261,8 +261,8 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,

         ! STEP 7: Compute W = W - V(i)*H(i,j)

-         ! Compute temp product V(i)*H(i,j)
-         call mv_prod(v(i)%get_vect(),h(idx_i:idx_i+n_add,idx_j:idx_j+n_add))
+         ! Compute product V(i)*H(i,j)
+         call psb_geprod(v(i),h(idx_i:idx_i+n_add,idx_j:idx_j+n_add),pd,desc_a,info)
         if (info /= psb_success_) then
            info=psb_err_from_subroutine_non_
            call psb_errpush(info,name)
@ -270,7 +270,7 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
         end if

         ! Compute W
-         call psb_geaxpby(-done,temp,done,w,desc_a,info)
+         call psb_geaxpby(-done,pd,done,w,desc_a,info)
         if (info /= psb_success_) then
            info=psb_err_from_subroutine_non_
            call psb_errpush(info,name)
@ -327,16 +327,16 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
      ! Compute index for Y products
      idx = (i-1)*nrhs+1

-      ! Compute temp product V(i)*Y(i)
-      call mv_prod(v(i)%get_vect(),y(idx:idx+n_add,:))
+      ! Compute product V(i)*Y(i)
+      call psb_geprod(v(i),y(idx:idx+n_add,:),pd,desc_a,info)
      if (info /= psb_success_) then
         info=psb_err_from_subroutine_non_
         call psb_errpush(info,name)
         goto 9999
      end if

-      ! Add temp to X(m)
-      call psb_geaxpby(done,temp,done,x,desc_a,info)
+      ! Add product to X(m)
+      call psb_geaxpby(done,pd,done,x,desc_a,info)
      if (info /= psb_success_) then
         info=psb_err_from_subroutine_non_
         call psb_errpush(info,name)
@ -356,6 +356,7 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
   if (info == psb_success_) call psb_gefree(w,desc_a,info)
   if (info == psb_success_) call psb_gefree(r0,desc_a,info)
   if (info == psb_success_) call psb_gefree(rm,desc_a,info)
+   if (info == psb_success_) call psb_gefree(pd,desc_a,info)
   if (info == psb_success_) deallocate(aux,h,y,r0n2,rmn2,stat=info)
   if (info /= psb_success_) then
      info=psb_err_from_subroutine_non_
@ -426,36 +427,7 @@ contains

   end function qr_fact

-   ! Multivectors product
-   subroutine mv_prod(x_mv,y_mv)
-      implicit none
-
-      ! I/O parameters
-      real(psb_dpk_), intent(in) :: x_mv(:,:), y_mv(:,:)
-
-      ! Utils
-      real(psb_dpk_), allocatable :: res(:,:)
-      integer(psb_ipk_)           :: lda, ldb
-
-      ! Initialize params
-      lda = size(x_mv,1)
-      ldb = size(y_mv,2)
-      allocate(res(lda,ldb))
-      res = dzero
-
-      ! Compute product
-      if (n_row > 0) then
-         call dgemm('N','N',n_row,nrhs,nrhs,done,x_mv,lda,y_mv,ldb,dzero,res,lda)
-      end if
-
-      ! Set temp multivector
-      call temp%set(res)
-
-      ! Deallocate
-      deallocate(res)
-
-   end subroutine mv_prod
-
+   ! TODO Loop con Givens rotation su ogni colonna
   ! Minimize Frobenius norm
   function frobenius_norm_min(rep) result(res)
      implicit none
--- a/test/block_krylov/gpu/psb_dpde_gen.F90
+++ b/test/block_krylov/gpu/psb_dpde_gen.F90
@ -548,7 +548,7 @@ program dpdegen

   ! solver parameters
   real(psb_dpk_)     :: err, cond
-   integer(psb_epk_)  :: amatsize, precsize, descsize
+   integer(psb_epk_)  :: amatsize, precsize, descsize, annz
   integer(psb_ipk_)  :: iter, ierr, ircode

   ! molds
@ -599,7 +599,7 @@ program dpdegen
   !
   call psb_barrier(ctxt)
   t1 = psb_wtime()
-   call psb_gen_pde3d(ctxt,idim,a,b_mv,x_mv,nrhs,desc_a,'CSR  ',info,partition=3,tnd=tnd)
+   call psb_gen_pde3d(ctxt,idim,a,b_mv,x_mv,nrhs,desc_a,'CSR  ',info,vmold=gpumold,partition=3,tnd=tnd)
   call psb_barrier(ctxt)
   t2 = psb_wtime() - t1
   if(info /= psb_success_) then
@ -640,6 +640,12 @@ program dpdegen
      stop
   end if

+   ! set random RHS
+   call random_number(b_mv%v%v)
+   b_mv%v%v = -10 + (20)*b_mv%v%v
+   call b_mv%v%set_host()
+   call b_mv%sync()
+
   call psb_geall(r_mv,desc_a,info,nrhs)
   call r_mv%zero()
   call psb_geasb(r_mv,desc_a,info)
@ -694,10 +700,12 @@ program dpdegen
   resmxp = psb_geamax(r_mv,desc_a,info)

   amatsize = a%sizeof()
+   annz     = a%get_nzeros()
   descsize = desc_a%sizeof()
   precsize = prec%sizeof()

   call psb_sum(ctxt,amatsize)
+   call psb_sum(ctxt,annz)
   call psb_sum(ctxt,descsize)
   call psb_sum(ctxt,precsize)

@ -710,6 +718,9 @@ program dpdegen
      call prec%descr(info)
      write(psb_out_unit,'(" ")')
      write(psb_out_unit,'("Computed solution on:         ",i8," processors")')np
+      write(psb_out_unit,'("Dimesion of A:                      ",i12)')desc_a%get_global_rows()
+      write(psb_out_unit,'("Number of nonzeros:                 ",i12)')annz
+      write(psb_out_unit,'("Number of RHS:                      ",i12)')nrhs
      write(psb_out_unit,'("Storage format for A:                ",a)')a%get_fmt()
      write(psb_out_unit,'("Storage format for DESC_A:           ",a)')desc_a%get_fmt()
      write(psb_out_unit,'("Total memory occupation for A:      ",i12)')amatsize
--- a/test/block_krylov/kernel/dpdegenmm.F90
+++ b/test/block_krylov/kernel/dpdegenmm.F90
@ -596,7 +596,7 @@ program pdegenmm
  ! solver parameters
  integer(psb_epk_)  :: amatsize, precsize, descsize, annz, nbytes
  real(psb_dpk_)     :: err, eps
-  integer, parameter :: ntests=50, ngpu=1, ncnv=1
+  integer, parameter :: ntests=20, ngpu=1, ncnv=1
  type(psb_d_coo_sparse_mat), target  :: acoo
  type(psb_d_csr_sparse_mat), target  :: acsr
  type(psb_d_ell_sparse_mat), target  :: aell