Working parallel (QR_fact serial)

12 months ago · 676652fcff
parent d10631530f
commit 676652fcff
10 changed files with 483 additions and 273 deletions
--- a/base/modules/psblas/psb_d_psblas_mod.F90
+++ b/base/modules/psblas/psb_d_psblas_mod.F90
@ -45,7 +45,7 @@ module psb_d_psblas_mod
      integer(psb_ipk_), intent(out)       :: info
      logical, intent(in), optional        :: global
    end function psb_ddot_vect
-    function psb_ddot_multivect_col_v(x, y, desc_a,info,global) result(res)
+    function psb_ddot_multivect(x, y, desc_a,info,global) result(res)
      import :: psb_desc_type, psb_dpk_, psb_ipk_, &
           & psb_d_multivect_type, psb_dspmat_type
      real(psb_dpk_), allocatable               :: res(:,:)
@ -53,17 +53,7 @@ module psb_d_psblas_mod
      type(psb_desc_type), intent(in)           :: desc_a
      integer(psb_ipk_), intent(out)            :: info
      logical, intent(in), optional             :: global
-    end function psb_ddot_multivect_col_v
-    function psb_ddot_multivect_row_a(x, y, desc_a,info,global) result(res)
-      import :: psb_desc_type, psb_dpk_, psb_ipk_, &
-           & psb_d_multivect_type, psb_dspmat_type
-      real(psb_dpk_), allocatable               :: res(:,:)
-      type(psb_d_multivect_type), intent(inout) :: x
-      real(psb_dpk_), intent(in)                :: y(:,:)
-      type(psb_desc_type), intent(in)           :: desc_a
-      integer(psb_ipk_), intent(out)            :: info
-      logical, intent(in), optional             :: global
-    end function psb_ddot_multivect_row_a
+    end function psb_ddot_multivect
    function psb_ddotv(x, y, desc_a,info,global)
      import :: psb_desc_type, psb_dpk_, psb_ipk_, &
           & psb_d_vect_type, psb_dspmat_type
@ -107,6 +97,30 @@ module psb_d_psblas_mod
    end subroutine psb_dmdots
  end interface

+  interface psb_geprod
+    function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
+      import :: psb_desc_type, psb_dpk_, psb_ipk_, &
+           & psb_d_multivect_type, psb_dspmat_type
+      real(psb_dpk_), allocatable               :: res(:,:)
+      type(psb_d_multivect_type), intent(inout) :: x, y
+      type(psb_desc_type), intent(in)           :: desc_a
+      integer(psb_ipk_), intent(out)            :: info
+      logical, intent(in), optional             :: trans
+      logical, intent(in), optional             :: global
+    end function psb_dprod_multivect
+    function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
+      import :: psb_desc_type, psb_dpk_, psb_ipk_, &
+           & psb_d_multivect_type, psb_dspmat_type
+      real(psb_dpk_), allocatable               :: res(:,:)
+      type(psb_d_multivect_type), intent(inout) :: x
+      real(psb_dpk_), intent(in)                :: y(:,:)
+      type(psb_desc_type), intent(in)           :: desc_a
+      integer(psb_ipk_), intent(out)            :: info
+      logical, intent(in), optional             :: trans
+      logical, intent(in), optional             :: global
+    end function psb_dprod_multivect_a
+  end interface
+
  interface psb_geaxpby
    subroutine psb_daxpby_vect(alpha, x, beta, y,&
         & desc_a, info)
--- a/base/modules/serial/psb_d_base_vect_mod.F90
+++ b/base/modules/serial/psb_d_base_vect_mod.F90
@ -2246,14 +2246,14 @@ module psb_d_base_multivect_mod
    procedure, pass(x) :: set_vect => d_base_mlv_set_vect
    generic, public    :: set      => set_vect, set_scal
    !
-    ! TODO Dot product (col-by-col and row-by-col) and AXPBY
-    !
-    procedure, pass(x) :: dot_col_v => d_base_mlv_dot_col_v
-    procedure, pass(x) :: dot_col_a => d_base_mlv_dot_col_a
-    generic, public    :: dot_col   => dot_col_v, dot_col_a
-    procedure, pass(x) :: dot_row_v => d_base_mlv_dot_row_v
-    procedure, pass(x) :: dot_row_a => d_base_mlv_dot_row_a
-    generic, public    :: dot_row   => dot_row_v, dot_row_a
+    ! Product, dot-product (col-by-col) and AXPBY
+    !
+    procedure, pass(x) :: prod_v    => d_base_mlv_prod_v
+    procedure, pass(x) :: prod_a    => d_base_mlv_prod_a
+    generic, public    :: prod      => prod_v, prod_a
+    procedure, pass(x) :: dot_v     => d_base_mlv_dot_v
+    procedure, pass(x) :: dot_a     => d_base_mlv_dot_a
+    generic, public    :: dot       => dot_v, dot_a
    procedure, pass(y) :: axpby_v   => d_base_mlv_axpby_v
    procedure, pass(y) :: axpby_a   => d_base_mlv_axpby_a
    generic, public    :: axpby     => axpby_v, axpby_a
@ -2812,92 +2812,103 @@ contains

  end subroutine d_base_mlv_set_vect

-  ! TODO
-  ! Col Dot products
+  !
+  ! Multivectors product
  !
  !
  !> Function     base_mlv_dot_col_v
  !! \memberof    psb_d_base_multivect_type
-  !! \brief     Col-by-col mult using dot product by a mlv
+  !! \brief       Product by a mlv
  !! \param nr    Number of rows to be considered
  !! \param y     The other (base_mlv_vect) to be multiplied by
+  !! \param trans If true, x is transposed
  !! \param res   Result vector
  !!
-  function d_base_mlv_dot_col_v(nr,x,y) result(res)
+  function d_base_mlv_prod_v(nr,x,y,trans) result(res)
    implicit none
    class(psb_d_base_multivect_type), intent(inout) :: x, y
    integer(psb_ipk_), intent(in)                   :: nr
+    logical, optional, intent(in)                   :: trans
    real(psb_dpk_), allocatable                     :: res(:,:)
-    real(psb_dpk_), external                        :: ddot
-    integer(psb_ipk_)                               :: i, j, n_x, n_y
+    external                                        :: dgemm
+    integer(psb_ipk_)                               :: x_n, y_n, lda, ldb

    if (x%is_dev()) call x%sync()
-    !
-    ! Note: this is the base implementation.
-    !  When we get here, we are sure that X is of
-    !  TYPE psb_d_base_mlv_vect (or its class does not care).
-    !  If Y is not, throw the burden on it, implicitly
-    !  calling dot_a
-    !
    select type(yy => y)
    type is (psb_d_base_multivect_type)
      if (y%is_dev()) call y%sync()
-      n_x = psb_size(x%v,2_psb_ipk_)
-      n_y = psb_size(y%v,2_psb_ipk_)
-      allocate(res(n_x,n_y))
-      do i=1,n_x
-        do j=1,n_y
-          res(i,j) = ddot(nr,x%v(1:nr,i),1,y%v(1:nr,j),1)
-        end do
-      end do
+      x_n = x%get_ncols()
+      y_n = y%get_ncols()
+      lda = x%get_nrows()
+      if (trans) then
+        allocate(res(x_n,y_n))
+        res = dzero
+        ldb = y%get_nrows()
+        call dgemm('T','N',x_n,y_n,nr,done,x%v,lda,y%v,ldb,dzero,res,x_n)
+      else
+        allocate(res(x%get_nrows(),y_n))
+        res = dzero
+        ldb = y_n
+        call dgemm('N','N',nr,y_n,x_n,done,x%v,lda,y%v,ldb,dzero,res,lda)
+      end if
    class default
-      res = x%dot_col(nr,y%v)
+      res = x%prod(nr,y%v,trans)
    end select

-  end function d_base_mlv_dot_col_v
+  end function d_base_mlv_prod_v

  !
-  ! Base workhorse is good old BLAS1
+  ! Multivectors product base
  !
  !
  !> Function     base_mlv_dot_col_a
  !! \memberof    psb_d_base_multivect_type
-  !! \brief         Col-by-col mult using dot product by a normal array
+  !! \brief       Product by an array(:,:)
  !! \param nr    Number of rows to be considered
-  !! \param y(:,:)  The array to be multiplied by
+  !! \param y     The other array(:,:) to be multiplied by
+  !! \param trans If true, x is transposed
  !! \param res   Result vector
  !!
-  function d_base_mlv_dot_col_a(nr,x,y) result(res)
+  function d_base_mlv_prod_a(nr,x,y,trans) result(res)
+    implicit none
    class(psb_d_base_multivect_type), intent(inout) :: x
    real(psb_dpk_), intent(in)                      :: y(:,:)
+    integer(psb_ipk_), intent(in)                   :: nr
+    logical, optional, intent(in)                   :: trans
    real(psb_dpk_), allocatable                     :: res(:,:)
-    real(psb_dpk_), external                        :: ddot
-    integer(psb_ipk_)                               :: i, j, n_x, n_y
+    external                                        :: dgemm
+    integer(psb_ipk_)                               :: x_n, y_n, lda, ldb

    if (x%is_dev()) call x%sync()
-    n_x = psb_size(x%v,2_psb_ipk_)
-    n_y = size(y,2_psb_ipk_)
-    allocate(res(n_x,n_y))
-    do i=1,n_x
-      do j=1,n_y
-        res(i,j) = ddot(nr,x%v(1:nr,i),1,y(1:nr,j),1)
-      end do
-    end do
+    x_n = x%get_ncols()
+    y_n = size(y,dim=2)
+    lda = x%get_nrows()
+    if (trans) then
+      allocate(res(x_n,y_n))
+      res = dzero
+      ldb = size(y,dim=1)
+      call dgemm('T','N',x_n,y_n,nr,done,x%v,lda,y,ldb,dzero,res,x_n)
+    else
+      allocate(res(x%get_nrows(),y_n))
+      res = dzero
+      ldb = x_n
+      call dgemm('N','N',nr,y_n,x_n,done,x%v,lda,y,ldb,dzero,res,lda)
+    end if

-  end function d_base_mlv_dot_col_a
+  end function d_base_mlv_prod_a

  !
-  ! Row Dot products
+  ! Dot products
  !
  !
-  !> Function   base_mlv_dot_col_v
+  !> Function   base_mlv_dot_v
  !! \memberof  psb_d_base_multivect_type
-  !! \brief     Row-by-col mult using dot product by mlv
+  !! \brief     Dot product by another base_mlv_vector
  !! \param nr  Number of rows to be considered
  !! \param y   The other (base_mlv_vect) to be multiplied by
-  !! \param res Result vector
+  !! \param res Result matrix
  !!
-  function d_base_mlv_dot_row_v(nr,x,y) result(res)
+  function d_base_mlv_dot_v(nr,x,y) result(res)
    implicit none
    class(psb_d_base_multivect_type), intent(inout) :: x, y
    integer(psb_ipk_), intent(in)                   :: nr
@ -2918,32 +2929,33 @@ contains
      if (y%is_dev()) call y%sync()
      n_x = psb_size(x%v,2_psb_ipk_)
      n_y = psb_size(y%v,2_psb_ipk_)
-      allocate(res(nr,n_y))
-      do i=1,nr
+      allocate(res(n_x,n_y))
+      do i=1,n_x
        do j=1,n_y
-          res(i,j) = ddot(n_x,x%v(i,:),1,y%v(:,j),1)
+          res(i,j) = ddot(nr,x%v(1:nr,i),1,y%v(1:nr,j),1)
        end do
      end do
    class default
-      res = x%dot_row(nr,y%v)
+      res = x%dot_col(nr,y%v)
    end select

-  end function d_base_mlv_dot_row_v
+  end function d_base_mlv_dot_v

  !
  ! Base workhorse is good old BLAS1
  !
  !
-  !> Function       base_mlv_dot_row_a
+  !> Function       base_mlv_dot_a
  !! \memberof      psb_d_base_multivect_type
-  !! \brief         Row-by-col mult using dot product by a normal array
+  !! \brief         Dot product by a normal array
  !! \param nr      Number of rows to be considered
  !! \param y(:,:)  The array to be multiplied by
-  !! \param res     Result vector
+  !! \param res     Result matrix
  !!
-  function d_base_mlv_dot_row_a(nr,x,y) result(res)
+  function d_base_mlv_dot_a(nr,x,y) result(res)
    class(psb_d_base_multivect_type), intent(inout) :: x
    real(psb_dpk_), intent(in)                      :: y(:,:)
+    integer(psb_ipk_), intent(in)                   :: nr
    real(psb_dpk_), allocatable                     :: res(:,:)
    real(psb_dpk_), external                        :: ddot
    integer(psb_ipk_)                               :: i, j, n_x, n_y
@ -2951,15 +2963,14 @@ contains
    if (x%is_dev()) call x%sync()
    n_x = psb_size(x%v,2_psb_ipk_)
    n_y = size(y,2_psb_ipk_)
-    allocate(res(psb_size(x%v,1_psb_ipk_),n_y))
-    do i=1,nr
+    allocate(res(n_x,n_y))
+    do i=1,n_x
      do j=1,n_y
-        res(i,j) = ddot(n_x,x%v(i,:),1,y(:,j),1)
+        res(i,j) = ddot(nr,x%v(1:nr,i),1,y(1:nr,j),1)
      end do
    end do

-  end function d_base_mlv_dot_row_a
-
+  end function d_base_mlv_dot_a
  !
  ! AXPBY is invoked via Y, hence the structure below.
  !
--- a/base/modules/serial/psb_d_vect_mod.F90
+++ b/base/modules/serial/psb_d_vect_mod.F90
@ -1387,11 +1387,14 @@ module psb_d_multivect_mod
    procedure, pass(x) :: set_vect => d_vect_set_vect
    generic, public    :: set      => set_vect, set_scal
    !
-    ! Dot product and AXPBY
+    ! Product, dot-product and AXPBY
    !
-    procedure, pass(x) :: dot_col_v => d_vect_dot_col_v
-    procedure, pass(x) :: dot_col_a => d_vect_dot_col_a
-    generic, public    :: dot_col   => dot_col_v, dot_col_a
+    procedure, pass(x) :: prod_v    => d_vect_prod_v
+    procedure, pass(x) :: prod_a    => d_vect_prod_a
+    generic, public    :: prod      => prod_v, prod_a
+    procedure, pass(x) :: dot_v     => d_vect_dot_v
+    procedure, pass(x) :: dot_a     => d_vect_dot_a
+    generic, public    :: dot       => dot_v, dot_a
    procedure, pass(x) :: dot_row_v => d_vect_dot_row_v
    procedure, pass(x) :: dot_row_a => d_vect_dot_row_a
    generic, public    :: dot_row   => dot_row_v, dot_row_a
@ -1858,41 +1861,43 @@ contains
    call move_alloc(tmp,x%v)
  end subroutine d_vect_cnv

-  function d_vect_dot_col_v(nr,x,y) result(res)
+  function d_vect_prod_v(nr,x,y,trans) result(res)
    implicit none
    class(psb_d_multivect_type), intent(inout) :: x, y
    integer(psb_ipk_), intent(in)              :: nr
+    logical, optional, intent(in)              :: trans
    real(psb_dpk_), allocatable                :: res(:,:)

    if (allocated(x%v).and.allocated(y%v)) &
-         & res = x%v%dot_col(nr,y%v)
+         & res = x%v%prod(nr,y%v,trans)

-  end function d_vect_dot_col_v
+  end function d_vect_prod_v

-  function d_vect_dot_col_a(nr,x,y) result(res)
+  function d_vect_prod_a(nr,x,y,trans) result(res)
    implicit none
    class(psb_d_multivect_type), intent(inout) :: x
    real(psb_dpk_), intent(in)                 :: y(:,:)
    integer(psb_ipk_), intent(in)              :: nr
+    logical, optional, intent(in)              :: trans
    real(psb_dpk_), allocatable                :: res(:,:)

    if (allocated(x%v)) &
-         & res = x%v%dot_col(nr,y)
+         & res = x%v%prod(nr,y,trans)

-  end function d_vect_dot_col_a
+  end function d_vect_prod_a

-  function d_vect_dot_row_v(nr,x,y) result(res)
+  function d_vect_dot_v(nr,x,y) result(res)
    implicit none
    class(psb_d_multivect_type), intent(inout) :: x, y
    integer(psb_ipk_), intent(in)              :: nr
    real(psb_dpk_), allocatable                :: res(:,:)

    if (allocated(x%v).and.allocated(y%v)) &
-         & res = x%v%dot_row(nr,y%v)
+         & res = x%v%dot(nr,y%v)

-  end function d_vect_dot_row_v
+  end function d_vect_dot_v

-  function d_vect_dot_row_a(nr,x,y) result(res)
+  function d_vect_dot_a(nr,x,y) result(res)
    implicit none
    class(psb_d_multivect_type), intent(inout) :: x
    real(psb_dpk_), intent(in)                 :: y(:,:)
@ -1900,9 +1905,9 @@ contains
    real(psb_dpk_), allocatable                :: res(:,:)

    if (allocated(x%v)) &
-         & res = x%v%dot_row(nr,y)
+         & res = x%v%dot(nr,y)

-  end function d_vect_dot_row_a
+  end function d_vect_dot_a

  subroutine d_vect_axpby_v(m,alpha, x, beta, y, info)
    use psi_serial_mod
--- a/base/psblas/Makefile
+++ b/base/psblas/Makefile
@ -2,7 +2,7 @@ include ../../Make.inc

 #FCOPT=-O2
 OBJS= psb_ddot.o psb_damax.o psb_dasum.o psb_daxpby.o psb_dqrfact.o\
-        psb_dnrm2.o psb_dnrmi.o psb_dspmm.o psb_dspsm.o\
+    psb_dprod.o psb_dnrm2.o psb_dnrmi.o psb_dspmm.o psb_dspsm.o\
 	psb_sspnrm1.o psb_dspnrm1.o psb_cspnrm1.o psb_zspnrm1.o \
 	psb_zamax.o psb_zasum.o psb_zaxpby.o psb_zdot.o \
 	psb_znrm2.o psb_znrmi.o psb_zspmm.o psb_zspsm.o\
--- a/base/psblas/psb_ddot.f90
+++ b/base/psblas/psb_ddot.f90
@ -158,7 +158,7 @@ function psb_ddot_vect(x, y, desc_a,info,global) result(res)

 end function psb_ddot_vect
 !
-! Function: psb_ddot_multivect_col_v
+! Function: psb_ddot_multivect
 !    psb_ddot computes the col-by-col dot product of two distributed vectors,
 !
 !    dot := ( X )**C * ( Y )
@ -175,14 +175,14 @@ end function psb_ddot_vect
 !        they are declared INOUT because of the sync() methods. 
 !
 !
-function psb_ddot_multivect_col_v(x, y, desc_a,info,global) result(res)
+function psb_ddot_multivect(x, y, desc_a,info,global) result(res)
  use psb_desc_mod
  use psb_d_base_mat_mod
  use psb_check_mod
  use psb_error_mod
  use psb_penv_mod
  use psb_d_vect_mod
-  use psb_d_psblas_mod, psb_protect_name => psb_ddot_multivect_col_v
+  use psb_d_psblas_mod, psb_protect_name => psb_ddot_multivect
  implicit none 
  real(psb_dpk_), allocatable               :: res(:,:)
  type(psb_d_multivect_type), intent(inout) :: x, y
@ -193,7 +193,7 @@ function psb_ddot_multivect_col_v(x, y, desc_a,info,global) result(res)
  ! locals
  type(psb_ctxt_type) :: ctxt
  integer(psb_ipk_) :: np, me, idx, ndm,&
-       & err_act, iix, jjx, iiy, jjy, i, j, k, nr
+       & err_act, iix, jjx, iiy, jjy, i, j, nr
  integer(psb_lpk_) :: ix, ijx, iy, ijy, m
  logical :: global_
  character(len=20)      :: name, ch_err
@ -235,7 +235,6 @@ function psb_ddot_multivect_col_v(x, y, desc_a,info,global) result(res)
  iy = ione
  ijy = ione

-  ! TODO
  m = desc_a%get_global_rows()

  ! check vector correctness
@ -268,136 +267,16 @@ function psb_ddot_multivect_col_v(x, y, desc_a,info,global) result(res)
        do i=1,size(desc_a%ovrlap_elem,1)
          idx = desc_a%ovrlap_elem(i,1)
          ndm = desc_a%ovrlap_elem(i,2)
-          res(idx,j) = res(idx,j) - (real(ndm-1)/real(ndm))*(x%v%v(idx,j)*y%v%v(idx,j))
+          res(j,:) = res(j,:) - (real(ndm-1)/real(ndm))*(x%v%v(idx,:)*y%v%v(idx,:))
        end do
      end do
    end if
  else
-    allocate(res(size(x%v%v,2_psb_ipk_),size(y%v%v,2_psb_ipk_)))
+    allocate(res(x%get_ncols(),y%get_ncols()))
    res = dzero
  end if

-  ! TODO compute global sum
-  if (global_) call psb_sum(ctxt, res)
-
-  call psb_erractionrestore(err_act)
-  return  
-
-9999 call psb_error_handler(ctxt,err_act)
-
-  return
-
-end function psb_ddot_multivect_col_v
-!
-! Function: psb_ddot_multivect_row_a
-!    psb_ddot computes the row-by-col dot product of two distributed vectors,
-!
-!    dot := ( X )**C * ( Y )
-!
-!
-! Arguments:
-!    x      -  type(psb_d_multivect_type) The input vector containing the entries of sub( X ).
-!    y      -  real(psb_dpk_)(:,:)        The input vector containing the entries of sub( Y ).
-!    desc_a -  type(psb_desc_type).       The communication descriptor.
-!    info   -  integer.                   Return code
-!    global -  logical(optional)          Whether to perform the global sum, default: .true.
-!
-!  Note: from a functional point of view, X and Y are input, but here
-!        they are declared INOUT because of the sync() methods. 
-!
-!
-function psb_ddot_multivect_row_a(x, y, desc_a, info, global) result(res)
-  use psb_desc_mod
-  use psb_d_base_mat_mod
-  use psb_check_mod
-  use psb_error_mod
-  use psb_penv_mod
-  use psb_d_vect_mod
-  use psb_d_psblas_mod, psb_protect_name => psb_ddot_multivect_row_a
-  implicit none 
-  real(psb_dpk_), allocatable               :: res(:,:)
-  type(psb_d_multivect_type), intent(inout) :: x
-  real(psb_dpk_), intent(in)                :: y(:,:)
-  type(psb_desc_type), intent(in)           :: desc_a
-  integer(psb_ipk_), intent(out)            :: info
-  logical, intent(in), optional             :: global
-
-  ! locals
-  type(psb_ctxt_type) :: ctxt
-  integer(psb_ipk_) :: np, me, idx, ndm,&
-       & err_act, iix, jjx, i, j, nr
-  integer(psb_lpk_) :: ix, ijx, iy, ijy, m
-  logical :: global_
-  character(len=20)      :: name, ch_err
-
-  name='psb_ddot_multivect'
-  info=psb_success_
-  call psb_erractionsave(err_act)
-  if (psb_errstatus_fatal()) then
-    info = psb_err_internal_error_ ;    goto 9999
-  end if
-
-  ctxt=desc_a%get_context()
-  call psb_info(ctxt, me, np)
-  if (np == -ione) then
-    info = psb_err_context_error_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-  if (.not.allocated(x%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (present(global)) then
-    global_ = global
-  else
-    global_ = .true.
-  end if
-
-  ix = ione
-  ijx = ione
-
-  m = desc_a%get_global_rows()
-
-  ! check vector correctness
-  call psb_chkvect(m,x%get_ncols(),x%get_nrows(),ix,ijx,desc_a,info,iix,jjx)
-  if(info /= psb_success_) then
-    info=psb_err_from_subroutine_
-    ch_err='psb_chkvect'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  if (iix /= ione) then
-    info=psb_err_ix_n1_iy_n1_unsupported_
-    call psb_errpush(info,name)
-    goto 9999
-  end if
-
-  nr = desc_a%get_local_rows() 
-  if(nr > 0) then
-
-    res = x%dot_row(nr,y)
-
-    ! TODO adjust dot_local because overlapped elements are computed more than once
-    if (size(desc_a%ovrlap_elem,1)>0) then
-      if (x%v%is_dev()) call x%sync()
-      do j=1,x%get_ncols()
-        do i=1,size(desc_a%ovrlap_elem,1)
-          idx = desc_a%ovrlap_elem(i,1)
-          ndm = desc_a%ovrlap_elem(i,2)
-          res(idx,j) = res(idx,j) - (real(ndm-1)/real(ndm))*(x%v%v(idx,j)*y(idx,j))
-        end do
-      end do
-    end if
-  else
-    allocate(res(nr,size(y,2_psb_ipk_)))
-    res = dzero
-  end if
-
-  ! TODO compute global sum
+  ! compute global sum
  if (global_) call psb_sum(ctxt, res)

  call psb_erractionrestore(err_act)
@ -407,7 +286,7 @@ function psb_ddot_multivect_row_a(x, y, desc_a, info, global) result(res)

  return

-end function psb_ddot_multivect_row_a
+end function psb_ddot_multivect
 !
 ! Function: psb_ddot
 !    psb_ddot computes the dot product of two distributed vectors,
--- a/base/psblas/psb_dnrm2.f90
+++ b/base/psblas/psb_dnrm2.f90
@ -473,7 +473,6 @@ function psb_dnrm2_multivect(x, desc_a, info,global)  result(res)
    res = dzero
  end if

-  ! TODO
  if (global_) call psb_nrm2(ctxt,res)

  call psb_erractionrestore(err_act)
--- a/base/psblas/psb_dprod.f90
+++ b/base/psblas/psb_dprod.f90
@ -0,0 +1,298 @@
+!
+!                Parallel Sparse BLAS  version 3.5
+!      (C) Copyright 2006-2018
+!        Salvatore Filippone
+!        Alfredo Buttari
+!
+!    Redistribution and use in source and binary forms, with or without
+!    modification, are permitted provided that the following conditions
+!    are met:
+!      1. Redistributions of source code must retain the above copyright
+!         notice, this list of conditions and the following disclaimer.
+!      2. Redistributions in binary form must reproduce the above copyright
+!         notice, this list of conditions, and the following disclaimer in the
+!         documentation and/or other materials provided with the distribution.
+!      3. The name of the PSBLAS group or the names of its contributors may
+!         not be used to endorse or promote products derived from this
+!         software without specific written permission.
+!
+!    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+!    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+!    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+!    PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
+!    BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+!    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+!    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+!    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+!    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+!    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+!    POSSIBILITY OF SUCH DAMAGE.
+!
+!
+! File: psb_dprod.f90
+!
+! Function: psb_dprod_multivect
+!    psb_dprod computes the product of two distributed multivectors,
+!
+!    prod := ( X ) * ( Y ) or
+!    prod := ( X )**C * ( Y )
+!
+!
+! Arguments:
+!    x      -  type(psb_d_multivect_type) The input vector containing the entries of sub( X ).
+!    y      -  type(psb_d_multivect_type) The input vector containing the entries of sub( Y ).
+!    desc_a -  type(psb_desc_type).       The communication descriptor.
+!    info   -  integer.                   Return code
+!    trans  -  logical(optional)          Whether multivector X is transposed, default: .false.
+!    global -  logical(optional)          Whether to perform the global reduce, default: .true.
+!
+!  Note: from a functional point of view, X and Y are input, but here
+!        they are declared INOUT because of the sync() methods.
+!
+!
+function psb_dprod_multivect(x,y,desc_a,info,trans,global) result(res)
+   use psb_desc_mod
+   use psb_d_base_mat_mod
+   use psb_check_mod
+   use psb_error_mod
+   use psb_penv_mod
+   use psb_d_vect_mod
+   use psb_d_psblas_mod, psb_protect_name => psb_dprod_multivect
+   implicit none
+   real(psb_dpk_), allocatable               :: res(:,:)
+   type(psb_d_multivect_type), intent(inout) :: x, y
+   type(psb_desc_type), intent(in)           :: desc_a
+   integer(psb_ipk_), intent(out)            :: info
+   logical, intent(in), optional             :: trans
+   logical, intent(in), optional             :: global
+
+   ! locals
+   type(psb_ctxt_type) :: ctxt
+   integer(psb_ipk_) :: np, me, idx, ndm,&
+   & err_act, iix, jjx, iiy, jjy, i, j, nr
+   integer(psb_lpk_) :: ix, ijx, iy, ijy, m
+   logical :: global_, trans_
+   character(len=20)      :: name, ch_err
+
+   name='psb_dprod_multivect'
+   info=psb_success_
+   call psb_erractionsave(err_act)
+   if (psb_errstatus_fatal()) then
+      info = psb_err_internal_error_ ;    goto 9999
+   end if
+
+   ctxt=desc_a%get_context()
+   call psb_info(ctxt, me, np)
+   if (np == -ione) then
+      info = psb_err_context_error_
+      call psb_errpush(info,name)
+      goto 9999
+   endif
+   if (.not.allocated(x%v)) then
+      info = psb_err_invalid_vect_state_
+      call psb_errpush(info,name)
+      goto 9999
+   endif
+   if (.not.allocated(y%v)) then
+      info = psb_err_invalid_vect_state_
+      call psb_errpush(info,name)
+      goto 9999
+   endif
+
+   if (present(trans)) then
+      trans_ = trans
+   else
+      trans_ = .false.
+   end if
+
+   if (present(global)) then
+      global_ = global
+   else
+      global_ = .true.
+   end if
+
+   ix = ione
+   ijx = ione
+
+   iy = ione
+   ijy = ione
+
+   m = desc_a%get_global_rows()
+
+   ! check vector correctness
+   call psb_chkvect(m,x%get_ncols(),x%get_nrows(),ix,ijx,desc_a,info,iix,jjx)
+   if (info == psb_success_) &
+   & call psb_chkvect(m,y%get_ncols(),y%get_nrows(),iy,ijy,desc_a,info,iiy,jjy)
+   if(info /= psb_success_) then
+      info=psb_err_from_subroutine_
+      ch_err='psb_chkvect'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+   end if
+
+   if ((iix /= ione).or.(iiy /= ione)) then
+      info=psb_err_ix_n1_iy_n1_unsupported_
+      call psb_errpush(info,name)
+      goto 9999
+   end if
+
+   nr = desc_a%get_local_rows()
+   if (nr > 0) then
+      res = x%prod(nr,y,trans_)
+
+      ! TODO adjust the local product because overlapped elements are computed more than once
+      if (size(desc_a%ovrlap_elem,1)>0) then
+         if (x%v%is_dev()) call x%sync()
+         if (y%v%is_dev()) call y%sync()
+         do j=1,x%get_ncols()
+            do i=1,size(desc_a%ovrlap_elem,1)
+               idx = desc_a%ovrlap_elem(i,1)
+               ndm = desc_a%ovrlap_elem(i,2)
+               res(j,:) = res(j,:) - (real(ndm-1)/real(ndm))*(x%v%v(idx,:)*y%v%v(idx,:))
+            end do
+         end do
+      end if
+   else
+      res = dzero
+   end if
+
+   ! TODO: it might be better for global to default to .false.
+   ! compute global sum
+   if (global_) call psb_sum(ctxt, res)
+
+   call psb_erractionrestore(err_act)
+   return
+
+9999 call psb_error_handler(ctxt,err_act)
+
+   return
+
+end function psb_dprod_multivect
+!
+! Function: psb_dprod_multivect_a
+!    psb_dprod computes the product of two distributed multivectors,
+!
+!    prod := ( X ) * ( Y ) or
+!    prod := ( X )**C * ( Y )
+!
+!
+! Arguments:
+!    x      -  type(psb_d_multivect_type) The input vector containing the entries of sub( X ).
+!    y      -  real(psb_dpk_)(:,:)        The input array containing the entries of sub( Y ).
+!    desc_a -  type(psb_desc_type).       The communication descriptor.
+!    info   -  integer.                   Return code
+!    trans  -  logical(optional)          Whether multivector X is transposed, default: .false.
+!    global -  logical(optional)          Whether to perform the global reduce, default: .true.
+!
+!  Note: from a functional point of view, X is input, but here it is
+!        declared INOUT because of the sync() methods; Y is a plain INTENT(IN) array.
+!
+!
+function psb_dprod_multivect_a(x,y,desc_a,info,trans,global) result(res)
+    use psb_desc_mod
+    use psb_d_base_mat_mod
+    use psb_check_mod
+    use psb_error_mod
+    use psb_penv_mod
+    use psb_d_vect_mod
+    use psb_d_psblas_mod, psb_protect_name => psb_dprod_multivect_a
+    implicit none
+    real(psb_dpk_), allocatable               :: res(:,:)
+    type(psb_d_multivect_type), intent(inout) :: x
+    real(psb_dpk_), intent(in)                :: y(:,:)
+    type(psb_desc_type), intent(in)           :: desc_a
+    integer(psb_ipk_), intent(out)            :: info
+    logical, intent(in), optional             :: trans
+    logical, intent(in), optional             :: global
+ 
+    ! locals
+    type(psb_ctxt_type) :: ctxt
+    integer(psb_ipk_) :: np, me, idx, ndm,&
+    & err_act, iix, jjx, iiy, jjy, i, j, nr
+    integer(psb_lpk_) :: ix, ijx, iy, ijy, m
+    logical :: global_, trans_
+    character(len=20)      :: name, ch_err
+ 
+    name='psb_dprod_multivect'
+    info=psb_success_
+    call psb_erractionsave(err_act)
+    if (psb_errstatus_fatal()) then
+       info = psb_err_internal_error_ ;    goto 9999
+    end if
+ 
+    ctxt=desc_a%get_context()
+    call psb_info(ctxt, me, np)
+    if (np == -ione) then
+       info = psb_err_context_error_
+       call psb_errpush(info,name)
+       goto 9999
+    endif
+    if (.not.allocated(x%v)) then
+       info = psb_err_invalid_vect_state_
+       call psb_errpush(info,name)
+       goto 9999
+    endif
+ 
+    if (present(trans)) then
+       trans_ = trans
+    else
+       trans_ = .false.
+    end if
+ 
+    if (present(global)) then
+       global_ = global
+    else
+       global_ = .true.
+    end if
+ 
+    ix = ione
+    ijx = ione
+ 
+    m = desc_a%get_global_rows()
+ 
+    ! check vector correctness
+    call psb_chkvect(m,x%get_ncols(),x%get_nrows(),ix,ijx,desc_a,info,iix,jjx)
+    if(info /= psb_success_) then
+       info=psb_err_from_subroutine_
+       ch_err='psb_chkvect'
+       call psb_errpush(info,name,a_err=ch_err)
+       goto 9999
+    end if
+ 
+    if ((iix /= ione)) then
+       info=psb_err_ix_n1_iy_n1_unsupported_
+       call psb_errpush(info,name)
+       goto 9999
+    end if
+ 
+    nr = desc_a%get_local_rows()
+    if (nr > 0) then
+       res = x%prod(nr,y,trans_)
+ 
+       ! TODO adjust the local product because overlapped elements are computed more than once
+       if (size(desc_a%ovrlap_elem,1)>0) then
+          if (x%v%is_dev()) call x%sync()
+          do j=1,x%get_ncols()
+             do i=1,size(desc_a%ovrlap_elem,1)
+                idx = desc_a%ovrlap_elem(i,1)
+                ndm = desc_a%ovrlap_elem(i,2)
+                res(j,:) = res(j,:) - (real(ndm-1)/real(ndm))*(x%v%v(idx,:)*y(idx,:))
+             end do
+          end do
+       end if
+    else
+       res = dzero
+    end if
+ 
+    ! TODO
+    ! compute global sum
+    if (global_) call psb_sum(ctxt, res)
+ 
+    call psb_erractionrestore(err_act)
+    return
+ 
+ 9999 call psb_error_handler(ctxt,err_act)
+ 
+    return
+ 
+ end function psb_dprod_multivect_a
--- a/base/psblas/psb_dqrfact.f90
+++ b/base/psblas/psb_dqrfact.f90
@ -20,9 +20,10 @@ function psb_dqrfact(x, desc_a, info) result(res)

   ! locals
   type(psb_ctxt_type) :: ctxt
-   integer(psb_ipk_) :: np, me, err_act, iix, jjx
-   integer(psb_lpk_) :: ix, ijx, m
+   integer(psb_ipk_) :: np, me, err_act, iix, jjx, i
+   integer(psb_lpk_) :: ix, ijx, m, n
   character(len=20) :: name, ch_err
+   real(psb_dpk_), allocatable :: temp(:,:)

   name='psb_dgqrfact'
   if (psb_errstatus_fatal()) return
@ -47,8 +48,9 @@ function psb_dqrfact(x, desc_a, info) result(res)
   ijx = ione

   m = desc_a%get_global_rows()
+   n = x%get_ncols()

-   call psb_chkvect(m,x%get_ncols(),x%get_nrows(),ix,ijx,desc_a,info,iix,jjx)
+   call psb_chkvect(m,n,x%get_nrows(),ix,ijx,desc_a,info,iix,jjx)
   if(info /= psb_success_) then
      info=psb_err_from_subroutine_
      ch_err='psb_chkvect'
@ -61,11 +63,20 @@ function psb_dqrfact(x, desc_a, info) result(res)
      call psb_errpush(info,name)
   end if

-   ! TODO serial?
-   if(desc_a%get_local_rows() > 0) then
+   call psb_gather(temp,x,desc_a,info,root=psb_root_)   ! NOTE(review): info is not checked before the next PSBLAS call
+
+   if (me == psb_root_) then   ! only the root process calls qr_fact
+      call x%set(temp)   ! load the gathered array into x before factorizing
      res = x%qr_fact(info)
+      call psb_bcast(ctxt,res)   ! both branches call psb_bcast on res
+   else
+      allocate(res(n,n))   ! buffer for the broadcast; n = x%get_ncols()
+      call psb_bcast(ctxt,res)   ! counterpart of the psb_bcast call in the root branch
   end if

+   temp = x%get_vect()   ! executed on every process, not only on root
+   call psb_scatter(temp,x,desc_a,info,root=psb_root_)   ! presumably redistributes root's data into x - confirm
+
   call psb_erractionrestore(err_act)
   return

--- a/krylov/psb_dbgmres.f90
+++ b/krylov/psb_dbgmres.f90
@ -61,7 +61,7 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
   integer(psb_ipk_), Optional, Intent(out)  :: iter
   real(psb_dpk_), Optional, Intent(out)     :: err

-   real(psb_dpk_), allocatable               :: aux(:), h(:,:), beta(:,:), temp(:,:)
+   real(psb_dpk_), allocatable               :: aux(:), h(:,:), beta(:,:), beta_e1(:,:)

   type(psb_d_multivect_type), allocatable   :: v(:)
   type(psb_d_multivect_type)                :: v_tot, w
@ -72,7 +72,7 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
   integer(psb_ipk_)                         :: litmax, naux, itrace_, n_row, n_col, nrhs, nrep
   integer(psb_lpk_)                         :: mglob, n_add

-   integer(psb_ipk_)                         :: i, j, istop_, err_act, idx_i, idx_j, idx
+   integer(psb_ipk_)                         :: i, j, k, istop_, err_act, idx_i, idx_j, idx
   integer(psb_ipk_)                         :: debug_level, debug_unit

   type(psb_ctxt_type)                       :: ctxt
@ -165,14 +165,14 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
      goto 9999
   end if

-   naux=4*n_col
+   naux = 4*n_col
   nrhs = x%get_ncols()
   allocate(aux(naux),h((nrep+1)*nrhs,nrep*nrhs),stat=info)
   if (info == psb_success_) call psb_geall(v,desc_a,info,m=nrep+1,n=nrhs)
-   if (info == psb_success_) call psb_geall(v_tot,desc_a,info,n=(nrep+1)*nrhs)
+   if (info == psb_success_) call psb_geall(v_tot,desc_a,info,n=nrep*nrhs)
   if (info == psb_success_) call psb_geall(w,desc_a,info,n=nrhs)
   if (info == psb_success_) call psb_geasb(v,desc_a,info,mold=x%v,n=nrhs)
-   if (info == psb_success_) call psb_geasb(v_tot,desc_a,info,mold=x%v,n=(nrep+1)*nrhs)
+   if (info == psb_success_) call psb_geasb(v_tot,desc_a,info,mold=x%v,n=nrep*nrhs)
   if (info == psb_success_) call psb_geasb(w,desc_a,info,mold=x%v,n=nrhs)
   if (info /= psb_success_) then
      info=psb_err_from_subroutine_non_
@ -208,7 +208,7 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,

   ! BGMRES algorithm

-   ! TODO inserire timer operazioni tra mlv
+   ! TODO QR factorization is serial for now

   ! STEP 1: Compute R(0) = B - A*X(0)

@ -228,7 +228,6 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
      goto 9999
   end if

-   ! TODO gather su root e poi scatter
   ! STEP 2: Compute QR_fact(R(0))
   beta = psb_geqrfact(v(1),desc_a,info)
   if (info /= psb_success_) then
@ -279,42 +278,32 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
      ! STEP 5: Inner loop
      inner: do i=1,j

-         write(*,*) 'PROC ', me, ' LOOOP 1', i
-
         ! Compute i index for H operations
         idx_i = (i-1)*nrhs+1

         ! STEP 6: Compute H(i,j) = V(i)_T*W
-         h(idx_i:idx_i+n_add,idx_j:idx_j+n_add) = psb_gedot(v(i),w,desc_a,info)
+         h(idx_i:idx_i+n_add,idx_j:idx_j+n_add) = psb_geprod(v(i),w,desc_a,info,trans=.true.)
         if (info /= psb_success_) then
            info=psb_err_from_subroutine_non_
            call psb_errpush(info,name)
            goto 9999
         end if

-         write(*,*) 'PROC ', me, ' LOOOP 2', i
-
         ! STEP 7: Compute W = W - V(i)*H(i,j)
-         temp = psb_gedot(v(i),h(idx_i:idx_i+n_add,idx_j:idx_j+n_add),desc_a,info)
-         !call psb_geaxpby(-done,psb_gedot(v(i),h(idx_i:idx_i+n_add,idx_j:idx_j+n_add),desc_a,info),done,w,desc_a,info)

-         write(*,*) 'PROC ', me, ' LOOOP 3', i
-
-         call psb_geaxpby(-done,temp,done,w,desc_a,info)
+         ! TODO does it hang with large NRHS? NOTE(review): the call below passes info to both psb_geprod and psb_geaxpby
+         !temp = psb_geprod(v(i),h(idx_i:idx_i+n_add,idx_j:idx_j+n_add),desc_a,info,global=.false.)
+         call psb_geaxpby(-done,psb_geprod(v(i),h(idx_i:idx_i+n_add,idx_j:idx_j+n_add),desc_a,info,global=.false.),done,w,desc_a,info)
         if (info /= psb_success_) then
            info=psb_err_from_subroutine_non_
            call psb_errpush(info,name)
            goto 9999
         end if

-         write(*,*) 'PROC ', me, ' LOOOP 4', i
-
      end do inner

      ! STEP 8: Compute QR_fact(W)

-      !write(*,*) 'PROC ', me, ' BBBB ', j
-
      ! Store R in H(j+1,j)
      h(idx_j+nrhs:idx_j+nrhs+n_add,idx_j:idx_j+n_add) = psb_geqrfact(w,desc_a,info)
      if (info /= psb_success_) then
@ -331,8 +320,6 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
         goto 9999
      end if

-      write(*,*) 'PROC ', me, ' AAAA'
-
   end do outer

   ! STEP 9: Compute Y(m)
@ -343,15 +330,15 @@ subroutine psb_dbgmres_multivect(a, prec, b, x, eps, desc_a, info, itmax, iter,
      goto 9999
   end if

-   ! TODO Va vene che ci siano altre righe perchè poi si passa localrows
+   ! TODO does V_tot include V(nrep+1)?
   ! STEP 10: Compute V = {V(1),...,V(m)}
-   do i=1,nrep+1
+   do i=1,nrep   ! copies only blocks V(1)..V(nrep) into v_tot
      idx = (i-1)*nrhs+1
      v_tot%v%v(1:n_row,idx:idx+n_add) = v(i)%v%v(1:n_row,1:nrhs)
   enddo

   ! STEP 11: X(m) = X(0) + V*Y(m)
-   call psb_geaxpby(done,psb_gedot(v_tot,h,desc_a,info),done,x,desc_a,info)
+   call psb_geaxpby(done,psb_geprod(v_tot,beta_e1,desc_a,info,global=.false.),done,x,desc_a,info)
   if (info /= psb_success_) then
      info=psb_err_from_subroutine_non_
      call psb_errpush(info,name)
@ -389,7 +376,7 @@ contains
      implicit none

      integer(psb_ipk_)           :: lwork
-      real(psb_dpk_), allocatable :: work(:), beta_e1(:,:)
+      real(psb_dpk_), allocatable :: work(:), beta_temp(:,:)

      integer(psb_ipk_) :: m_h, n_h, mn

@ -401,14 +388,19 @@ contains
      allocate(work(lwork))

      ! Compute E1*beta
-      allocate(beta_e1(m_h,nrhs))
-      beta_e1 = dzero
-      beta_e1(1:nrhs,1:nrhs) = beta
+      allocate(beta_temp(m_h,nrhs))   ! m_h x nrhs array passed to dgels below
+      beta_temp = dzero
+      beta_temp(1:nrhs,1:nrhs) = beta

      ! Compute min Frobenius norm
-      call dgels('N',m_h,n_h,nrhs,h,m_h,beta_e1,m_h,work,lwork,info)
+      call dgels('N',m_h,n_h,nrhs,h,m_h,beta_temp,m_h,work,lwork,info)   ! NOTE(review): info unchecked here
+
+      ! Set solution: keep the first n_h rows of the dgels output (see LAPACK dgels, B on exit)
+      allocate(beta_e1(n_h,nrhs))
+      beta_e1 = beta_temp(1:n_h,1:nrhs)

-      deallocate(work,beta_e1)
+      ! Deallocate work arrays; beta is released here as well
+      deallocate(work,beta,beta_temp)

      return

--- a/test/block_krylov/psb_dbf_sample.f90
+++ b/test/block_krylov/psb_dbf_sample.f90
@ -26,6 +26,8 @@ program psb_dbf_sample
   integer(psb_ipk_)                   :: m, nrhs
   real(psb_dpk_)                      :: random_value

+   real(psb_dpk_), allocatable :: test(:,:)   ! NOTE(review): not referenced in the visible hunks - confirm it is needed
+
   ! communications data structure
   type(psb_desc_type) :: desc_a
   type(psb_ctxt_type) :: ctxt
@ -128,9 +130,9 @@ program psb_dbf_sample
         b_mv_glob => aux_b(:,:)
         do i=1, m
            do j=1, nrhs
-               !b_mv_glob(i,j) = done
-               call random_number(random_value)
-               b_mv_glob(i,j) = random_value
+               b_mv_glob(i,j) = done
+               !call random_number(random_value)
+               !b_mv_glob(i,j) = random_value
            enddo
         enddo
      endif
@ -255,7 +257,6 @@ program psb_dbf_sample
      write(psb_out_unit,'("Residual norm 2:                    ",es12.5)')resmx
      write(psb_out_unit,'("Residual norm inf:                  ",es12.5)')resmxp
      write(psb_out_unit,'(" ")')
-      ! TODO
      ! do i=1,m
      !    write(psb_out_unit,993) i, x_mv_glob(i,:), r_mv_glob(i,:), b_mv_glob(i,:)
      ! enddo