From e2bd21d9bec0cd5b4cee4c411831feb39c5407c3 Mon Sep 17 00:00:00 2001
From: Salvatore Filippone <salvatore.filippone@cranfield.ac.uk>
Date: Sun, 15 Dec 2019 18:37:28 +0000
Subject: [PATCH] Update internal documentation.

---
 base/comm/internals/psi_covrl_restr.f90   |   7 +-
 base/comm/internals/psi_covrl_restr_a.f90 |   5 +
 base/comm/internals/psi_covrl_save.f90    |   5 +
 base/comm/internals/psi_covrl_save_a.f90  |   8 +-
 base/comm/internals/psi_covrl_upd.f90     |   8 +-
 base/comm/internals/psi_covrl_upd_a.f90   |   6 +
 base/comm/internals/psi_cswapdata.F90     |  31 +-
 base/comm/internals/psi_cswapdata_a.F90   |  20 +-
 base/comm/internals/psi_cswaptran.F90     |  35 +-
 base/comm/internals/psi_cswaptran_a.F90   |  18 +-
 base/comm/internals/psi_dovrl_restr.f90   |   7 +-
 base/comm/internals/psi_dovrl_restr_a.f90 |   5 +
 base/comm/internals/psi_dovrl_save.f90    |   5 +
 base/comm/internals/psi_dovrl_save_a.f90  |   8 +-
 base/comm/internals/psi_dovrl_upd.f90     |   8 +-
 base/comm/internals/psi_dovrl_upd_a.f90   |   6 +
 base/comm/internals/psi_dswapdata.F90     |  31 +-
 base/comm/internals/psi_dswapdata_a.F90   |  20 +-
 base/comm/internals/psi_dswaptran.F90     |  35 +-
 base/comm/internals/psi_dswaptran_a.F90   |  18 +-
 base/comm/internals/psi_eovrl_restr_a.f90 |   5 +
 base/comm/internals/psi_eovrl_save_a.f90  |   8 +-
 base/comm/internals/psi_eovrl_upd_a.f90   |   6 +
 base/comm/internals/psi_eswapdata_a.F90   |  20 +-
 base/comm/internals/psi_eswaptran_a.F90   |  18 +-
 base/comm/internals/psi_iovrl_restr.f90   |   7 +-
 base/comm/internals/psi_iovrl_save.f90    |   5 +
 base/comm/internals/psi_iovrl_upd.f90     |   8 +-
 base/comm/internals/psi_iswapdata.F90     |  31 +-
 base/comm/internals/psi_iswaptran.F90     |  35 +-
 base/comm/internals/psi_lovrl_restr.f90   |   7 +-
 base/comm/internals/psi_lovrl_save.f90    |   5 +
 base/comm/internals/psi_lovrl_upd.f90     |   8 +-
 base/comm/internals/psi_lswapdata.F90     |  31 +-
 base/comm/internals/psi_lswaptran.F90     |  35 +-
 base/comm/internals/psi_movrl_restr_a.f90 |   5 +
 base/comm/internals/psi_movrl_save_a.f90  |   8 +-
 base/comm/internals/psi_movrl_upd_a.f90   |   6 +
 base/comm/internals/psi_mswapdata_a.F90   |  20 +-
 base/comm/internals/psi_mswaptran_a.F90   |  18 +-
 base/comm/internals/psi_sovrl_restr.f90   |   7 +-
 base/comm/internals/psi_sovrl_restr_a.f90 |   5 +
 base/comm/internals/psi_sovrl_save.f90    |   5 +
 base/comm/internals/psi_sovrl_save_a.f90  |   8 +-
 base/comm/internals/psi_sovrl_upd.f90     |   8 +-
 base/comm/internals/psi_sovrl_upd_a.f90   |   6 +
 base/comm/internals/psi_sswapdata.F90     |  31 +-
 base/comm/internals/psi_sswapdata_a.F90   |  20 +-
 base/comm/internals/psi_sswaptran.F90     |  35 +-
 base/comm/internals/psi_sswaptran_a.F90   |  18 +-
 base/comm/internals/psi_zovrl_restr.f90   |   7 +-
 base/comm/internals/psi_zovrl_restr_a.f90 |   5 +
 base/comm/internals/psi_zovrl_save.f90    |   5 +
 base/comm/internals/psi_zovrl_save_a.f90  |   8 +-
 base/comm/internals/psi_zovrl_upd.f90     |   8 +-
 base/comm/internals/psi_zovrl_upd_a.f90   |   6 +
 base/comm/internals/psi_zswapdata.F90     |  31 +-
 base/comm/internals/psi_zswapdata_a.F90   |  20 +-
 base/comm/internals/psi_zswaptran.F90     |  35 +-
 base/comm/internals/psi_zswaptran_a.F90   |  18 +-
 base/comm/psb_cgather.f90                 |  12 +-
 base/comm/psb_cgather_a.f90               |   2 +-
 base/comm/psb_chalo.f90                   |  32 +-
 base/comm/psb_covrl.f90                   |  39 +-
 base/comm/psb_cscatter.F90                |  10 +-
 base/comm/psb_cscatter_a.F90              |   4 +-
 base/comm/psb_cspgather.F90               |  11 +
 base/comm/psb_dgather.f90                 |  12 +-
 base/comm/psb_dgather_a.f90               |   2 +-
 base/comm/psb_dhalo.f90                   |  32 +-
 base/comm/psb_dovrl.f90                   |  39 +-
 base/comm/psb_dscatter.F90                |  10 +-
 base/comm/psb_dscatter_a.F90              |   4 +-
 base/comm/psb_dspgather.F90               |  11 +
 base/comm/psb_egather_a.f90               |   2 +-
 base/comm/psb_escatter_a.F90              |   4 +-
 base/comm/psb_igather.f90                 |  12 +-
 base/comm/psb_ihalo.f90                   |  32 +-
 base/comm/psb_iovrl.f90                   |  39 +-
 base/comm/psb_iscatter.F90                |  10 +-
 base/comm/psb_ispgather.F90               |  11 +
 base/comm/psb_lgather.f90                 |  12 +-
 base/comm/psb_lhalo.f90                   |  32 +-
 base/comm/psb_lovrl.f90                   |  39 +-
 base/comm/psb_lscatter.F90                |  10 +-
 base/comm/psb_lspgather.F90               |  11 +
 base/comm/psb_mgather_a.f90               |   2 +-
 base/comm/psb_mscatter_a.F90              |   4 +-
 base/comm/psb_sgather.f90                 |  12 +-
 base/comm/psb_sgather_a.f90               |   2 +-
 base/comm/psb_shalo.f90                   |  32 +-
 base/comm/psb_sovrl.f90                   |  39 +-
 base/comm/psb_sscatter.F90                |  10 +-
 base/comm/psb_sscatter_a.F90              |   4 +-
 base/comm/psb_sspgather.F90               |  11 +
 base/comm/psb_zgather.f90                 |  12 +-
 base/comm/psb_zgather_a.f90               |   2 +-
 base/comm/psb_zhalo.f90                   |  32 +-
 base/comm/psb_zovrl.f90                   |  39 +-
 base/comm/psb_zscatter.F90                |  10 +-
 base/comm/psb_zscatter_a.F90              |   4 +-
 base/comm/psb_zspgather.F90               |  11 +
 base/psblas/psb_camax.f90                 |  22 +-
 base/psblas/psb_casum.f90                 |  13 +
 base/psblas/psb_caxpby.f90                |  19 +-
 base/psblas/psb_cdot.f90                  |  49 ++-
 base/psblas/psb_cnrm2.f90                 |  29 +-
 base/psblas/psb_cnrmi.f90                 |   5 +-
 base/psblas/psb_cspmm.f90                 | 441 +++++++++++-----------
 base/psblas/psb_cspnrm1.f90               |   1 +
 base/psblas/psb_cspsm.f90                 | 392 +++++++++----------
 base/psblas/psb_damax.f90                 |  22 +-
 base/psblas/psb_dasum.f90                 |  13 +
 base/psblas/psb_daxpby.f90                |  19 +-
 base/psblas/psb_ddot.f90                  |  49 ++-
 base/psblas/psb_dnrm2.f90                 |  29 +-
 base/psblas/psb_dnrmi.f90                 |   5 +-
 base/psblas/psb_dspmm.f90                 | 441 +++++++++++-----------
 base/psblas/psb_dspnrm1.f90               |   1 +
 base/psblas/psb_dspsm.f90                 | 392 +++++++++----------
 base/psblas/psb_samax.f90                 |  22 +-
 base/psblas/psb_sasum.f90                 |  13 +
 base/psblas/psb_saxpby.f90                |  19 +-
 base/psblas/psb_sdot.f90                  |  49 ++-
 base/psblas/psb_snrm2.f90                 |  29 +-
 base/psblas/psb_snrmi.f90                 |   5 +-
 base/psblas/psb_sspmm.f90                 | 441 +++++++++++-----------
 base/psblas/psb_sspnrm1.f90               |   1 +
 base/psblas/psb_sspsm.f90                 | 392 +++++++++----------
 base/psblas/psb_zamax.f90                 |  22 +-
 base/psblas/psb_zasum.f90                 |  13 +
 base/psblas/psb_zaxpby.f90                |  19 +-
 base/psblas/psb_zdot.f90                  |  49 ++-
 base/psblas/psb_znrm2.f90                 |  29 +-
 base/psblas/psb_znrmi.f90                 |   5 +-
 base/psblas/psb_zspmm.f90                 | 441 +++++++++++-----------
 base/psblas/psb_zspnrm1.f90               |   1 +
 base/psblas/psb_zspsm.f90                 | 392 +++++++++----------
 base/serial/psb_camax_s.f90               |   2 +-
 base/serial/psb_casum_s.f90               |   4 +-
 base/serial/psb_damax_s.f90               |   2 +-
 base/serial/psb_dasum_s.f90               |   4 +-
 base/serial/psb_samax_s.f90               |   2 +-
 base/serial/psb_sasum_s.f90               |   4 +-
 base/serial/psb_zamax_s.f90               |   2 +-
 base/serial/psb_zasum_s.f90               |   4 +-
 test/pargen/psb_d_pde3d.f90               |  50 +--
 test/pargen/runs/ppde.inp                 |   2 +-
 148 files changed, 3186 insertions(+), 2292 deletions(-)

diff --git a/base/comm/internals/psi_covrl_restr.f90 b/base/comm/internals/psi_covrl_restr.f90
index 98ecdd8b..996e85ad 100644
--- a/base/comm/internals/psi_covrl_restr.f90
+++ b/base/comm/internals/psi_covrl_restr.f90
@@ -28,8 +28,13 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_covrl_restr
+!   These subroutines restore the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
+!    
 !    
-
 subroutine  psi_covrl_restr_vect(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_covrl_restr_vect
   use psb_c_base_vect_mod
diff --git a/base/comm/internals/psi_covrl_restr_a.f90 b/base/comm/internals/psi_covrl_restr_a.f90
index 46eb0563..3c814a81 100644
--- a/base/comm/internals/psi_covrl_restr_a.f90
+++ b/base/comm/internals/psi_covrl_restr_a.f90
@@ -28,6 +28,11 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_covrl_restr
+!   These subroutines restore the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 !    
 subroutine  psi_covrl_restrr1(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_covrl_restrr1
diff --git a/base/comm/internals/psi_covrl_save.f90 b/base/comm/internals/psi_covrl_save.f90
index 41e65943..c48d2ade 100644
--- a/base/comm/internals/psi_covrl_save.f90
+++ b/base/comm/internals/psi_covrl_save.f90
@@ -28,6 +28,11 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_covrl_save
+!   These subroutines save the overlap region of a vector; they are used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 !    
 subroutine  psi_covrl_save_vect(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_covrl_save_vect
diff --git a/base/comm/internals/psi_covrl_save_a.f90 b/base/comm/internals/psi_covrl_save_a.f90
index d853db8d..d017d921 100644
--- a/base/comm/internals/psi_covrl_save_a.f90
+++ b/base/comm/internals/psi_covrl_save_a.f90
@@ -28,8 +28,12 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
-!    
-
+!
+!
+! Subroutine: psi_covrl_save
+!   These subroutines save the overlap region of a vector; they are used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 subroutine  psi_covrl_saver1(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_covrl_saver1
 
diff --git a/base/comm/internals/psi_covrl_upd.f90 b/base/comm/internals/psi_covrl_upd.f90
index f99cbb66..ce33845e 100644
--- a/base/comm/internals/psi_covrl_upd.f90
+++ b/base/comm/internals/psi_covrl_upd.f90
@@ -28,8 +28,14 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_covrl_update
+!   These subroutines update the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap,
+!   or for the application of Additive Schwarz preconditioners.                                           
+!    
+!    
 !    
-
 subroutine  psi_covrl_upd_vect(x,desc_a,update,info)
   use psi_mod, psi_protect_name =>   psi_covrl_upd_vect
   use psb_realloc_mod
diff --git a/base/comm/internals/psi_covrl_upd_a.f90 b/base/comm/internals/psi_covrl_upd_a.f90
index 633747e5..33297731 100644
--- a/base/comm/internals/psi_covrl_upd_a.f90
+++ b/base/comm/internals/psi_covrl_upd_a.f90
@@ -28,6 +28,12 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_covrl_update
+!   These subroutines update the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap,
+!   or for the application of Additive Schwarz preconditioners.                                           
+!    
 !    
 subroutine  psi_covrl_updr1(x,desc_a,update,info)
   use psi_mod, psi_protect_name =>   psi_covrl_updr1
diff --git a/base/comm/internals/psi_cswapdata.F90 b/base/comm/internals/psi_cswapdata.F90
index 5f349de9..fed8afbf 100644
--- a/base/comm/internals/psi_cswapdata.F90
+++ b/base/comm/internals/psi_cswapdata.F90
@@ -32,8 +32,10 @@
 !
 ! File: psi_cswapdata.F90
 !
-! Subroutine: psi_cswapdatam
-!   Does the data exchange among processes. Essentially this is doing 
+!
+!
+! Subroutine: psi_cswapdata_vect
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
 !   it is capable of pruning empty exchanges, which are very likely in out 
 !   application environment. All the variants have the same structure 
@@ -43,12 +45,15 @@
 !                                         C    complex(psb_spk_)
 !                                         Z    complex(psb_dpk_)
 !   Basically the operation is as follows: on each process, we identify 
-!   sections SND(Y) and RCV(Y); then we do a send on (PACK(SND(Y)));
-!   then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y 
-!   but only on the elements involved in the UNPACK operation. 
+!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
+!   then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y 
+!   but only on the elements involved in the SCT operation. 
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
+!   This version works on encapsulated vectors, and uses their methods to do  GTH and SCT,
+!   so that special versions (i.e. GPU vectors can override them 
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -69,10 +74,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - complex                  Choose overwrite or sum. 
+!    y        - type(psb_@x@_vect_type) The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - complex                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -83,14 +88,6 @@
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
 !
-!
-!
-! Subroutine: psi_cswapdata_vect
-!   Data exchange among processes.
-!
-!   Takes care of Y an exanspulated vector.
-!   
-!   
 ! 
 subroutine psi_cswapdata_vect(flag,beta,y,desc_a,work,info,data)
 
@@ -426,7 +423,7 @@ end subroutine psi_cswap_vidx_vect
 ! Subroutine: psi_cswapdata_multivect
 !   Data exchange among processes.
 !
-!   Takes care of Y an encaspulated vector.
+!   Takes care of Y an encaspulated multivector.
 !   
 !   
 subroutine psi_cswapdata_multivect(flag,beta,y,desc_a,work,info,data)
diff --git a/base/comm/internals/psi_cswapdata_a.F90 b/base/comm/internals/psi_cswapdata_a.F90
index e700d7fd..2a113b17 100644
--- a/base/comm/internals/psi_cswapdata_a.F90
+++ b/base/comm/internals/psi_cswapdata_a.F90
@@ -33,9 +33,9 @@
 ! File: psi_cswapdata.F90
 !
 ! Subroutine: psi_cswapdatam
-!   Does the data exchange among processes. Essentially this is doing 
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
-!   it is capable of pruning empty exchanges, which are very likely in out 
+!   it is capable of pruning empty exchanges, which are very likely in our
 !   application environment. All the variants have the same structure 
 !   In all these subroutines X may be:    I    Integer
 !                                         S    real(psb_spk_)
@@ -49,6 +49,7 @@
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -69,10 +70,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - complex                  Choose overwrite or sum. 
+!    y(:,:)   - complex                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - complex                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -507,7 +508,7 @@ end subroutine psi_cswapidxm
 !
 !
 ! Subroutine: psi_cswapdatav
-!   Does the data exchange among processes. Essentially this is doing 
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
 !   it is capable of pruning empty exchanges, which are very likely in out 
 !   application environment. All the variants have the same structure 
@@ -523,6 +524,7 @@ end subroutine psi_cswapidxm
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -543,10 +545,10 @@ end subroutine psi_cswapidxm
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:)     - X                       The data area                        
+!    beta     - complex                  Choose overwrite or sum. 
+!    y(:)     - complex                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - complex                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
diff --git a/base/comm/internals/psi_cswaptran.F90 b/base/comm/internals/psi_cswaptran.F90
index ccfce96d..152cb045 100644
--- a/base/comm/internals/psi_cswaptran.F90
+++ b/base/comm/internals/psi_cswaptran.F90
@@ -32,8 +32,8 @@
 !
 ! File: psi_cswaptran.F90
 !
-! Subroutine: psi_cswaptranm
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+! Subroutine: psi_cswaptran_vect
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -47,12 +47,16 @@
 !                                         C    complex(psb_spk_)
 !                                         Z    complex(psb_dpk_)
 !   Basically the operation is as follows: on each process, we identify 
-!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(SND(Y)));
-!   then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y 
-!   but only on the elements involved in the UNPACK operation. 
+!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
+!   then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y 
+!   but only on the elements involved in the SCT operation. 
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
+!   This version works on encapsulated vectors, and uses their methods to do  GTH and SCT,
+!   so that special versions (i.e. GPU vectors can override them 
+! 
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -73,10 +77,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
-!    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    beta     - complex                  Choose overwrite or sum. 
+!    y        - type(psb_c_vect_type) The data area                        
+!    desc_a   - type(psb_desc_type).    The communication descriptor.        
+!    work(:)  - complex                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -86,13 +90,6 @@
 !                                       psb_comm_ovrl_    use ovrl_index
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
-!
-!
-! Subroutine: psi_cswaptran_vect
-!   Data exchange among processes.
-!
-!   Takes care of Y an exanspulated vector.
-!   
 !   
 subroutine psi_cswaptran_vect(flag,beta,y,desc_a,work,info,data)
 
@@ -171,7 +168,7 @@ end subroutine psi_cswaptran_vect
 ! Subroutine: psi_ctran_vidx_vect
 !   Data exchange among processes.
 !
-!   Takes care of Y an exanspulated vector. Relies on the gather/scatter methods
+!   Takes care of Y an encapsulated vector. Relies on the gather/scatter methods
 !   of vectors. 
 !   
 !   The real workhorse: the outer routine will only choose the index list
@@ -436,10 +433,10 @@ end subroutine psi_ctran_vidx_vect
 !
 !
 !
-! Subroutine: psi_cswaptran_vect
+! Subroutine: psi_cswaptran_multivect
 !   Data exchange among processes.
 !
-!   Takes care of Y an encaspulated vector.
+!   Takes care of Y an encaspulated multivector.
 !   
 !   
 subroutine psi_cswaptran_multivect(flag,beta,y,desc_a,work,info,data)
diff --git a/base/comm/internals/psi_cswaptran_a.F90 b/base/comm/internals/psi_cswaptran_a.F90
index e48e0b81..ed061be6 100644
--- a/base/comm/internals/psi_cswaptran_a.F90
+++ b/base/comm/internals/psi_cswaptran_a.F90
@@ -33,7 +33,7 @@
 ! File: psi_cswaptran.F90
 !
 ! Subroutine: psi_cswaptranm
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -53,6 +53,7 @@
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -73,10 +74,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - complex                  Choose overwrite or sum. 
+!    y(:,:)   - complex                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - complex                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -516,7 +517,7 @@ end subroutine psi_ctranidxm
 !
 !
 ! Subroutine: psi_cswaptranv
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -536,6 +537,7 @@ end subroutine psi_ctranidxm
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -556,10 +558,10 @@ end subroutine psi_ctranidxm
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:)     - X                       The data area                        
+!    beta     - complex                  Choose overwrite or sum. 
+!    y(:)     - complex                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - complex                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
diff --git a/base/comm/internals/psi_dovrl_restr.f90 b/base/comm/internals/psi_dovrl_restr.f90
index 326f32df..1d6695cc 100644
--- a/base/comm/internals/psi_dovrl_restr.f90
+++ b/base/comm/internals/psi_dovrl_restr.f90
@@ -28,8 +28,13 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_dovrl_restr
+!   These subroutines restore the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
+!    
 !    
-
 subroutine  psi_dovrl_restr_vect(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_dovrl_restr_vect
   use psb_d_base_vect_mod
diff --git a/base/comm/internals/psi_dovrl_restr_a.f90 b/base/comm/internals/psi_dovrl_restr_a.f90
index 2d83c416..df47403b 100644
--- a/base/comm/internals/psi_dovrl_restr_a.f90
+++ b/base/comm/internals/psi_dovrl_restr_a.f90
@@ -28,6 +28,11 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_dovrl_restr
+!   These subroutines restore the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 !    
 subroutine  psi_dovrl_restrr1(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_dovrl_restrr1
diff --git a/base/comm/internals/psi_dovrl_save.f90 b/base/comm/internals/psi_dovrl_save.f90
index 3f8a923b..0689ee1b 100644
--- a/base/comm/internals/psi_dovrl_save.f90
+++ b/base/comm/internals/psi_dovrl_save.f90
@@ -28,6 +28,11 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_dovrl_save
+!   These subroutines save the overlap region of a vector; they are used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 !    
 subroutine  psi_dovrl_save_vect(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_dovrl_save_vect
diff --git a/base/comm/internals/psi_dovrl_save_a.f90 b/base/comm/internals/psi_dovrl_save_a.f90
index 6bf57d87..fcc19e08 100644
--- a/base/comm/internals/psi_dovrl_save_a.f90
+++ b/base/comm/internals/psi_dovrl_save_a.f90
@@ -28,8 +28,12 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
-!    
-
+!
+!
+! Subroutine: psi_dovrl_save
+!   These subroutines save the overlap region of a vector; they are used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 subroutine  psi_dovrl_saver1(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_dovrl_saver1
 
diff --git a/base/comm/internals/psi_dovrl_upd.f90 b/base/comm/internals/psi_dovrl_upd.f90
index 867281ff..efbb2495 100644
--- a/base/comm/internals/psi_dovrl_upd.f90
+++ b/base/comm/internals/psi_dovrl_upd.f90
@@ -28,8 +28,14 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_dovrl_update
+!   These subroutines update the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap,
+!   or for the application of Additive Schwarz preconditioners.                                           
+!    
+!    
 !    
-
 subroutine  psi_dovrl_upd_vect(x,desc_a,update,info)
   use psi_mod, psi_protect_name =>   psi_dovrl_upd_vect
   use psb_realloc_mod
diff --git a/base/comm/internals/psi_dovrl_upd_a.f90 b/base/comm/internals/psi_dovrl_upd_a.f90
index ccdfba89..8ad3db6b 100644
--- a/base/comm/internals/psi_dovrl_upd_a.f90
+++ b/base/comm/internals/psi_dovrl_upd_a.f90
@@ -28,6 +28,12 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_dovrl_update
+!   These subroutines update the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap,
+!   or for the application of Additive Schwarz preconditioners.                                           
+!    
 !    
 subroutine  psi_dovrl_updr1(x,desc_a,update,info)
   use psi_mod, psi_protect_name =>   psi_dovrl_updr1
diff --git a/base/comm/internals/psi_dswapdata.F90 b/base/comm/internals/psi_dswapdata.F90
index 58dd5d59..a59b9b79 100644
--- a/base/comm/internals/psi_dswapdata.F90
+++ b/base/comm/internals/psi_dswapdata.F90
@@ -32,8 +32,10 @@
 !
 ! File: psi_dswapdata.F90
 !
-! Subroutine: psi_dswapdatam
-!   Does the data exchange among processes. Essentially this is doing 
+!
+!
+! Subroutine: psi_dswapdata_vect
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
 !   it is capable of pruning empty exchanges, which are very likely in out 
 !   application environment. All the variants have the same structure 
@@ -43,12 +45,15 @@
 !                                         C    complex(psb_spk_)
 !                                         Z    complex(psb_dpk_)
 !   Basically the operation is as follows: on each process, we identify 
-!   sections SND(Y) and RCV(Y); then we do a send on (PACK(SND(Y)));
-!   then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y 
-!   but only on the elements involved in the UNPACK operation. 
+!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
+!   then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y 
+!   but only on the elements involved in the SCT operation. 
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
+!   This version works on encapsulated vectors, and uses their methods to do  GTH and SCT,
+!   so that special versions (i.e. GPU vectors can override them 
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -69,10 +74,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - real                  Choose overwrite or sum. 
+!    y        - type(psb_@x@_vect_type) The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - real                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -83,14 +88,6 @@
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
 !
-!
-!
-! Subroutine: psi_dswapdata_vect
-!   Data exchange among processes.
-!
-!   Takes care of Y an exanspulated vector.
-!   
-!   
 ! 
 subroutine psi_dswapdata_vect(flag,beta,y,desc_a,work,info,data)
 
@@ -426,7 +423,7 @@ end subroutine psi_dswap_vidx_vect
 ! Subroutine: psi_dswapdata_multivect
 !   Data exchange among processes.
 !
-!   Takes care of Y an encaspulated vector.
+!   Takes care of Y an encaspulated multivector.
 !   
 !   
 subroutine psi_dswapdata_multivect(flag,beta,y,desc_a,work,info,data)
diff --git a/base/comm/internals/psi_dswapdata_a.F90 b/base/comm/internals/psi_dswapdata_a.F90
index 00108cd6..b9d0aaae 100644
--- a/base/comm/internals/psi_dswapdata_a.F90
+++ b/base/comm/internals/psi_dswapdata_a.F90
@@ -33,9 +33,9 @@
 ! File: psi_dswapdata.F90
 !
 ! Subroutine: psi_dswapdatam
-!   Does the data exchange among processes. Essentially this is doing 
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
-!   it is capable of pruning empty exchanges, which are very likely in out 
+!   it is capable of pruning empty exchanges, which are very likely in our
 !   application environment. All the variants have the same structure 
 !   In all these subroutines X may be:    I    Integer
 !                                         S    real(psb_spk_)
@@ -49,6 +49,7 @@
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -69,10 +70,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - real                  Choose overwrite or sum. 
+!    y(:,:)   - real                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - real                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -507,7 +508,7 @@ end subroutine psi_dswapidxm
 !
 !
 ! Subroutine: psi_dswapdatav
-!   Does the data exchange among processes. Essentially this is doing 
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
 !   it is capable of pruning empty exchanges, which are very likely in out 
 !   application environment. All the variants have the same structure 
@@ -523,6 +524,7 @@ end subroutine psi_dswapidxm
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -543,10 +545,10 @@ end subroutine psi_dswapidxm
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:)     - X                       The data area                        
+!    beta     - real                  Choose overwrite or sum. 
+!    y(:)     - real                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - real                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
diff --git a/base/comm/internals/psi_dswaptran.F90 b/base/comm/internals/psi_dswaptran.F90
index f4a0c78d..1feee33c 100644
--- a/base/comm/internals/psi_dswaptran.F90
+++ b/base/comm/internals/psi_dswaptran.F90
@@ -32,8 +32,8 @@
 !
 ! File: psi_dswaptran.F90
 !
-! Subroutine: psi_dswaptranm
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+! Subroutine: psi_dswaptran_vect
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -47,12 +47,16 @@
 !                                         C    complex(psb_spk_)
 !                                         Z    complex(psb_dpk_)
 !   Basically the operation is as follows: on each process, we identify 
-!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(SND(Y)));
-!   then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y 
-!   but only on the elements involved in the UNPACK operation. 
+!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
+!   then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y 
+!   but only on the elements involved in the SCT operation. 
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
+!   This version works on encapsulated vectors, and uses their methods to do  GTH and SCT,
+!   so that special versions (i.e. GPU vectors can override them 
+! 
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -73,10 +77,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
-!    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    beta     - real                  Choose overwrite or sum. 
+!    y        - type(psb_d_vect_type) The data area                        
+!    desc_a   - type(psb_desc_type).    The communication descriptor.        
+!    work(:)  - real                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -86,13 +90,6 @@
 !                                       psb_comm_ovrl_    use ovrl_index
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
-!
-!
-! Subroutine: psi_dswaptran_vect
-!   Data exchange among processes.
-!
-!   Takes care of Y an exanspulated vector.
-!   
 !   
 subroutine psi_dswaptran_vect(flag,beta,y,desc_a,work,info,data)
 
@@ -171,7 +168,7 @@ end subroutine psi_dswaptran_vect
 ! Subroutine: psi_dtran_vidx_vect
 !   Data exchange among processes.
 !
-!   Takes care of Y an exanspulated vector. Relies on the gather/scatter methods
+!   Takes care of Y an encapsulated vector. Relies on the gather/scatter methods
 !   of vectors. 
 !   
 !   The real workhorse: the outer routine will only choose the index list
@@ -436,10 +433,10 @@ end subroutine psi_dtran_vidx_vect
 !
 !
 !
-! Subroutine: psi_dswaptran_vect
+! Subroutine: psi_dswaptran_multivect
 !   Data exchange among processes.
 !
-!   Takes care of Y an encaspulated vector.
+!   Takes care of Y an encaspulated multivector.
 !   
 !   
 subroutine psi_dswaptran_multivect(flag,beta,y,desc_a,work,info,data)
diff --git a/base/comm/internals/psi_dswaptran_a.F90 b/base/comm/internals/psi_dswaptran_a.F90
index 411a34cc..aad6348e 100644
--- a/base/comm/internals/psi_dswaptran_a.F90
+++ b/base/comm/internals/psi_dswaptran_a.F90
@@ -33,7 +33,7 @@
 ! File: psi_dswaptran.F90
 !
 ! Subroutine: psi_dswaptranm
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -53,6 +53,7 @@
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -73,10 +74,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - real                  Choose overwrite or sum. 
+!    y(:,:)   - real                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - real                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -516,7 +517,7 @@ end subroutine psi_dtranidxm
 !
 !
 ! Subroutine: psi_dswaptranv
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -536,6 +537,7 @@ end subroutine psi_dtranidxm
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -556,10 +558,10 @@ end subroutine psi_dtranidxm
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:)     - X                       The data area                        
+!    beta     - real                  Choose overwrite or sum. 
+!    y(:)     - real                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - real                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
diff --git a/base/comm/internals/psi_eovrl_restr_a.f90 b/base/comm/internals/psi_eovrl_restr_a.f90
index fe981855..3dbb2ac4 100644
--- a/base/comm/internals/psi_eovrl_restr_a.f90
+++ b/base/comm/internals/psi_eovrl_restr_a.f90
@@ -28,6 +28,11 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_eovrl_restr
+!   These subroutines restore the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 !    
 subroutine  psi_eovrl_restrr1(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_eovrl_restrr1
diff --git a/base/comm/internals/psi_eovrl_save_a.f90 b/base/comm/internals/psi_eovrl_save_a.f90
index de6878f0..4f0b7d30 100644
--- a/base/comm/internals/psi_eovrl_save_a.f90
+++ b/base/comm/internals/psi_eovrl_save_a.f90
@@ -28,8 +28,12 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
-!    
-
+!
+!
+! Subroutine: psi_eovrl_save
+!   These subroutines save the overlap region of a vector; they are used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 subroutine  psi_eovrl_saver1(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_eovrl_saver1
 
diff --git a/base/comm/internals/psi_eovrl_upd_a.f90 b/base/comm/internals/psi_eovrl_upd_a.f90
index f8589e41..e8e40738 100644
--- a/base/comm/internals/psi_eovrl_upd_a.f90
+++ b/base/comm/internals/psi_eovrl_upd_a.f90
@@ -28,6 +28,12 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_eovrl_update
+!   These subroutines update the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap,
+!   or for the application of Additive Schwarz preconditioners.                                           
+!    
 !    
 subroutine  psi_eovrl_updr1(x,desc_a,update,info)
   use psi_mod, psi_protect_name =>   psi_eovrl_updr1
diff --git a/base/comm/internals/psi_eswapdata_a.F90 b/base/comm/internals/psi_eswapdata_a.F90
index 55571aa0..f7c67ac3 100644
--- a/base/comm/internals/psi_eswapdata_a.F90
+++ b/base/comm/internals/psi_eswapdata_a.F90
@@ -33,9 +33,9 @@
 ! File: psi_eswapdata.F90
 !
 ! Subroutine: psi_eswapdatam
-!   Does the data exchange among processes. Essentially this is doing 
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
-!   it is capable of pruning empty exchanges, which are very likely in out 
+!   it is capable of pruning empty exchanges, which are very likely in our
 !   application environment. All the variants have the same structure 
 !   In all these subroutines X may be:    I    Integer
 !                                         S    real(psb_spk_)
@@ -49,6 +49,7 @@
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -69,10 +70,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - integer                  Choose overwrite or sum. 
+!    y(:,:)   - integer                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - integer                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -507,7 +508,7 @@ end subroutine psi_eswapidxm
 !
 !
 ! Subroutine: psi_eswapdatav
-!   Does the data exchange among processes. Essentially this is doing 
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
 !   it is capable of pruning empty exchanges, which are very likely in out 
 !   application environment. All the variants have the same structure 
@@ -523,6 +524,7 @@ end subroutine psi_eswapidxm
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -543,10 +545,10 @@ end subroutine psi_eswapidxm
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:)     - X                       The data area                        
+!    beta     - integer                  Choose overwrite or sum. 
+!    y(:)     - integer                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - integer                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
diff --git a/base/comm/internals/psi_eswaptran_a.F90 b/base/comm/internals/psi_eswaptran_a.F90
index 41f7abb9..42954369 100644
--- a/base/comm/internals/psi_eswaptran_a.F90
+++ b/base/comm/internals/psi_eswaptran_a.F90
@@ -33,7 +33,7 @@
 ! File: psi_eswaptran.F90
 !
 ! Subroutine: psi_eswaptranm
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -53,6 +53,7 @@
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -73,10 +74,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - integer                  Choose overwrite or sum. 
+!    y(:,:)   - integer                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - integer                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -516,7 +517,7 @@ end subroutine psi_etranidxm
 !
 !
 ! Subroutine: psi_eswaptranv
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -536,6 +537,7 @@ end subroutine psi_etranidxm
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -556,10 +558,10 @@ end subroutine psi_etranidxm
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:)     - X                       The data area                        
+!    beta     - integer                  Choose overwrite or sum. 
+!    y(:)     - integer                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - integer                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
diff --git a/base/comm/internals/psi_iovrl_restr.f90 b/base/comm/internals/psi_iovrl_restr.f90
index 89ff5ee0..b0efa2ee 100644
--- a/base/comm/internals/psi_iovrl_restr.f90
+++ b/base/comm/internals/psi_iovrl_restr.f90
@@ -28,8 +28,13 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_iovrl_restr
+!   These subroutines restore the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
+!    
 !    
-
 subroutine  psi_iovrl_restr_vect(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_iovrl_restr_vect
   use psb_i_base_vect_mod
diff --git a/base/comm/internals/psi_iovrl_save.f90 b/base/comm/internals/psi_iovrl_save.f90
index b48a7dbc..cc7619f9 100644
--- a/base/comm/internals/psi_iovrl_save.f90
+++ b/base/comm/internals/psi_iovrl_save.f90
@@ -28,6 +28,11 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_iovrl_save
+!   These subroutines save the overlap region of a vector; they are used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 !    
 subroutine  psi_iovrl_save_vect(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_iovrl_save_vect
diff --git a/base/comm/internals/psi_iovrl_upd.f90 b/base/comm/internals/psi_iovrl_upd.f90
index a84a1cc0..988bf006 100644
--- a/base/comm/internals/psi_iovrl_upd.f90
+++ b/base/comm/internals/psi_iovrl_upd.f90
@@ -28,8 +28,14 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_iovrl_update
+!   These subroutines update the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap,
+!   or for the application of Additive Schwarz preconditioners.                                           
+!    
+!    
 !    
-
 subroutine  psi_iovrl_upd_vect(x,desc_a,update,info)
   use psi_mod, psi_protect_name =>   psi_iovrl_upd_vect
   use psb_realloc_mod
diff --git a/base/comm/internals/psi_iswapdata.F90 b/base/comm/internals/psi_iswapdata.F90
index 335de5d7..d05ab8ac 100644
--- a/base/comm/internals/psi_iswapdata.F90
+++ b/base/comm/internals/psi_iswapdata.F90
@@ -32,8 +32,10 @@
 !
 ! File: psi_iswapdata.F90
 !
-! Subroutine: psi_iswapdatam
-!   Does the data exchange among processes. Essentially this is doing 
+!
+!
+! Subroutine: psi_iswapdata_vect
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
 !   it is capable of pruning empty exchanges, which are very likely in out 
 !   application environment. All the variants have the same structure 
@@ -43,12 +45,15 @@
 !                                         C    complex(psb_spk_)
 !                                         Z    complex(psb_dpk_)
 !   Basically the operation is as follows: on each process, we identify 
-!   sections SND(Y) and RCV(Y); then we do a send on (PACK(SND(Y)));
-!   then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y 
-!   but only on the elements involved in the UNPACK operation. 
+!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
+!   then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y 
+!   but only on the elements involved in the SCT operation. 
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
+!   This version works on encapsulated vectors, and uses their methods to do  GTH and SCT,
+!   so that special versions (i.e. GPU vectors can override them 
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -69,10 +74,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - integer                  Choose overwrite or sum. 
+!    y        - type(psb_@x@_vect_type) The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - integer                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -83,14 +88,6 @@
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
 !
-!
-!
-! Subroutine: psi_iswapdata_vect
-!   Data exchange among processes.
-!
-!   Takes care of Y an exanspulated vector.
-!   
-!   
 ! 
 subroutine psi_iswapdata_vect(flag,beta,y,desc_a,work,info,data)
 
@@ -426,7 +423,7 @@ end subroutine psi_iswap_vidx_vect
 ! Subroutine: psi_iswapdata_multivect
 !   Data exchange among processes.
 !
-!   Takes care of Y an encaspulated vector.
+!   Takes care of Y an encaspulated multivector.
 !   
 !   
 subroutine psi_iswapdata_multivect(flag,beta,y,desc_a,work,info,data)
diff --git a/base/comm/internals/psi_iswaptran.F90 b/base/comm/internals/psi_iswaptran.F90
index d77cfd66..e5719735 100644
--- a/base/comm/internals/psi_iswaptran.F90
+++ b/base/comm/internals/psi_iswaptran.F90
@@ -32,8 +32,8 @@
 !
 ! File: psi_iswaptran.F90
 !
-! Subroutine: psi_iswaptranm
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+! Subroutine: psi_iswaptran_vect
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -47,12 +47,16 @@
 !                                         C    complex(psb_spk_)
 !                                         Z    complex(psb_dpk_)
 !   Basically the operation is as follows: on each process, we identify 
-!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(SND(Y)));
-!   then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y 
-!   but only on the elements involved in the UNPACK operation. 
+!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
+!   then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y 
+!   but only on the elements involved in the SCT operation. 
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
+!   This version works on encapsulated vectors, and uses their methods to do  GTH and SCT,
+!   so that special versions (i.e. GPU vectors can override them 
+! 
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -73,10 +77,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
-!    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    beta     - integer                  Choose overwrite or sum. 
+!    y        - type(psb_i_vect_type) The data area                        
+!    desc_a   - type(psb_desc_type).    The communication descriptor.        
+!    work(:)  - integer                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -86,13 +90,6 @@
 !                                       psb_comm_ovrl_    use ovrl_index
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
-!
-!
-! Subroutine: psi_iswaptran_vect
-!   Data exchange among processes.
-!
-!   Takes care of Y an exanspulated vector.
-!   
 !   
 subroutine psi_iswaptran_vect(flag,beta,y,desc_a,work,info,data)
 
@@ -171,7 +168,7 @@ end subroutine psi_iswaptran_vect
 ! Subroutine: psi_itran_vidx_vect
 !   Data exchange among processes.
 !
-!   Takes care of Y an exanspulated vector. Relies on the gather/scatter methods
+!   Takes care of Y an encapsulated vector. Relies on the gather/scatter methods
 !   of vectors. 
 !   
 !   The real workhorse: the outer routine will only choose the index list
@@ -436,10 +433,10 @@ end subroutine psi_itran_vidx_vect
 !
 !
 !
-! Subroutine: psi_iswaptran_vect
+! Subroutine: psi_iswaptran_multivect
 !   Data exchange among processes.
 !
-!   Takes care of Y an encaspulated vector.
+!   Takes care of Y an encaspulated multivector.
 !   
 !   
 subroutine psi_iswaptran_multivect(flag,beta,y,desc_a,work,info,data)
diff --git a/base/comm/internals/psi_lovrl_restr.f90 b/base/comm/internals/psi_lovrl_restr.f90
index ba96e9c0..dc33bbb1 100644
--- a/base/comm/internals/psi_lovrl_restr.f90
+++ b/base/comm/internals/psi_lovrl_restr.f90
@@ -28,8 +28,13 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_lovrl_restr
+!   These subroutines restore the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
+!    
 !    
-
 subroutine  psi_lovrl_restr_vect(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_lovrl_restr_vect
   use psb_l_base_vect_mod
diff --git a/base/comm/internals/psi_lovrl_save.f90 b/base/comm/internals/psi_lovrl_save.f90
index 5a06c697..496eec90 100644
--- a/base/comm/internals/psi_lovrl_save.f90
+++ b/base/comm/internals/psi_lovrl_save.f90
@@ -28,6 +28,11 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_lovrl_save
+!   These subroutines save the overlap region of a vector; they are used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 !    
 subroutine  psi_lovrl_save_vect(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_lovrl_save_vect
diff --git a/base/comm/internals/psi_lovrl_upd.f90 b/base/comm/internals/psi_lovrl_upd.f90
index 4364ac6f..d20d80cc 100644
--- a/base/comm/internals/psi_lovrl_upd.f90
+++ b/base/comm/internals/psi_lovrl_upd.f90
@@ -28,8 +28,14 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_lovrl_update
+!   These subroutines update the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap,
+!   or for the application of Additive Schwarz preconditioners.                                           
+!    
+!    
 !    
-
 subroutine  psi_lovrl_upd_vect(x,desc_a,update,info)
   use psi_mod, psi_protect_name =>   psi_lovrl_upd_vect
   use psb_realloc_mod
diff --git a/base/comm/internals/psi_lswapdata.F90 b/base/comm/internals/psi_lswapdata.F90
index 67a4f0b8..939f9596 100644
--- a/base/comm/internals/psi_lswapdata.F90
+++ b/base/comm/internals/psi_lswapdata.F90
@@ -32,8 +32,10 @@
 !
 ! File: psi_lswapdata.F90
 !
-! Subroutine: psi_lswapdatam
-!   Does the data exchange among processes. Essentially this is doing 
+!
+!
+! Subroutine: psi_lswapdata_vect
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
 !   it is capable of pruning empty exchanges, which are very likely in out 
 !   application environment. All the variants have the same structure 
@@ -43,12 +45,15 @@
 !                                         C    complex(psb_spk_)
 !                                         Z    complex(psb_dpk_)
 !   Basically the operation is as follows: on each process, we identify 
-!   sections SND(Y) and RCV(Y); then we do a send on (PACK(SND(Y)));
-!   then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y 
-!   but only on the elements involved in the UNPACK operation. 
+!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
+!   then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y 
+!   but only on the elements involved in the SCT operation. 
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
+!   This version works on encapsulated vectors, and uses their methods to do  GTH and SCT,
+!   so that special versions (i.e. GPU vectors can override them 
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -69,10 +74,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - integer                  Choose overwrite or sum. 
+!    y        - type(psb_@x@_vect_type) The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - integer                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -83,14 +88,6 @@
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
 !
-!
-!
-! Subroutine: psi_lswapdata_vect
-!   Data exchange among processes.
-!
-!   Takes care of Y an exanspulated vector.
-!   
-!   
 ! 
 subroutine psi_lswapdata_vect(flag,beta,y,desc_a,work,info,data)
 
@@ -426,7 +423,7 @@ end subroutine psi_lswap_vidx_vect
 ! Subroutine: psi_lswapdata_multivect
 !   Data exchange among processes.
 !
-!   Takes care of Y an encaspulated vector.
+!   Takes care of Y an encaspulated multivector.
 !   
 !   
 subroutine psi_lswapdata_multivect(flag,beta,y,desc_a,work,info,data)
diff --git a/base/comm/internals/psi_lswaptran.F90 b/base/comm/internals/psi_lswaptran.F90
index 28b0311a..ccc1b6e3 100644
--- a/base/comm/internals/psi_lswaptran.F90
+++ b/base/comm/internals/psi_lswaptran.F90
@@ -32,8 +32,8 @@
 !
 ! File: psi_lswaptran.F90
 !
-! Subroutine: psi_lswaptranm
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+! Subroutine: psi_lswaptran_vect
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -47,12 +47,16 @@
 !                                         C    complex(psb_spk_)
 !                                         Z    complex(psb_dpk_)
 !   Basically the operation is as follows: on each process, we identify 
-!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(SND(Y)));
-!   then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y 
-!   but only on the elements involved in the UNPACK operation. 
+!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
+!   then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y 
+!   but only on the elements involved in the SCT operation. 
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
+!   This version works on encapsulated vectors, and uses their methods to do  GTH and SCT,
+!   so that special versions (i.e. GPU vectors can override them 
+! 
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -73,10 +77,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
-!    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    beta     - integer                  Choose overwrite or sum. 
+!    y        - type(psb_l_vect_type) The data area                        
+!    desc_a   - type(psb_desc_type).    The communication descriptor.        
+!    work(:)  - integer                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -86,13 +90,6 @@
 !                                       psb_comm_ovrl_    use ovrl_index
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
-!
-!
-! Subroutine: psi_lswaptran_vect
-!   Data exchange among processes.
-!
-!   Takes care of Y an exanspulated vector.
-!   
 !   
 subroutine psi_lswaptran_vect(flag,beta,y,desc_a,work,info,data)
 
@@ -171,7 +168,7 @@ end subroutine psi_lswaptran_vect
 ! Subroutine: psi_ltran_vidx_vect
 !   Data exchange among processes.
 !
-!   Takes care of Y an exanspulated vector. Relies on the gather/scatter methods
+!   Takes care of Y an encapsulated vector. Relies on the gather/scatter methods
 !   of vectors. 
 !   
 !   The real workhorse: the outer routine will only choose the index list
@@ -436,10 +433,10 @@ end subroutine psi_ltran_vidx_vect
 !
 !
 !
-! Subroutine: psi_lswaptran_vect
+! Subroutine: psi_lswaptran_multivect
 !   Data exchange among processes.
 !
-!   Takes care of Y an encaspulated vector.
+!   Takes care of Y an encaspulated multivector.
 !   
 !   
 subroutine psi_lswaptran_multivect(flag,beta,y,desc_a,work,info,data)
diff --git a/base/comm/internals/psi_movrl_restr_a.f90 b/base/comm/internals/psi_movrl_restr_a.f90
index a5759f98..92e06793 100644
--- a/base/comm/internals/psi_movrl_restr_a.f90
+++ b/base/comm/internals/psi_movrl_restr_a.f90
@@ -28,6 +28,11 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_movrl_restr
+!   These subroutines restore the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 !    
 subroutine  psi_movrl_restrr1(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_movrl_restrr1
diff --git a/base/comm/internals/psi_movrl_save_a.f90 b/base/comm/internals/psi_movrl_save_a.f90
index 7935333f..fbc021cf 100644
--- a/base/comm/internals/psi_movrl_save_a.f90
+++ b/base/comm/internals/psi_movrl_save_a.f90
@@ -28,8 +28,12 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
-!    
-
+!
+!
+! Subroutine: psi_movrl_save
+!   These subroutines save the overlap region of a vector; they are used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 subroutine  psi_movrl_saver1(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_movrl_saver1
 
diff --git a/base/comm/internals/psi_movrl_upd_a.f90 b/base/comm/internals/psi_movrl_upd_a.f90
index 92bfccce..03670659 100644
--- a/base/comm/internals/psi_movrl_upd_a.f90
+++ b/base/comm/internals/psi_movrl_upd_a.f90
@@ -28,6 +28,12 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_movrl_update
+!   These subroutines update the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap,
+!   or for the application of Additive Schwarz preconditioners.                                           
+!    
 !    
 subroutine  psi_movrl_updr1(x,desc_a,update,info)
   use psi_mod, psi_protect_name =>   psi_movrl_updr1
diff --git a/base/comm/internals/psi_mswapdata_a.F90 b/base/comm/internals/psi_mswapdata_a.F90
index 2dc1dedf..b71e61ef 100644
--- a/base/comm/internals/psi_mswapdata_a.F90
+++ b/base/comm/internals/psi_mswapdata_a.F90
@@ -33,9 +33,9 @@
 ! File: psi_mswapdata.F90
 !
 ! Subroutine: psi_mswapdatam
-!   Does the data exchange among processes. Essentially this is doing 
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
-!   it is capable of pruning empty exchanges, which are very likely in out 
+!   it is capable of pruning empty exchanges, which are very likely in our
 !   application environment. All the variants have the same structure 
 !   In all these subroutines X may be:    I    Integer
 !                                         S    real(psb_spk_)
@@ -49,6 +49,7 @@
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -69,10 +70,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - integer                  Choose overwrite or sum. 
+!    y(:,:)   - integer                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - integer                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -507,7 +508,7 @@ end subroutine psi_mswapidxm
 !
 !
 ! Subroutine: psi_mswapdatav
-!   Does the data exchange among processes. Essentially this is doing 
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
 !   it is capable of pruning empty exchanges, which are very likely in out 
 !   application environment. All the variants have the same structure 
@@ -523,6 +524,7 @@ end subroutine psi_mswapidxm
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -543,10 +545,10 @@ end subroutine psi_mswapidxm
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:)     - X                       The data area                        
+!    beta     - integer                  Choose overwrite or sum. 
+!    y(:)     - integer                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - integer                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
diff --git a/base/comm/internals/psi_mswaptran_a.F90 b/base/comm/internals/psi_mswaptran_a.F90
index 7c7693d7..add5a608 100644
--- a/base/comm/internals/psi_mswaptran_a.F90
+++ b/base/comm/internals/psi_mswaptran_a.F90
@@ -33,7 +33,7 @@
 ! File: psi_mswaptran.F90
 !
 ! Subroutine: psi_mswaptranm
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -53,6 +53,7 @@
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -73,10 +74,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - integer                  Choose overwrite or sum. 
+!    y(:,:)   - integer                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - integer                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -516,7 +517,7 @@ end subroutine psi_mtranidxm
 !
 !
 ! Subroutine: psi_mswaptranv
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -536,6 +537,7 @@ end subroutine psi_mtranidxm
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -556,10 +558,10 @@ end subroutine psi_mtranidxm
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:)     - X                       The data area                        
+!    beta     - integer                  Choose overwrite or sum. 
+!    y(:)     - integer                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - integer                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
diff --git a/base/comm/internals/psi_sovrl_restr.f90 b/base/comm/internals/psi_sovrl_restr.f90
index b74c4335..3854040c 100644
--- a/base/comm/internals/psi_sovrl_restr.f90
+++ b/base/comm/internals/psi_sovrl_restr.f90
@@ -28,8 +28,13 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_sovrl_restr
+!   These subroutines restore the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
+!    
 !    
-
 subroutine  psi_sovrl_restr_vect(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_sovrl_restr_vect
   use psb_s_base_vect_mod
diff --git a/base/comm/internals/psi_sovrl_restr_a.f90 b/base/comm/internals/psi_sovrl_restr_a.f90
index 5ff1a9f7..e66297d5 100644
--- a/base/comm/internals/psi_sovrl_restr_a.f90
+++ b/base/comm/internals/psi_sovrl_restr_a.f90
@@ -28,6 +28,11 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_sovrl_restr
+!   These subroutines restore the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 !    
 subroutine  psi_sovrl_restrr1(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_sovrl_restrr1
diff --git a/base/comm/internals/psi_sovrl_save.f90 b/base/comm/internals/psi_sovrl_save.f90
index a10ae218..9b06fef7 100644
--- a/base/comm/internals/psi_sovrl_save.f90
+++ b/base/comm/internals/psi_sovrl_save.f90
@@ -28,6 +28,11 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_sovrl_save
+!   These subroutines save the overlap region of a vector; they are used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 !    
 subroutine  psi_sovrl_save_vect(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_sovrl_save_vect
diff --git a/base/comm/internals/psi_sovrl_save_a.f90 b/base/comm/internals/psi_sovrl_save_a.f90
index 5766dede..cf400127 100644
--- a/base/comm/internals/psi_sovrl_save_a.f90
+++ b/base/comm/internals/psi_sovrl_save_a.f90
@@ -28,8 +28,12 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
-!    
-
+!
+!
+! Subroutine: psi_sovrl_save
+!   These subroutines save the overlap region of a vector; they are used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 subroutine  psi_sovrl_saver1(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_sovrl_saver1
 
diff --git a/base/comm/internals/psi_sovrl_upd.f90 b/base/comm/internals/psi_sovrl_upd.f90
index b95a49ba..636261b1 100644
--- a/base/comm/internals/psi_sovrl_upd.f90
+++ b/base/comm/internals/psi_sovrl_upd.f90
@@ -28,8 +28,14 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_sovrl_update
+!   These subroutines update the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap,
+!   or for the application of Additive Schwarz preconditioners.                                           
+!    
+!    
 !    
-
 subroutine  psi_sovrl_upd_vect(x,desc_a,update,info)
   use psi_mod, psi_protect_name =>   psi_sovrl_upd_vect
   use psb_realloc_mod
diff --git a/base/comm/internals/psi_sovrl_upd_a.f90 b/base/comm/internals/psi_sovrl_upd_a.f90
index d553b5f7..82211657 100644
--- a/base/comm/internals/psi_sovrl_upd_a.f90
+++ b/base/comm/internals/psi_sovrl_upd_a.f90
@@ -28,6 +28,12 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_sovrl_update
+!   These subroutines update the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap,
+!   or for the application of Additive Schwarz preconditioners.                                           
+!    
 !    
 subroutine  psi_sovrl_updr1(x,desc_a,update,info)
   use psi_mod, psi_protect_name =>   psi_sovrl_updr1
diff --git a/base/comm/internals/psi_sswapdata.F90 b/base/comm/internals/psi_sswapdata.F90
index 3dc55de7..b5185198 100644
--- a/base/comm/internals/psi_sswapdata.F90
+++ b/base/comm/internals/psi_sswapdata.F90
@@ -32,8 +32,10 @@
 !
 ! File: psi_sswapdata.F90
 !
-! Subroutine: psi_sswapdatam
-!   Does the data exchange among processes. Essentially this is doing 
+!
+!
+! Subroutine: psi_sswapdata_vect
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
 !   it is capable of pruning empty exchanges, which are very likely in out 
 !   application environment. All the variants have the same structure 
@@ -43,12 +45,15 @@
 !                                         C    complex(psb_spk_)
 !                                         Z    complex(psb_dpk_)
 !   Basically the operation is as follows: on each process, we identify 
-!   sections SND(Y) and RCV(Y); then we do a send on (PACK(SND(Y)));
-!   then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y 
-!   but only on the elements involved in the UNPACK operation. 
+!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
+!   then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y 
+!   but only on the elements involved in the SCT operation. 
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
+!   This version works on encapsulated vectors, and uses their methods to do  GTH and SCT,
+!   so that special versions (i.e. GPU vectors can override them 
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -69,10 +74,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - real                  Choose overwrite or sum. 
+!    y        - type(psb_@x@_vect_type) The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - real                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -83,14 +88,6 @@
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
 !
-!
-!
-! Subroutine: psi_sswapdata_vect
-!   Data exchange among processes.
-!
-!   Takes care of Y an exanspulated vector.
-!   
-!   
 ! 
 subroutine psi_sswapdata_vect(flag,beta,y,desc_a,work,info,data)
 
@@ -426,7 +423,7 @@ end subroutine psi_sswap_vidx_vect
 ! Subroutine: psi_sswapdata_multivect
 !   Data exchange among processes.
 !
-!   Takes care of Y an encaspulated vector.
+!   Takes care of Y an encaspulated multivector.
 !   
 !   
 subroutine psi_sswapdata_multivect(flag,beta,y,desc_a,work,info,data)
diff --git a/base/comm/internals/psi_sswapdata_a.F90 b/base/comm/internals/psi_sswapdata_a.F90
index fa8b1afb..de8587c1 100644
--- a/base/comm/internals/psi_sswapdata_a.F90
+++ b/base/comm/internals/psi_sswapdata_a.F90
@@ -33,9 +33,9 @@
 ! File: psi_sswapdata.F90
 !
 ! Subroutine: psi_sswapdatam
-!   Does the data exchange among processes. Essentially this is doing 
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
-!   it is capable of pruning empty exchanges, which are very likely in out 
+!   it is capable of pruning empty exchanges, which are very likely in our
 !   application environment. All the variants have the same structure 
 !   In all these subroutines X may be:    I    Integer
 !                                         S    real(psb_spk_)
@@ -49,6 +49,7 @@
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -69,10 +70,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - real                  Choose overwrite or sum. 
+!    y(:,:)   - real                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - real                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -507,7 +508,7 @@ end subroutine psi_sswapidxm
 !
 !
 ! Subroutine: psi_sswapdatav
-!   Does the data exchange among processes. Essentially this is doing 
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
 !   it is capable of pruning empty exchanges, which are very likely in out 
 !   application environment. All the variants have the same structure 
@@ -523,6 +524,7 @@ end subroutine psi_sswapidxm
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -543,10 +545,10 @@ end subroutine psi_sswapidxm
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:)     - X                       The data area                        
+!    beta     - real                  Choose overwrite or sum. 
+!    y(:)     - real                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - real                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
diff --git a/base/comm/internals/psi_sswaptran.F90 b/base/comm/internals/psi_sswaptran.F90
index cdde2cf3..cb3ccc75 100644
--- a/base/comm/internals/psi_sswaptran.F90
+++ b/base/comm/internals/psi_sswaptran.F90
@@ -32,8 +32,8 @@
 !
 ! File: psi_sswaptran.F90
 !
-! Subroutine: psi_sswaptranm
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+! Subroutine: psi_sswaptran_vect
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -47,12 +47,16 @@
 !                                         C    complex(psb_spk_)
 !                                         Z    complex(psb_dpk_)
 !   Basically the operation is as follows: on each process, we identify 
-!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(SND(Y)));
-!   then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y 
-!   but only on the elements involved in the UNPACK operation. 
+!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
+!   then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y 
+!   but only on the elements involved in the SCT operation. 
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
+!   This version works on encapsulated vectors, and uses their methods to do  GTH and SCT,
+!   so that special versions (i.e. GPU vectors can override them 
+! 
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -73,10 +77,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
-!    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    beta     - real                  Choose overwrite or sum. 
+!    y        - type(psb_s_vect_type) The data area                        
+!    desc_a   - type(psb_desc_type).    The communication descriptor.        
+!    work(:)  - real                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -86,13 +90,6 @@
 !                                       psb_comm_ovrl_    use ovrl_index
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
-!
-!
-! Subroutine: psi_sswaptran_vect
-!   Data exchange among processes.
-!
-!   Takes care of Y an exanspulated vector.
-!   
 !   
 subroutine psi_sswaptran_vect(flag,beta,y,desc_a,work,info,data)
 
@@ -171,7 +168,7 @@ end subroutine psi_sswaptran_vect
 ! Subroutine: psi_stran_vidx_vect
 !   Data exchange among processes.
 !
-!   Takes care of Y an exanspulated vector. Relies on the gather/scatter methods
+!   Takes care of Y an encapsulated vector. Relies on the gather/scatter methods
 !   of vectors. 
 !   
 !   The real workhorse: the outer routine will only choose the index list
@@ -436,10 +433,10 @@ end subroutine psi_stran_vidx_vect
 !
 !
 !
-! Subroutine: psi_sswaptran_vect
+! Subroutine: psi_sswaptran_multivect
 !   Data exchange among processes.
 !
-!   Takes care of Y an encaspulated vector.
+!   Takes care of Y an encaspulated multivector.
 !   
 !   
 subroutine psi_sswaptran_multivect(flag,beta,y,desc_a,work,info,data)
diff --git a/base/comm/internals/psi_sswaptran_a.F90 b/base/comm/internals/psi_sswaptran_a.F90
index 49f0d89f..f0f82965 100644
--- a/base/comm/internals/psi_sswaptran_a.F90
+++ b/base/comm/internals/psi_sswaptran_a.F90
@@ -33,7 +33,7 @@
 ! File: psi_sswaptran.F90
 !
 ! Subroutine: psi_sswaptranm
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -53,6 +53,7 @@
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -73,10 +74,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - real                  Choose overwrite or sum. 
+!    y(:,:)   - real                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - real                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -516,7 +517,7 @@ end subroutine psi_stranidxm
 !
 !
 ! Subroutine: psi_sswaptranv
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -536,6 +537,7 @@ end subroutine psi_stranidxm
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -556,10 +558,10 @@ end subroutine psi_stranidxm
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:)     - X                       The data area                        
+!    beta     - real                  Choose overwrite or sum. 
+!    y(:)     - real                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - real                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
diff --git a/base/comm/internals/psi_zovrl_restr.f90 b/base/comm/internals/psi_zovrl_restr.f90
index bd83e618..e52232e7 100644
--- a/base/comm/internals/psi_zovrl_restr.f90
+++ b/base/comm/internals/psi_zovrl_restr.f90
@@ -28,8 +28,13 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_zovrl_restr
+!   These subroutines restore the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
+!    
 !    
-
 subroutine  psi_zovrl_restr_vect(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_zovrl_restr_vect
   use psb_z_base_vect_mod
diff --git a/base/comm/internals/psi_zovrl_restr_a.f90 b/base/comm/internals/psi_zovrl_restr_a.f90
index a0a22a31..379efdcb 100644
--- a/base/comm/internals/psi_zovrl_restr_a.f90
+++ b/base/comm/internals/psi_zovrl_restr_a.f90
@@ -28,6 +28,11 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_zovrl_restr
+!   These subroutines restore the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 !    
 subroutine  psi_zovrl_restrr1(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_zovrl_restrr1
diff --git a/base/comm/internals/psi_zovrl_save.f90 b/base/comm/internals/psi_zovrl_save.f90
index 16238573..acf65181 100644
--- a/base/comm/internals/psi_zovrl_save.f90
+++ b/base/comm/internals/psi_zovrl_save.f90
@@ -28,6 +28,11 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_zovrl_save
+!   These subroutines save the overlap region of a vector; they are used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 !    
 subroutine  psi_zovrl_save_vect(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_zovrl_save_vect
diff --git a/base/comm/internals/psi_zovrl_save_a.f90 b/base/comm/internals/psi_zovrl_save_a.f90
index 32d5d212..1e8fdb89 100644
--- a/base/comm/internals/psi_zovrl_save_a.f90
+++ b/base/comm/internals/psi_zovrl_save_a.f90
@@ -28,8 +28,12 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
-!    
-
+!
+!
+! Subroutine: psi_zovrl_save
+!   These subroutines save the overlap region of a vector; they are used
+!   for the transpose  matrix-vector product when there is a nonempty overlap.
+!                                              
 subroutine  psi_zovrl_saver1(x,xs,desc_a,info)
   use psi_mod, psi_protect_name =>   psi_zovrl_saver1
 
diff --git a/base/comm/internals/psi_zovrl_upd.f90 b/base/comm/internals/psi_zovrl_upd.f90
index 83fcf702..cd7ea0de 100644
--- a/base/comm/internals/psi_zovrl_upd.f90
+++ b/base/comm/internals/psi_zovrl_upd.f90
@@ -28,8 +28,14 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_zovrl_update
+!   These subroutines update the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap,
+!   or for the application of Additive Schwarz preconditioners.                                           
+!    
+!    
 !    
-
 subroutine  psi_zovrl_upd_vect(x,desc_a,update,info)
   use psi_mod, psi_protect_name =>   psi_zovrl_upd_vect
   use psb_realloc_mod
diff --git a/base/comm/internals/psi_zovrl_upd_a.f90 b/base/comm/internals/psi_zovrl_upd_a.f90
index b149addb..9ea2fbae 100644
--- a/base/comm/internals/psi_zovrl_upd_a.f90
+++ b/base/comm/internals/psi_zovrl_upd_a.f90
@@ -28,6 +28,12 @@
 !    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 !    POSSIBILITY OF SUCH DAMAGE.
 !   
+!
+! Subroutine: psi_zovrl_update
+!   These subroutines update the overlap region of a vector; they are  used
+!   for the transpose  matrix-vector product when there is a nonempty overlap,
+!   or for the application of Additive Schwarz preconditioners.                                           
+!    
 !    
 subroutine  psi_zovrl_updr1(x,desc_a,update,info)
   use psi_mod, psi_protect_name =>   psi_zovrl_updr1
diff --git a/base/comm/internals/psi_zswapdata.F90 b/base/comm/internals/psi_zswapdata.F90
index 014a37e5..7d34012e 100644
--- a/base/comm/internals/psi_zswapdata.F90
+++ b/base/comm/internals/psi_zswapdata.F90
@@ -32,8 +32,10 @@
 !
 ! File: psi_zswapdata.F90
 !
-! Subroutine: psi_zswapdatam
-!   Does the data exchange among processes. Essentially this is doing 
+!
+!
+! Subroutine: psi_zswapdata_vect
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
 !   it is capable of pruning empty exchanges, which are very likely in out 
 !   application environment. All the variants have the same structure 
@@ -43,12 +45,15 @@
 !                                         C    complex(psb_spk_)
 !                                         Z    complex(psb_dpk_)
 !   Basically the operation is as follows: on each process, we identify 
-!   sections SND(Y) and RCV(Y); then we do a send on (PACK(SND(Y)));
-!   then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y 
-!   but only on the elements involved in the UNPACK operation. 
+!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
+!   then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y 
+!   but only on the elements involved in the SCT operation. 
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
+!   This version works on encapsulated vectors, and uses their methods to do  GTH and SCT,
+!   so that special versions (i.e. GPU vectors can override them 
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -69,10 +74,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - complex                  Choose overwrite or sum. 
+!    y        - type(psb_@x@_vect_type) The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - complex                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -83,14 +88,6 @@
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
 !
-!
-!
-! Subroutine: psi_zswapdata_vect
-!   Data exchange among processes.
-!
-!   Takes care of Y an exanspulated vector.
-!   
-!   
 ! 
 subroutine psi_zswapdata_vect(flag,beta,y,desc_a,work,info,data)
 
@@ -426,7 +423,7 @@ end subroutine psi_zswap_vidx_vect
 ! Subroutine: psi_zswapdata_multivect
 !   Data exchange among processes.
 !
-!   Takes care of Y an encaspulated vector.
+!   Takes care of Y an encaspulated multivector.
 !   
 !   
 subroutine psi_zswapdata_multivect(flag,beta,y,desc_a,work,info,data)
diff --git a/base/comm/internals/psi_zswapdata_a.F90 b/base/comm/internals/psi_zswapdata_a.F90
index ebc74a6d..6e821f4f 100644
--- a/base/comm/internals/psi_zswapdata_a.F90
+++ b/base/comm/internals/psi_zswapdata_a.F90
@@ -33,9 +33,9 @@
 ! File: psi_zswapdata.F90
 !
 ! Subroutine: psi_zswapdatam
-!   Does the data exchange among processes. Essentially this is doing 
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
-!   it is capable of pruning empty exchanges, which are very likely in out 
+!   it is capable of pruning empty exchanges, which are very likely in our
 !   application environment. All the variants have the same structure 
 !   In all these subroutines X may be:    I    Integer
 !                                         S    real(psb_spk_)
@@ -49,6 +49,7 @@
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -69,10 +70,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - complex                  Choose overwrite or sum. 
+!    y(:,:)   - complex                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - complex                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -507,7 +508,7 @@ end subroutine psi_zswapidxm
 !
 !
 ! Subroutine: psi_zswapdatav
-!   Does the data exchange among processes. Essentially this is doing 
+!   Implements the data exchange among processes. Essentially this is doing 
 !   a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but 
 !   it is capable of pruning empty exchanges, which are very likely in out 
 !   application environment. All the variants have the same structure 
@@ -523,6 +524,7 @@ end subroutine psi_zswapidxm
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -543,10 +545,10 @@ end subroutine psi_zswapidxm
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:)     - X                       The data area                        
+!    beta     - complex                  Choose overwrite or sum. 
+!    y(:)     - complex                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - complex                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
diff --git a/base/comm/internals/psi_zswaptran.F90 b/base/comm/internals/psi_zswaptran.F90
index 04e3584e..c78a5e1a 100644
--- a/base/comm/internals/psi_zswaptran.F90
+++ b/base/comm/internals/psi_zswaptran.F90
@@ -32,8 +32,8 @@
 !
 ! File: psi_zswaptran.F90
 !
-! Subroutine: psi_zswaptranm
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+! Subroutine: psi_zswaptran_vect
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -47,12 +47,16 @@
 !                                         C    complex(psb_spk_)
 !                                         Z    complex(psb_dpk_)
 !   Basically the operation is as follows: on each process, we identify 
-!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(SND(Y)));
-!   then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y 
-!   but only on the elements involved in the UNPACK operation. 
+!   sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
+!   then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y 
+!   but only on the elements involved in the SCT operation. 
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
+!   This version works on encapsulated vectors, and uses their methods to do  GTH and SCT,
+!   so that special versions (i.e. GPU vectors can override them 
+! 
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -73,10 +77,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
-!    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    beta     - complex                  Choose overwrite or sum. 
+!    y        - type(psb_z_vect_type) The data area                        
+!    desc_a   - type(psb_desc_type).    The communication descriptor.        
+!    work(:)  - complex                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -86,13 +90,6 @@
 !                                       psb_comm_ovrl_    use ovrl_index
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
-!
-!
-! Subroutine: psi_zswaptran_vect
-!   Data exchange among processes.
-!
-!   Takes care of Y an exanspulated vector.
-!   
 !   
 subroutine psi_zswaptran_vect(flag,beta,y,desc_a,work,info,data)
 
@@ -171,7 +168,7 @@ end subroutine psi_zswaptran_vect
 ! Subroutine: psi_ztran_vidx_vect
 !   Data exchange among processes.
 !
-!   Takes care of Y an exanspulated vector. Relies on the gather/scatter methods
+!   Takes care of Y an encapsulated vector. Relies on the gather/scatter methods
 !   of vectors. 
 !   
 !   The real workhorse: the outer routine will only choose the index list
@@ -436,10 +433,10 @@ end subroutine psi_ztran_vidx_vect
 !
 !
 !
-! Subroutine: psi_zswaptran_vect
+! Subroutine: psi_zswaptran_multivect
 !   Data exchange among processes.
 !
-!   Takes care of Y an encaspulated vector.
+!   Takes care of Y an encaspulated multivector.
 !   
 !   
 subroutine psi_zswaptran_multivect(flag,beta,y,desc_a,work,info,data)
diff --git a/base/comm/internals/psi_zswaptran_a.F90 b/base/comm/internals/psi_zswaptran_a.F90
index c2e1111a..aaf305ac 100644
--- a/base/comm/internals/psi_zswaptran_a.F90
+++ b/base/comm/internals/psi_zswaptran_a.F90
@@ -33,7 +33,7 @@
 ! File: psi_zswaptran.F90
 !
 ! Subroutine: psi_zswaptranm
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -53,6 +53,7 @@
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -73,10 +74,10 @@
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:,:)   - X                       The data area                        
+!    beta     - complex                  Choose overwrite or sum. 
+!    y(:,:)   - complex                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - complex                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
@@ -516,7 +517,7 @@ end subroutine psi_ztranidxm
 !
 !
 ! Subroutine: psi_zswaptranv
-!   Does the data exchange among processes. This is similar to Xswapdata, but
+!   Implements the data exchange among processes. This is similar to Xswapdata, but
 !   the list is read "in reverse", i.e. indices that are normally SENT are used 
 !   for the RECEIVE part and vice-versa. This is the basic data exchange operation
 !   for doing the product of a sparse matrix by a vector. 
@@ -536,6 +537,7 @@ end subroutine psi_ztranidxm
 !   Thus: for halo data exchange, the receive section is confined in the 
 !   halo indices, and BETA=0, whereas for overlap exchange the receive section 
 !   is scattered in the owned indices, and BETA=1.
+!   The first routine picks the desired exchange index list and passes it to the second.
 ! 
 ! Arguments: 
 !    flag     - integer                 Choose the algorithm for data exchange: 
@@ -556,10 +558,10 @@ end subroutine psi_ztranidxm
 !
 !
 !    n        - integer                 Number of columns in Y               
-!    beta     - X                       Choose overwrite or sum. 
-!    y(:)     - X                       The data area                        
+!    beta     - complex                  Choose overwrite or sum. 
+!    y(:)     - complex                  The data area                        
 !    desc_a   - type(psb_desc_type).  The communication descriptor.        
-!    work(:)  - X                       Buffer space. If not sufficient, will do 
+!    work(:)  - complex                  Buffer space. If not sufficient, will do 
 !                                       our own internal allocation.
 !    info     - integer.                return code.
 !    data     - integer                 which list is to be used to exchange data
diff --git a/base/comm/psb_cgather.f90 b/base/comm/psb_cgather.f90
index d4e675c2..f4ed3f4a 100644
--- a/base/comm/psb_cgather.f90
+++ b/base/comm/psb_cgather.f90
@@ -31,14 +31,14 @@
 !    
 ! File:  psb_cgather.f90
 !
-! Subroutine: psb_cgatherm
-!   This subroutine gathers pieces of a distributed dense matrix into a local one.
+! Subroutine: psb_cgather_vect
+!   This subroutine gathers pieces of a distributed vector into a local one.
 !
 ! Arguments:
-!   globx     -  complex,dimension(:,:).          The local matrix into which gather 
+!   globx     -  complex,dimension(:).           The local matrix into which gather 
 !                                                  the distributed pieces.
-!   locx      -  complex,dimension(:,:).          The local piece of the distributed 
-!                                                  matrix to be gathered.
+!   locx      -  type(psb_c_vect_type@         The local piece of the distributed 
+!                                                  vector to be gathered.
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Error code.
 !   iroot     -  integer.                      The process that has to own the 
@@ -159,6 +159,8 @@ subroutine  psb_cgather_vect(globx, locx, desc_a, info, iroot)
 
 end subroutine psb_cgather_vect
 
+! Subroutine: psb_cgather_multivect
+!   This subroutine gathers pieces of a distributed multivector into a local one.
 
 subroutine  psb_cgather_multivect(globx, locx, desc_a, info, iroot)
   use psb_base_mod, psb_protect_name => psb_cgather_multivect
diff --git a/base/comm/psb_cgather_a.f90 b/base/comm/psb_cgather_a.f90
index 09e35678..5f75abd6 100644
--- a/base/comm/psb_cgather_a.f90
+++ b/base/comm/psb_cgather_a.f90
@@ -31,7 +31,7 @@
 !    
 ! File:  psb_cgather.f90
 !
-! Subroutine: psb_cgatherm
+! Subroutine: psb_cgather
 !   This subroutine gathers pieces of a distributed dense matrix into a local one.
 !
 ! Arguments:
diff --git a/base/comm/psb_chalo.f90 b/base/comm/psb_chalo.f90
index e34168c6..ad483a51 100644
--- a/base/comm/psb_chalo.f90
+++ b/base/comm/psb_chalo.f90
@@ -31,17 +31,17 @@
 !    
 ! File:  psb_chalo.f90
 !
-! Subroutine: psb_chalom
+! Subroutine: psb_chalo_vect
 !   This subroutine performs the exchange of the halo elements in a 
-!    distributed dense matrix between all the processes.
+!    distributed vector between all the processes.
 !
 ! Arguments:
-!   x         -  complex,dimension(:,:).          The local part of the dense matrix.
+!   x         -  type(psb_c_vect_type)    The local part of the vector
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Return code
 !   jx        -  integer(optional).            The starting column of the global matrix. 
 !   ik        -  integer(optional).            The number of columns to gather. 
-!   work      -  complex(optional).            Work  area.
+!   work      -  complex(optional).             Work  area.
 !   tran      -  character(optional).          Transpose exchange.
 !   mode      -  integer(optional).            Communication mode (see Swapdata)
 !   data     - integer                 Which index list in desc_a should be used
@@ -52,7 +52,6 @@
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
 !
-
 subroutine  psb_chalo_vect(x,desc_a,info,work,tran,mode,data)
   use psb_base_mod, psb_protect_name => psb_chalo_vect
   use psi_mod
@@ -185,7 +184,28 @@ subroutine  psb_chalo_vect(x,desc_a,info,work,tran,mode,data)
     return
 end subroutine psb_chalo_vect
 
-
+!
+! Subroutine: psb_chalo_multivect
+!   This subroutine performs the exchange of the halo elements in a 
+!    distributed multivector between all the processes.
+!
+! Arguments:
+!   x         -  type(psb_c_multivect_type)    The local part of the multivector
+!   desc_a    -  type(psb_desc_type).        The communication descriptor.
+!   info      -  integer.                      Return code
+!   jx        -  integer(optional).            The starting column of the global matrix. 
+!   ik        -  integer(optional).            The number of columns to gather. 
+!   work      -  complex(optional).             Work  area.
+!   tran      -  character(optional).          Transpose exchange.
+!   mode      -  integer(optional).            Communication mode (see Swapdata)
+!   data     - integer                 Which index list in desc_a should be used
+!                                      to retrieve rows, default psb_comm_halo_
+!                                       psb_comm_halo_    use halo_index
+!                                       psb_comm_ext_     use ext_index 
+!                                       psb_comm_ovrl_    use ovrl_index
+!                                       psb_comm_mov_     use ovr_mst_idx
+!
+!
 subroutine  psb_chalo_multivect(x,desc_a,info,work,tran,mode,data)
   use psb_base_mod, psb_protect_name => psb_chalo_multivect
   use psi_mod
diff --git a/base/comm/psb_covrl.f90 b/base/comm/psb_covrl.f90
index f6155ffe..7ac851b6 100644
--- a/base/comm/psb_covrl.f90
+++ b/base/comm/psb_covrl.f90
@@ -32,12 +32,12 @@
 !
 ! File:  psb_covrl.f90
 !
-! Subroutine: psb_covrlm
+! Subroutine: psb_covrl_vect
 !   This subroutine performs the exchange of the overlap elements in a 
-!    distributed dense matrix between all the processes.
+!    distributed dense vector between all the processes.
 !
 ! Arguments:
-!   x(:,:)      -  complex                   The local part of the dense matrix.
+!   x           -  type(psb_c_vect_type)  The local part of the vector
 !   desc_a      -  type(psb_desc_type).    The communication descriptor.
 !   info        -  integer.                  Return code.
 !   jx          -  integer(optional).        The starting column of the global matrix
@@ -180,7 +180,38 @@ subroutine  psb_covrl_vect(x,desc_a,info,work,update,mode)
     return
 end subroutine psb_covrl_vect
 
-
+!
+! Subroutine: psb_covrl_multivect
+!   This subroutine performs the exchange of the overlap elements in a 
+!    distributed multivector between all the processes.
+!
+! Arguments:
+!   x           -  type(psb_c_vect_type)  The local part of the multivector
+!   desc_a      -  type(psb_desc_type).    The communication descriptor.
+!   info        -  integer.                  Return code.
+!   jx          -  integer(optional).        The starting column of the global matrix
+!   ik          -  integer(optional).        The number of columns to gather. 
+!   work        -  complex(optional).           A work area.
+!   update      -  integer(optional).        Type of update:
+!                                            psb_none_   do nothing
+!                                            psb_sum_    sum of overlaps
+!                                            psb_avg_    average of overlaps
+!   mode        -  integer(optional).        Choose the algorithm for data exchange: 
+!                                       this is chosen through bit fields. 
+!                                       - swap_mpi  = iand(flag,psb_swap_mpi_)  /= 0
+!                                       - swap_sync = iand(flag,psb_swap_sync_) /= 0
+!                                       - swap_send = iand(flag,psb_swap_send_) /= 0
+!                                       - swap_recv = iand(flag,psb_swap_recv_) /= 0
+!                                       - if (swap_mpi):  use underlying MPI_ALLTOALLV.
+!                                       - if (swap_sync): use PSB_SND and PSB_RCV in 
+!                                                       synchronized pairs
+!                                       - if (swap_send .and. swap_recv): use mpi_irecv 
+!                                                       and mpi_send
+!                                       - if (swap_send): use psb_snd (but need another 
+!                                                       call with swap_recv to complete)
+!                                       - if (swap_recv): use psb_rcv (completing a 
+!                                                       previous call with swap_send)
+!
 subroutine  psb_covrl_multivect(x,desc_a,info,work,update,mode)
   use psb_base_mod, psb_protect_name => psb_covrl_multivect
   use psi_mod
diff --git a/base/comm/psb_cscatter.F90 b/base/comm/psb_cscatter.F90
index 77036d04..7ac22a68 100644
--- a/base/comm/psb_cscatter.F90
+++ b/base/comm/psb_cscatter.F90
@@ -31,13 +31,13 @@
 !    
 ! File:  psb_cscatter.f90
 !
-! Subroutine: psb_cscatterm
-!   This subroutine scatters a global matrix locally owned by one process
-!   into pieces that are local to alle the processes.
+! Subroutine: psb_cscatter_vect
+!   This subroutine scatters a global vector locally owned by one process
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
-!   globx     -  complex,dimension(:,:).       The global matrix to scatter.
-!   locx      -  complex,dimension(:,:).       The local piece of the distributed matrix.
+!   globx     -  complex,dimension(:)          The global matrix to scatter.
+!   locx      -  type(psb_c_vect_type)      The local piece of the distributed matrix.
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Error code.
 !   iroot     -  integer(optional).            The process that owns the global matrix. 
diff --git a/base/comm/psb_cscatter_a.F90 b/base/comm/psb_cscatter_a.F90
index 1cadd168..d16ae980 100644
--- a/base/comm/psb_cscatter_a.F90
+++ b/base/comm/psb_cscatter_a.F90
@@ -33,7 +33,7 @@
 !
 ! Subroutine: psb_cscatterm
 !   This subroutine scatters a global matrix locally owned by one process
-!   into pieces that are local to alle the processes.
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
 !   globx     -  complex,dimension(:,:).       The global matrix to scatter.
@@ -278,7 +278,7 @@ end subroutine psb_cscatterm
 
 ! Subroutine: psb_cscatterv
 !   This subroutine scatters a global vector locally owned by one process
-!   into pieces that are local to alle the processes.
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
 !   globx     -  complex,dimension(:).         The global vector to scatter.
diff --git a/base/comm/psb_cspgather.F90 b/base/comm/psb_cspgather.F90
index b9401061..45dcc667 100644
--- a/base/comm/psb_cspgather.F90
+++ b/base/comm/psb_cspgather.F90
@@ -30,6 +30,17 @@
 !   
 !    
 ! File:  psb_cspgather.f90
+!
+! Gathers a sparse matrix onto a single process.
+! Two variants:
+! 1. Gathers to PSB_c_SPARSE_MAT   (i.e. to matrix with IPK_ indices)
+! 2. Gathers to PSB_lc_SPARSE_MAT  (i.e. to matrix with LPK_ indices)
+!
+! Note: this function uses MPI_ALLGATHERV. At this time, the size of the
+! resulting matrix must be within the range of 4 bytes because of the
+! restriction on MPI displacements to be 4 bytes. 
+! 
+!
 subroutine  psb_csp_allgather(globa, loca, desc_a, info, root, dupl,keepnum,keeploc)
 #if defined(HAVE_ISO_FORTRAN_ENV)
   use iso_fortran_env
diff --git a/base/comm/psb_dgather.f90 b/base/comm/psb_dgather.f90
index 3659b487..c767c8ec 100644
--- a/base/comm/psb_dgather.f90
+++ b/base/comm/psb_dgather.f90
@@ -31,14 +31,14 @@
 !    
 ! File:  psb_dgather.f90
 !
-! Subroutine: psb_dgatherm
-!   This subroutine gathers pieces of a distributed dense matrix into a local one.
+! Subroutine: psb_dgather_vect
+!   This subroutine gathers pieces of a distributed vector into a local one.
 !
 ! Arguments:
-!   globx     -  real,dimension(:,:).          The local matrix into which gather 
+!   globx     -  real,dimension(:).           The local matrix into which gather 
 !                                                  the distributed pieces.
-!   locx      -  real,dimension(:,:).          The local piece of the distributed 
-!                                                  matrix to be gathered.
+!   locx      -  type(psb_d_vect_type@         The local piece of the distributed 
+!                                                  vector to be gathered.
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Error code.
 !   iroot     -  integer.                      The process that has to own the 
@@ -159,6 +159,8 @@ subroutine  psb_dgather_vect(globx, locx, desc_a, info, iroot)
 
 end subroutine psb_dgather_vect
 
+! Subroutine: psb_dgather_multivect
+!   This subroutine gathers pieces of a distributed multivector into a local one.
 
 subroutine  psb_dgather_multivect(globx, locx, desc_a, info, iroot)
   use psb_base_mod, psb_protect_name => psb_dgather_multivect
diff --git a/base/comm/psb_dgather_a.f90 b/base/comm/psb_dgather_a.f90
index 277eb3d0..5ae9ed50 100644
--- a/base/comm/psb_dgather_a.f90
+++ b/base/comm/psb_dgather_a.f90
@@ -31,7 +31,7 @@
 !    
 ! File:  psb_dgather.f90
 !
-! Subroutine: psb_dgatherm
+! Subroutine: psb_dgather
 !   This subroutine gathers pieces of a distributed dense matrix into a local one.
 !
 ! Arguments:
diff --git a/base/comm/psb_dhalo.f90 b/base/comm/psb_dhalo.f90
index 85caa81f..b5f584dc 100644
--- a/base/comm/psb_dhalo.f90
+++ b/base/comm/psb_dhalo.f90
@@ -31,17 +31,17 @@
 !    
 ! File:  psb_dhalo.f90
 !
-! Subroutine: psb_dhalom
+! Subroutine: psb_dhalo_vect
 !   This subroutine performs the exchange of the halo elements in a 
-!    distributed dense matrix between all the processes.
+!    distributed vector between all the processes.
 !
 ! Arguments:
-!   x         -  real,dimension(:,:).          The local part of the dense matrix.
+!   x         -  type(psb_d_vect_type)    The local part of the vector
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Return code
 !   jx        -  integer(optional).            The starting column of the global matrix. 
 !   ik        -  integer(optional).            The number of columns to gather. 
-!   work      -  real(optional).            Work  area.
+!   work      -  real(optional).             Work  area.
 !   tran      -  character(optional).          Transpose exchange.
 !   mode      -  integer(optional).            Communication mode (see Swapdata)
 !   data     - integer                 Which index list in desc_a should be used
@@ -52,7 +52,6 @@
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
 !
-
 subroutine  psb_dhalo_vect(x,desc_a,info,work,tran,mode,data)
   use psb_base_mod, psb_protect_name => psb_dhalo_vect
   use psi_mod
@@ -185,7 +184,28 @@ subroutine  psb_dhalo_vect(x,desc_a,info,work,tran,mode,data)
     return
 end subroutine psb_dhalo_vect
 
-
+!
+! Subroutine: psb_dhalo_multivect
+!   This subroutine performs the exchange of the halo elements in a 
+!    distributed multivector between all the processes.
+!
+! Arguments:
+!   x         -  type(psb_d_multivect_type)    The local part of the multivector
+!   desc_a    -  type(psb_desc_type).        The communication descriptor.
+!   info      -  integer.                      Return code
+!   jx        -  integer(optional).            The starting column of the global matrix. 
+!   ik        -  integer(optional).            The number of columns to gather. 
+!   work      -  real(optional).             Work  area.
+!   tran      -  character(optional).          Transpose exchange.
+!   mode      -  integer(optional).            Communication mode (see Swapdata)
+!   data     - integer                 Which index list in desc_a should be used
+!                                      to retrieve rows, default psb_comm_halo_
+!                                       psb_comm_halo_    use halo_index
+!                                       psb_comm_ext_     use ext_index 
+!                                       psb_comm_ovrl_    use ovrl_index
+!                                       psb_comm_mov_     use ovr_mst_idx
+!
+!
 subroutine  psb_dhalo_multivect(x,desc_a,info,work,tran,mode,data)
   use psb_base_mod, psb_protect_name => psb_dhalo_multivect
   use psi_mod
diff --git a/base/comm/psb_dovrl.f90 b/base/comm/psb_dovrl.f90
index 28b2cd24..1177be29 100644
--- a/base/comm/psb_dovrl.f90
+++ b/base/comm/psb_dovrl.f90
@@ -32,12 +32,12 @@
 !
 ! File:  psb_dovrl.f90
 !
-! Subroutine: psb_dovrlm
+! Subroutine: psb_dovrl_vect
 !   This subroutine performs the exchange of the overlap elements in a 
-!    distributed dense matrix between all the processes.
+!    distributed dense vector between all the processes.
 !
 ! Arguments:
-!   x(:,:)      -  real                   The local part of the dense matrix.
+!   x           -  type(psb_d_vect_type)  The local part of the vector
 !   desc_a      -  type(psb_desc_type).    The communication descriptor.
 !   info        -  integer.                  Return code.
 !   jx          -  integer(optional).        The starting column of the global matrix
@@ -180,7 +180,38 @@ subroutine  psb_dovrl_vect(x,desc_a,info,work,update,mode)
     return
 end subroutine psb_dovrl_vect
 
-
+!
+! Subroutine: psb_dovrl_multivect
+!   This subroutine performs the exchange of the overlap elements in a 
+!    distributed multivector between all the processes.
+!
+! Arguments:
+!   x           -  type(psb_d_vect_type)  The local part of the multivector
+!   desc_a      -  type(psb_desc_type).    The communication descriptor.
+!   info        -  integer.                  Return code.
+!   jx          -  integer(optional).        The starting column of the global matrix
+!   ik          -  integer(optional).        The number of columns to gather. 
+!   work        -  real(optional).           A work area.
+!   update      -  integer(optional).        Type of update:
+!                                            psb_none_   do nothing
+!                                            psb_sum_    sum of overlaps
+!                                            psb_avg_    average of overlaps
+!   mode        -  integer(optional).        Choose the algorithm for data exchange: 
+!                                       this is chosen through bit fields. 
+!                                       - swap_mpi  = iand(flag,psb_swap_mpi_)  /= 0
+!                                       - swap_sync = iand(flag,psb_swap_sync_) /= 0
+!                                       - swap_send = iand(flag,psb_swap_send_) /= 0
+!                                       - swap_recv = iand(flag,psb_swap_recv_) /= 0
+!                                       - if (swap_mpi):  use underlying MPI_ALLTOALLV.
+!                                       - if (swap_sync): use PSB_SND and PSB_RCV in 
+!                                                       synchronized pairs
+!                                       - if (swap_send .and. swap_recv): use mpi_irecv 
+!                                                       and mpi_send
+!                                       - if (swap_send): use psb_snd (but need another 
+!                                                       call with swap_recv to complete)
+!                                       - if (swap_recv): use psb_rcv (completing a 
+!                                                       previous call with swap_send)
+!
 subroutine  psb_dovrl_multivect(x,desc_a,info,work,update,mode)
   use psb_base_mod, psb_protect_name => psb_dovrl_multivect
   use psi_mod
diff --git a/base/comm/psb_dscatter.F90 b/base/comm/psb_dscatter.F90
index d559fa12..157af3c6 100644
--- a/base/comm/psb_dscatter.F90
+++ b/base/comm/psb_dscatter.F90
@@ -31,13 +31,13 @@
 !    
 ! File:  psb_dscatter.f90
 !
-! Subroutine: psb_dscatterm
-!   This subroutine scatters a global matrix locally owned by one process
-!   into pieces that are local to alle the processes.
+! Subroutine: psb_dscatter_vect
+!   This subroutine scatters a global vector locally owned by one process
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
-!   globx     -  real,dimension(:,:).       The global matrix to scatter.
-!   locx      -  real,dimension(:,:).       The local piece of the distributed matrix.
+!   globx     -  real,dimension(:)          The global matrix to scatter.
+!   locx      -  type(psb_d_vect_type)      The local piece of the distributed matrix.
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Error code.
 !   iroot     -  integer(optional).            The process that owns the global matrix. 
diff --git a/base/comm/psb_dscatter_a.F90 b/base/comm/psb_dscatter_a.F90
index 8285afc1..e0b39f57 100644
--- a/base/comm/psb_dscatter_a.F90
+++ b/base/comm/psb_dscatter_a.F90
@@ -33,7 +33,7 @@
 !
 ! Subroutine: psb_dscatterm
 !   This subroutine scatters a global matrix locally owned by one process
-!   into pieces that are local to alle the processes.
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
 !   globx     -  real,dimension(:,:).       The global matrix to scatter.
@@ -278,7 +278,7 @@ end subroutine psb_dscatterm
 
 ! Subroutine: psb_dscatterv
 !   This subroutine scatters a global vector locally owned by one process
-!   into pieces that are local to alle the processes.
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
 !   globx     -  real,dimension(:).         The global vector to scatter.
diff --git a/base/comm/psb_dspgather.F90 b/base/comm/psb_dspgather.F90
index d5593504..62084d48 100644
--- a/base/comm/psb_dspgather.F90
+++ b/base/comm/psb_dspgather.F90
@@ -30,6 +30,17 @@
 !   
 !    
 ! File:  psb_dspgather.f90
+!
+! Gathers a sparse matrix onto a single process.
+! Two variants:
+! 1. Gathers to PSB_d_SPARSE_MAT   (i.e. to matrix with IPK_ indices)
+! 2. Gathers to PSB_ld_SPARSE_MAT  (i.e. to matrix with LPK_ indices)
+!
+! Note: this function uses MPI_ALLGATHERV. At this time, the size of the
+! resulting matrix must be within the range of 4 bytes because of the
+! restriction on MPI displacements to be 4 bytes. 
+! 
+!
 subroutine  psb_dsp_allgather(globa, loca, desc_a, info, root, dupl,keepnum,keeploc)
 #if defined(HAVE_ISO_FORTRAN_ENV)
   use iso_fortran_env
diff --git a/base/comm/psb_egather_a.f90 b/base/comm/psb_egather_a.f90
index 54b77bc9..b910a4f7 100644
--- a/base/comm/psb_egather_a.f90
+++ b/base/comm/psb_egather_a.f90
@@ -31,7 +31,7 @@
 !    
 ! File:  psb_egather.f90
 !
-! Subroutine: psb_egatherm
+! Subroutine: psb_egather
 !   This subroutine gathers pieces of a distributed dense matrix into a local one.
 !
 ! Arguments:
diff --git a/base/comm/psb_escatter_a.F90 b/base/comm/psb_escatter_a.F90
index e8b4b062..dbb2026f 100644
--- a/base/comm/psb_escatter_a.F90
+++ b/base/comm/psb_escatter_a.F90
@@ -33,7 +33,7 @@
 !
 ! Subroutine: psb_escatterm
 !   This subroutine scatters a global matrix locally owned by one process
-!   into pieces that are local to alle the processes.
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
 !   globx     -  integer,dimension(:,:).       The global matrix to scatter.
@@ -278,7 +278,7 @@ end subroutine psb_escatterm
 
 ! Subroutine: psb_escatterv
 !   This subroutine scatters a global vector locally owned by one process
-!   into pieces that are local to alle the processes.
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
 !   globx     -  integer,dimension(:).         The global vector to scatter.
diff --git a/base/comm/psb_igather.f90 b/base/comm/psb_igather.f90
index a6e59497..48a4f2fe 100644
--- a/base/comm/psb_igather.f90
+++ b/base/comm/psb_igather.f90
@@ -31,14 +31,14 @@
 !    
 ! File:  psb_igather.f90
 !
-! Subroutine: psb_igatherm
-!   This subroutine gathers pieces of a distributed dense matrix into a local one.
+! Subroutine: psb_igather_vect
+!   This subroutine gathers pieces of a distributed vector into a local one.
 !
 ! Arguments:
-!   globx     -  integer,dimension(:,:).          The local matrix into which gather 
+!   globx     -  integer,dimension(:).           The local matrix into which gather 
 !                                                  the distributed pieces.
-!   locx      -  integer,dimension(:,:).          The local piece of the distributed 
-!                                                  matrix to be gathered.
+!   locx      -  type(psb_i_vect_type@         The local piece of the distributed 
+!                                                  vector to be gathered.
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Error code.
 !   iroot     -  integer.                      The process that has to own the 
@@ -159,6 +159,8 @@ subroutine  psb_igather_vect(globx, locx, desc_a, info, iroot)
 
 end subroutine psb_igather_vect
 
+! Subroutine: psb_igather_multivect
+!   This subroutine gathers pieces of a distributed multivector into a local one.
 
 subroutine  psb_igather_multivect(globx, locx, desc_a, info, iroot)
   use psb_base_mod, psb_protect_name => psb_igather_multivect
diff --git a/base/comm/psb_ihalo.f90 b/base/comm/psb_ihalo.f90
index ed4d2c10..4bccfc10 100644
--- a/base/comm/psb_ihalo.f90
+++ b/base/comm/psb_ihalo.f90
@@ -31,17 +31,17 @@
 !    
 ! File:  psb_ihalo.f90
 !
-! Subroutine: psb_ihalom
+! Subroutine: psb_ihalo_vect
 !   This subroutine performs the exchange of the halo elements in a 
-!    distributed dense matrix between all the processes.
+!    distributed vector between all the processes.
 !
 ! Arguments:
-!   x         -  integer,dimension(:,:).          The local part of the dense matrix.
+!   x         -  type(psb_i_vect_type)    The local part of the vector
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Return code
 !   jx        -  integer(optional).            The starting column of the global matrix. 
 !   ik        -  integer(optional).            The number of columns to gather. 
-!   work      -  integer(optional).            Work  area.
+!   work      -  integer(optional).             Work  area.
 !   tran      -  character(optional).          Transpose exchange.
 !   mode      -  integer(optional).            Communication mode (see Swapdata)
 !   data     - integer                 Which index list in desc_a should be used
@@ -52,7 +52,6 @@
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
 !
-
 subroutine  psb_ihalo_vect(x,desc_a,info,work,tran,mode,data)
   use psb_base_mod, psb_protect_name => psb_ihalo_vect
   use psi_mod
@@ -185,7 +184,28 @@ subroutine  psb_ihalo_vect(x,desc_a,info,work,tran,mode,data)
     return
 end subroutine psb_ihalo_vect
 
-
+!
+! Subroutine: psb_ihalo_multivect
+!   This subroutine performs the exchange of the halo elements in a 
+!    distributed multivector between all the processes.
+!
+! Arguments:
+!   x         -  type(psb_i_multivect_type)    The local part of the multivector
+!   desc_a    -  type(psb_desc_type).        The communication descriptor.
+!   info      -  integer.                      Return code
+!   jx        -  integer(optional).            The starting column of the global matrix. 
+!   ik        -  integer(optional).            The number of columns to gather. 
+!   work      -  integer(optional).             Work  area.
+!   tran      -  character(optional).          Transpose exchange.
+!   mode      -  integer(optional).            Communication mode (see Swapdata)
+!   data     - integer                 Which index list in desc_a should be used
+!                                      to retrieve rows, default psb_comm_halo_
+!                                       psb_comm_halo_    use halo_index
+!                                       psb_comm_ext_     use ext_index 
+!                                       psb_comm_ovrl_    use ovrl_index
+!                                       psb_comm_mov_     use ovr_mst_idx
+!
+!
 subroutine  psb_ihalo_multivect(x,desc_a,info,work,tran,mode,data)
   use psb_base_mod, psb_protect_name => psb_ihalo_multivect
   use psi_mod
diff --git a/base/comm/psb_iovrl.f90 b/base/comm/psb_iovrl.f90
index 1e3ca4f3..06f720b0 100644
--- a/base/comm/psb_iovrl.f90
+++ b/base/comm/psb_iovrl.f90
@@ -32,12 +32,12 @@
 !
 ! File:  psb_iovrl.f90
 !
-! Subroutine: psb_iovrlm
+! Subroutine: psb_iovrl_vect
 !   This subroutine performs the exchange of the overlap elements in a 
-!    distributed dense matrix between all the processes.
+!    distributed dense vector between all the processes.
 !
 ! Arguments:
-!   x(:,:)      -  integer                   The local part of the dense matrix.
+!   x           -  type(psb_i_vect_type)  The local part of the vector
 !   desc_a      -  type(psb_desc_type).    The communication descriptor.
 !   info        -  integer.                  Return code.
 !   jx          -  integer(optional).        The starting column of the global matrix
@@ -180,7 +180,38 @@ subroutine  psb_iovrl_vect(x,desc_a,info,work,update,mode)
     return
 end subroutine psb_iovrl_vect
 
-
+!
+! Subroutine: psb_iovrl_multivect
+!   This subroutine performs the exchange of the overlap elements in a 
+!    distributed multivector between all the processes.
+!
+! Arguments:
+!   x           -  type(psb_i_vect_type)  The local part of the multivector
+!   desc_a      -  type(psb_desc_type).    The communication descriptor.
+!   info        -  integer.                  Return code.
+!   jx          -  integer(optional).        The starting column of the global matrix
+!   ik          -  integer(optional).        The number of columns to gather. 
+!   work        -  integer(optional).           A work area.
+!   update      -  integer(optional).        Type of update:
+!                                            psb_none_   do nothing
+!                                            psb_sum_    sum of overlaps
+!                                            psb_avg_    average of overlaps
+!   mode        -  integer(optional).        Choose the algorithm for data exchange: 
+!                                       this is chosen through bit fields. 
+!                                       - swap_mpi  = iand(flag,psb_swap_mpi_)  /= 0
+!                                       - swap_sync = iand(flag,psb_swap_sync_) /= 0
+!                                       - swap_send = iand(flag,psb_swap_send_) /= 0
+!                                       - swap_recv = iand(flag,psb_swap_recv_) /= 0
+!                                       - if (swap_mpi):  use underlying MPI_ALLTOALLV.
+!                                       - if (swap_sync): use PSB_SND and PSB_RCV in 
+!                                                       synchronized pairs
+!                                       - if (swap_send .and. swap_recv): use mpi_irecv 
+!                                                       and mpi_send
+!                                       - if (swap_send): use psb_snd (but need another 
+!                                                       call with swap_recv to complete)
+!                                       - if (swap_recv): use psb_rcv (completing a 
+!                                                       previous call with swap_send)
+!
 subroutine  psb_iovrl_multivect(x,desc_a,info,work,update,mode)
   use psb_base_mod, psb_protect_name => psb_iovrl_multivect
   use psi_mod
diff --git a/base/comm/psb_iscatter.F90 b/base/comm/psb_iscatter.F90
index 1a545e9b..f159d5b4 100644
--- a/base/comm/psb_iscatter.F90
+++ b/base/comm/psb_iscatter.F90
@@ -31,13 +31,13 @@
 !    
 ! File:  psb_iscatter.f90
 !
-! Subroutine: psb_iscatterm
-!   This subroutine scatters a global matrix locally owned by one process
-!   into pieces that are local to alle the processes.
+! Subroutine: psb_iscatter_vect
+!   This subroutine scatters a global vector locally owned by one process
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
-!   globx     -  integer,dimension(:,:).       The global matrix to scatter.
-!   locx      -  integer,dimension(:,:).       The local piece of the distributed matrix.
+!   globx     -  integer,dimension(:)          The global matrix to scatter.
+!   locx      -  type(psb_i_vect_type)      The local piece of the distributed matrix.
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Error code.
 !   iroot     -  integer(optional).            The process that owns the global matrix. 
diff --git a/base/comm/psb_ispgather.F90 b/base/comm/psb_ispgather.F90
index a19d24d9..d13c2e7c 100644
--- a/base/comm/psb_ispgather.F90
+++ b/base/comm/psb_ispgather.F90
@@ -30,6 +30,17 @@
 !   
 !    
 ! File:  psb_ispgather.f90
+!
+! Gathers a sparse matrix onto a single process.
+! Two variants:
+! 1. Gathers to PSB_i_SPARSE_MAT   (i.e. to matrix with IPK_ indices)
+! 2. Gathers to PSB_@LX@_SPARSE_MAT  (i.e. to matrix with LPK_ indices)
+!
+! Note: this function uses MPI_ALLGATHERV. At this time, the size of the
+! resulting matrix must be within the range of 4 bytes because of the
+! restriction on MPI displacements to be 4 bytes. 
+! 
+!
 subroutine  psb_isp_allgather(globa, loca, desc_a, info, root, dupl,keepnum,keeploc)
 #if defined(HAVE_ISO_FORTRAN_ENV)
   use iso_fortran_env
diff --git a/base/comm/psb_lgather.f90 b/base/comm/psb_lgather.f90
index 45ea0d94..1f6f448f 100644
--- a/base/comm/psb_lgather.f90
+++ b/base/comm/psb_lgather.f90
@@ -31,14 +31,14 @@
 !    
 ! File:  psb_lgather.f90
 !
-! Subroutine: psb_lgatherm
-!   This subroutine gathers pieces of a distributed dense matrix into a local one.
+! Subroutine: psb_lgather_vect
+!   This subroutine gathers pieces of a distributed vector into a local one.
 !
 ! Arguments:
-!   globx     -  integer,dimension(:,:).          The local matrix into which gather 
+!   globx     -  integer,dimension(:).           The local matrix into which gather 
 !                                                  the distributed pieces.
-!   locx      -  integer,dimension(:,:).          The local piece of the distributed 
-!                                                  matrix to be gathered.
+!   locx      -  type(psb_l_vect_type@         The local piece of the distributed 
+!                                                  vector to be gathered.
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Error code.
 !   iroot     -  integer.                      The process that has to own the 
@@ -159,6 +159,8 @@ subroutine  psb_lgather_vect(globx, locx, desc_a, info, iroot)
 
 end subroutine psb_lgather_vect
 
+! Subroutine: psb_lgather_multivect
+!   This subroutine gathers pieces of a distributed multivector into a local one.
 
 subroutine  psb_lgather_multivect(globx, locx, desc_a, info, iroot)
   use psb_base_mod, psb_protect_name => psb_lgather_multivect
diff --git a/base/comm/psb_lhalo.f90 b/base/comm/psb_lhalo.f90
index 49b4d13d..b35613a1 100644
--- a/base/comm/psb_lhalo.f90
+++ b/base/comm/psb_lhalo.f90
@@ -31,17 +31,17 @@
 !    
 ! File:  psb_lhalo.f90
 !
-! Subroutine: psb_lhalom
+! Subroutine: psb_lhalo_vect
 !   This subroutine performs the exchange of the halo elements in a 
-!    distributed dense matrix between all the processes.
+!    distributed vector between all the processes.
 !
 ! Arguments:
-!   x         -  integer,dimension(:,:).          The local part of the dense matrix.
+!   x         -  type(psb_l_vect_type)    The local part of the vector
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Return code
 !   jx        -  integer(optional).            The starting column of the global matrix. 
 !   ik        -  integer(optional).            The number of columns to gather. 
-!   work      -  integer(optional).            Work  area.
+!   work      -  integer(optional).             Work  area.
 !   tran      -  character(optional).          Transpose exchange.
 !   mode      -  integer(optional).            Communication mode (see Swapdata)
 !   data     - integer                 Which index list in desc_a should be used
@@ -52,7 +52,6 @@
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
 !
-
 subroutine  psb_lhalo_vect(x,desc_a,info,work,tran,mode,data)
   use psb_base_mod, psb_protect_name => psb_lhalo_vect
   use psi_mod
@@ -185,7 +184,28 @@ subroutine  psb_lhalo_vect(x,desc_a,info,work,tran,mode,data)
     return
 end subroutine psb_lhalo_vect
 
-
+!
+! Subroutine: psb_lhalo_multivect
+!   This subroutine performs the exchange of the halo elements in a 
+!    distributed multivector between all the processes.
+!
+! Arguments:
+!   x         -  type(psb_l_multivect_type)    The local part of the multivector
+!   desc_a    -  type(psb_desc_type).        The communication descriptor.
+!   info      -  integer.                      Return code
+!   jx        -  integer(optional).            The starting column of the global matrix. 
+!   ik        -  integer(optional).            The number of columns to gather. 
+!   work      -  integer(optional).             Work  area.
+!   tran      -  character(optional).          Transpose exchange.
+!   mode      -  integer(optional).            Communication mode (see Swapdata)
+!   data     - integer                 Which index list in desc_a should be used
+!                                      to retrieve rows, default psb_comm_halo_
+!                                       psb_comm_halo_    use halo_index
+!                                       psb_comm_ext_     use ext_index 
+!                                       psb_comm_ovrl_    use ovrl_index
+!                                       psb_comm_mov_     use ovr_mst_idx
+!
+!
 subroutine  psb_lhalo_multivect(x,desc_a,info,work,tran,mode,data)
   use psb_base_mod, psb_protect_name => psb_lhalo_multivect
   use psi_mod
diff --git a/base/comm/psb_lovrl.f90 b/base/comm/psb_lovrl.f90
index 75b816b8..bb3ece65 100644
--- a/base/comm/psb_lovrl.f90
+++ b/base/comm/psb_lovrl.f90
@@ -32,12 +32,12 @@
 !
 ! File:  psb_lovrl.f90
 !
-! Subroutine: psb_lovrlm
+! Subroutine: psb_lovrl_vect
 !   This subroutine performs the exchange of the overlap elements in a 
-!    distributed dense matrix between all the processes.
+!    distributed dense vector between all the processes.
 !
 ! Arguments:
-!   x(:,:)      -  integer                   The local part of the dense matrix.
+!   x           -  type(psb_l_vect_type)  The local part of the vector
 !   desc_a      -  type(psb_desc_type).    The communication descriptor.
 !   info        -  integer.                  Return code.
 !   jx          -  integer(optional).        The starting column of the global matrix
@@ -180,7 +180,38 @@ subroutine  psb_lovrl_vect(x,desc_a,info,work,update,mode)
     return
 end subroutine psb_lovrl_vect
 
-
+!
+! Subroutine: psb_lovrl_multivect
+!   This subroutine performs the exchange of the overlap elements in a 
+!    distributed multivector between all the processes.
+!
+! Arguments:
+!   x           -  type(psb_l_vect_type)  The local part of the multivector
+!   desc_a      -  type(psb_desc_type).    The communication descriptor.
+!   info        -  integer.                  Return code.
+!   jx          -  integer(optional).        The starting column of the global matrix
+!   ik          -  integer(optional).        The number of columns to gather. 
+!   work        -  integer(optional).           A work area.
+!   update      -  integer(optional).        Type of update:
+!                                            psb_none_   do nothing
+!                                            psb_sum_    sum of overlaps
+!                                            psb_avg_    average of overlaps
+!   mode        -  integer(optional).        Choose the algorithm for data exchange: 
+!                                       this is chosen through bit fields. 
+!                                       - swap_mpi  = iand(flag,psb_swap_mpi_)  /= 0
+!                                       - swap_sync = iand(flag,psb_swap_sync_) /= 0
+!                                       - swap_send = iand(flag,psb_swap_send_) /= 0
+!                                       - swap_recv = iand(flag,psb_swap_recv_) /= 0
+!                                       - if (swap_mpi):  use underlying MPI_ALLTOALLV.
+!                                       - if (swap_sync): use PSB_SND and PSB_RCV in 
+!                                                       synchronized pairs
+!                                       - if (swap_send .and. swap_recv): use mpi_irecv 
+!                                                       and mpi_send
+!                                       - if (swap_send): use psb_snd (but need another 
+!                                                       call with swap_recv to complete)
+!                                       - if (swap_recv): use psb_rcv (completing a 
+!                                                       previous call with swap_send)
+!
 subroutine  psb_lovrl_multivect(x,desc_a,info,work,update,mode)
   use psb_base_mod, psb_protect_name => psb_lovrl_multivect
   use psi_mod
diff --git a/base/comm/psb_lscatter.F90 b/base/comm/psb_lscatter.F90
index ed4051fa..ceb60e4f 100644
--- a/base/comm/psb_lscatter.F90
+++ b/base/comm/psb_lscatter.F90
@@ -31,13 +31,13 @@
 !    
 ! File:  psb_lscatter.f90
 !
-! Subroutine: psb_lscatterm
-!   This subroutine scatters a global matrix locally owned by one process
-!   into pieces that are local to alle the processes.
+! Subroutine: psb_lscatter_vect
+!   This subroutine scatters a global vector locally owned by one process
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
-!   globx     -  integer,dimension(:,:).       The global matrix to scatter.
-!   locx      -  integer,dimension(:,:).       The local piece of the distributed matrix.
+!   globx     -  integer,dimension(:)          The global matrix to scatter.
+!   locx      -  type(psb_l_vect_type)      The local piece of the distributed matrix.
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Error code.
 !   iroot     -  integer(optional).            The process that owns the global matrix. 
diff --git a/base/comm/psb_lspgather.F90 b/base/comm/psb_lspgather.F90
index 81c9c67c..2f588116 100644
--- a/base/comm/psb_lspgather.F90
+++ b/base/comm/psb_lspgather.F90
@@ -30,6 +30,17 @@
 !   
 !    
 ! File:  psb_lspgather.f90
+!
+! Gathers a sparse matrix onto a single process.
+! Two variants:
+! 1. Gathers to PSB_l_SPARSE_MAT   (i.e. to matrix with IPK_ indices)
+! 2. Gathers to PSB_@LX@_SPARSE_MAT  (i.e. to matrix with LPK_ indices)
+!
+! Note: this function uses MPI_ALLGATHERV. At this time, the size of the
+! resulting matrix must be within the range of 4 bytes because of the
+! restriction on MPI displacements to be 4 bytes. 
+! 
+!
 subroutine  psb_lsp_allgather(globa, loca, desc_a, info, root, dupl,keepnum,keeploc)
 #if defined(HAVE_ISO_FORTRAN_ENV)
   use iso_fortran_env
diff --git a/base/comm/psb_mgather_a.f90 b/base/comm/psb_mgather_a.f90
index 1ee8e7f4..251e06c9 100644
--- a/base/comm/psb_mgather_a.f90
+++ b/base/comm/psb_mgather_a.f90
@@ -31,7 +31,7 @@
 !    
 ! File:  psb_mgather.f90
 !
-! Subroutine: psb_mgatherm
+! Subroutine: psb_mgather
 !   This subroutine gathers pieces of a distributed dense matrix into a local one.
 !
 ! Arguments:
diff --git a/base/comm/psb_mscatter_a.F90 b/base/comm/psb_mscatter_a.F90
index 2c9baf6f..f85a3849 100644
--- a/base/comm/psb_mscatter_a.F90
+++ b/base/comm/psb_mscatter_a.F90
@@ -33,7 +33,7 @@
 !
 ! Subroutine: psb_mscatterm
 !   This subroutine scatters a global matrix locally owned by one process
-!   into pieces that are local to alle the processes.
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
 !   globx     -  integer,dimension(:,:).       The global matrix to scatter.
@@ -278,7 +278,7 @@ end subroutine psb_mscatterm
 
 ! Subroutine: psb_mscatterv
 !   This subroutine scatters a global vector locally owned by one process
-!   into pieces that are local to alle the processes.
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
 !   globx     -  integer,dimension(:).         The global vector to scatter.
diff --git a/base/comm/psb_sgather.f90 b/base/comm/psb_sgather.f90
index 10c6b7c2..538d1c43 100644
--- a/base/comm/psb_sgather.f90
+++ b/base/comm/psb_sgather.f90
@@ -31,14 +31,14 @@
 !    
 ! File:  psb_sgather.f90
 !
-! Subroutine: psb_sgatherm
-!   This subroutine gathers pieces of a distributed dense matrix into a local one.
+! Subroutine: psb_sgather_vect
+!   This subroutine gathers pieces of a distributed vector into a local one.
 !
 ! Arguments:
-!   globx     -  real,dimension(:,:).          The local matrix into which gather 
+!   globx     -  real,dimension(:).           The local matrix into which gather 
 !                                                  the distributed pieces.
-!   locx      -  real,dimension(:,:).          The local piece of the distributed 
-!                                                  matrix to be gathered.
+!   locx      -  type(psb_s_vect_type@         The local piece of the distributed 
+!                                                  vector to be gathered.
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Error code.
 !   iroot     -  integer.                      The process that has to own the 
@@ -159,6 +159,8 @@ subroutine  psb_sgather_vect(globx, locx, desc_a, info, iroot)
 
 end subroutine psb_sgather_vect
 
+! Subroutine: psb_sgather_multivect
+!   This subroutine gathers pieces of a distributed multivector into a local one.
 
 subroutine  psb_sgather_multivect(globx, locx, desc_a, info, iroot)
   use psb_base_mod, psb_protect_name => psb_sgather_multivect
diff --git a/base/comm/psb_sgather_a.f90 b/base/comm/psb_sgather_a.f90
index 68076959..47a8c79a 100644
--- a/base/comm/psb_sgather_a.f90
+++ b/base/comm/psb_sgather_a.f90
@@ -31,7 +31,7 @@
 !    
 ! File:  psb_sgather.f90
 !
-! Subroutine: psb_sgatherm
+! Subroutine: psb_sgather
 !   This subroutine gathers pieces of a distributed dense matrix into a local one.
 !
 ! Arguments:
diff --git a/base/comm/psb_shalo.f90 b/base/comm/psb_shalo.f90
index 2948624c..78ab39e3 100644
--- a/base/comm/psb_shalo.f90
+++ b/base/comm/psb_shalo.f90
@@ -31,17 +31,17 @@
 !    
 ! File:  psb_shalo.f90
 !
-! Subroutine: psb_shalom
+! Subroutine: psb_shalo_vect
 !   This subroutine performs the exchange of the halo elements in a 
-!    distributed dense matrix between all the processes.
+!    distributed vector between all the processes.
 !
 ! Arguments:
-!   x         -  real,dimension(:,:).          The local part of the dense matrix.
+!   x         -  type(psb_s_vect_type)    The local part of the vector
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Return code
 !   jx        -  integer(optional).            The starting column of the global matrix. 
 !   ik        -  integer(optional).            The number of columns to gather. 
-!   work      -  real(optional).            Work  area.
+!   work      -  real(optional).             Work  area.
 !   tran      -  character(optional).          Transpose exchange.
 !   mode      -  integer(optional).            Communication mode (see Swapdata)
 !   data     - integer                 Which index list in desc_a should be used
@@ -52,7 +52,6 @@
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
 !
-
 subroutine  psb_shalo_vect(x,desc_a,info,work,tran,mode,data)
   use psb_base_mod, psb_protect_name => psb_shalo_vect
   use psi_mod
@@ -185,7 +184,28 @@ subroutine  psb_shalo_vect(x,desc_a,info,work,tran,mode,data)
     return
 end subroutine psb_shalo_vect
 
-
+!
+! Subroutine: psb_shalo_multivect
+!   This subroutine performs the exchange of the halo elements in a 
+!    distributed multivector between all the processes.
+!
+! Arguments:
+!   x         -  type(psb_s_multivect_type)    The local part of the multivector
+!   desc_a    -  type(psb_desc_type).        The communication descriptor.
+!   info      -  integer.                      Return code
+!   jx        -  integer(optional).            The starting column of the global matrix. 
+!   ik        -  integer(optional).            The number of columns to gather. 
+!   work      -  real(optional).             Work  area.
+!   tran      -  character(optional).          Transpose exchange.
+!   mode      -  integer(optional).            Communication mode (see Swapdata)
+!   data     - integer                 Which index list in desc_a should be used
+!                                      to retrieve rows, default psb_comm_halo_
+!                                       psb_comm_halo_    use halo_index
+!                                       psb_comm_ext_     use ext_index 
+!                                       psb_comm_ovrl_    use ovrl_index
+!                                       psb_comm_mov_     use ovr_mst_idx
+!
+!
 subroutine  psb_shalo_multivect(x,desc_a,info,work,tran,mode,data)
   use psb_base_mod, psb_protect_name => psb_shalo_multivect
   use psi_mod
diff --git a/base/comm/psb_sovrl.f90 b/base/comm/psb_sovrl.f90
index 25af2d48..3930645a 100644
--- a/base/comm/psb_sovrl.f90
+++ b/base/comm/psb_sovrl.f90
@@ -32,12 +32,12 @@
 !
 ! File:  psb_sovrl.f90
 !
-! Subroutine: psb_sovrlm
+! Subroutine: psb_sovrl_vect
 !   This subroutine performs the exchange of the overlap elements in a 
-!    distributed dense matrix between all the processes.
+!    distributed dense vector between all the processes.
 !
 ! Arguments:
-!   x(:,:)      -  real                   The local part of the dense matrix.
+!   x           -  type(psb_s_vect_type)  The local part of the vector
 !   desc_a      -  type(psb_desc_type).    The communication descriptor.
 !   info        -  integer.                  Return code.
 !   jx          -  integer(optional).        The starting column of the global matrix
@@ -180,7 +180,38 @@ subroutine  psb_sovrl_vect(x,desc_a,info,work,update,mode)
     return
 end subroutine psb_sovrl_vect
 
-
+!
+! Subroutine: psb_sovrl_multivect
+!   This subroutine performs the exchange of the overlap elements in a 
+!    distributed multivector between all the processes.
+!
+! Arguments:
+!   x           -  type(psb_s_vect_type)  The local part of the multivector
+!   desc_a      -  type(psb_desc_type).    The communication descriptor.
+!   info        -  integer.                  Return code.
+!   jx          -  integer(optional).        The starting column of the global matrix
+!   ik          -  integer(optional).        The number of columns to gather. 
+!   work        -  real(optional).           A work area.
+!   update      -  integer(optional).        Type of update:
+!                                            psb_none_   do nothing
+!                                            psb_sum_    sum of overlaps
+!                                            psb_avg_    average of overlaps
+!   mode        -  integer(optional).        Choose the algorithm for data exchange: 
+!                                       this is chosen through bit fields. 
+!                                       - swap_mpi  = iand(flag,psb_swap_mpi_)  /= 0
+!                                       - swap_sync = iand(flag,psb_swap_sync_) /= 0
+!                                       - swap_send = iand(flag,psb_swap_send_) /= 0
+!                                       - swap_recv = iand(flag,psb_swap_recv_) /= 0
+!                                       - if (swap_mpi):  use underlying MPI_ALLTOALLV.
+!                                       - if (swap_sync): use PSB_SND and PSB_RCV in 
+!                                                       synchronized pairs
+!                                       - if (swap_send .and. swap_recv): use mpi_irecv 
+!                                                       and mpi_send
+!                                       - if (swap_send): use psb_snd (but need another 
+!                                                       call with swap_recv to complete)
+!                                       - if (swap_recv): use psb_rcv (completing a 
+!                                                       previous call with swap_send)
+!
 subroutine  psb_sovrl_multivect(x,desc_a,info,work,update,mode)
   use psb_base_mod, psb_protect_name => psb_sovrl_multivect
   use psi_mod
diff --git a/base/comm/psb_sscatter.F90 b/base/comm/psb_sscatter.F90
index 60bdd342..960c5bac 100644
--- a/base/comm/psb_sscatter.F90
+++ b/base/comm/psb_sscatter.F90
@@ -31,13 +31,13 @@
 !    
 ! File:  psb_sscatter.f90
 !
-! Subroutine: psb_sscatterm
-!   This subroutine scatters a global matrix locally owned by one process
-!   into pieces that are local to alle the processes.
+! Subroutine: psb_sscatter_vect
+!   This subroutine scatters a global vector locally owned by one process
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
-!   globx     -  real,dimension(:,:).       The global matrix to scatter.
-!   locx      -  real,dimension(:,:).       The local piece of the distributed matrix.
+!   globx     -  real,dimension(:)          The global matrix to scatter.
+!   locx      -  type(psb_s_vect_type)      The local piece of the distributed matrix.
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Error code.
 !   iroot     -  integer(optional).            The process that owns the global matrix. 
diff --git a/base/comm/psb_sscatter_a.F90 b/base/comm/psb_sscatter_a.F90
index bb3002a9..d756b712 100644
--- a/base/comm/psb_sscatter_a.F90
+++ b/base/comm/psb_sscatter_a.F90
@@ -33,7 +33,7 @@
 !
 ! Subroutine: psb_sscatterm
 !   This subroutine scatters a global matrix locally owned by one process
-!   into pieces that are local to alle the processes.
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
 !   globx     -  real,dimension(:,:).       The global matrix to scatter.
@@ -278,7 +278,7 @@ end subroutine psb_sscatterm
 
 ! Subroutine: psb_sscatterv
 !   This subroutine scatters a global vector locally owned by one process
-!   into pieces that are local to alle the processes.
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
 !   globx     -  real,dimension(:).         The global vector to scatter.
diff --git a/base/comm/psb_sspgather.F90 b/base/comm/psb_sspgather.F90
index 27444ecd..7123a988 100644
--- a/base/comm/psb_sspgather.F90
+++ b/base/comm/psb_sspgather.F90
@@ -30,6 +30,17 @@
 !   
 !    
 ! File:  psb_sspgather.f90
+!
+! Gathers a sparse matrix onto a single process.
+! Two variants:
+! 1. Gathers to PSB_s_SPARSE_MAT   (i.e. to matrix with IPK_ indices)
+! 2. Gathers to PSB_ls_SPARSE_MAT  (i.e. to matrix with LPK_ indices)
+!
+! Note: this function uses MPI_ALLGATHERV. At this time, the size of the
+! resulting matrix must be within the range of 4 bytes because of the
+! restriction on MPI displacements to be 4 bytes. 
+! 
+!
 subroutine  psb_ssp_allgather(globa, loca, desc_a, info, root, dupl,keepnum,keeploc)
 #if defined(HAVE_ISO_FORTRAN_ENV)
   use iso_fortran_env
diff --git a/base/comm/psb_zgather.f90 b/base/comm/psb_zgather.f90
index d22e6644..b8b9a6c8 100644
--- a/base/comm/psb_zgather.f90
+++ b/base/comm/psb_zgather.f90
@@ -31,14 +31,14 @@
 !    
 ! File:  psb_zgather.f90
 !
-! Subroutine: psb_zgatherm
-!   This subroutine gathers pieces of a distributed dense matrix into a local one.
+! Subroutine: psb_zgather_vect
+!   This subroutine gathers pieces of a distributed vector into a local one.
 !
 ! Arguments:
-!   globx     -  complex,dimension(:,:).          The local matrix into which gather 
+!   globx     -  complex,dimension(:).           The local matrix into which gather 
 !                                                  the distributed pieces.
-!   locx      -  complex,dimension(:,:).          The local piece of the distributed 
-!                                                  matrix to be gathered.
+!   locx      -  type(psb_z_vect_type@         The local piece of the distributed 
+!                                                  vector to be gathered.
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Error code.
 !   iroot     -  integer.                      The process that has to own the 
@@ -159,6 +159,8 @@ subroutine  psb_zgather_vect(globx, locx, desc_a, info, iroot)
 
 end subroutine psb_zgather_vect
 
+! Subroutine: psb_zgather_multivect
+!   This subroutine gathers pieces of a distributed multivector into a local one.
 
 subroutine  psb_zgather_multivect(globx, locx, desc_a, info, iroot)
   use psb_base_mod, psb_protect_name => psb_zgather_multivect
diff --git a/base/comm/psb_zgather_a.f90 b/base/comm/psb_zgather_a.f90
index d2c9bd91..1c3838cb 100644
--- a/base/comm/psb_zgather_a.f90
+++ b/base/comm/psb_zgather_a.f90
@@ -31,7 +31,7 @@
 !    
 ! File:  psb_zgather.f90
 !
-! Subroutine: psb_zgatherm
+! Subroutine: psb_zgather
 !   This subroutine gathers pieces of a distributed dense matrix into a local one.
 !
 ! Arguments:
diff --git a/base/comm/psb_zhalo.f90 b/base/comm/psb_zhalo.f90
index 57fe43ee..595cbc03 100644
--- a/base/comm/psb_zhalo.f90
+++ b/base/comm/psb_zhalo.f90
@@ -31,17 +31,17 @@
 !    
 ! File:  psb_zhalo.f90
 !
-! Subroutine: psb_zhalom
+! Subroutine: psb_zhalo_vect
 !   This subroutine performs the exchange of the halo elements in a 
-!    distributed dense matrix between all the processes.
+!    distributed vector between all the processes.
 !
 ! Arguments:
-!   x         -  complex,dimension(:,:).          The local part of the dense matrix.
+!   x         -  type(psb_z_vect_type)    The local part of the vector
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Return code
 !   jx        -  integer(optional).            The starting column of the global matrix. 
 !   ik        -  integer(optional).            The number of columns to gather. 
-!   work      -  complex(optional).            Work  area.
+!   work      -  complex(optional).             Work  area.
 !   tran      -  character(optional).          Transpose exchange.
 !   mode      -  integer(optional).            Communication mode (see Swapdata)
 !   data     - integer                 Which index list in desc_a should be used
@@ -52,7 +52,6 @@
 !                                       psb_comm_mov_     use ovr_mst_idx
 !
 !
-
 subroutine  psb_zhalo_vect(x,desc_a,info,work,tran,mode,data)
   use psb_base_mod, psb_protect_name => psb_zhalo_vect
   use psi_mod
@@ -185,7 +184,28 @@ subroutine  psb_zhalo_vect(x,desc_a,info,work,tran,mode,data)
     return
 end subroutine psb_zhalo_vect
 
-
+!
+! Subroutine: psb_zhalo_multivect
+!   This subroutine performs the exchange of the halo elements in a 
+!    distributed multivector between all the processes.
+!
+! Arguments:
+!   x         -  type(psb_z_multivect_type)    The local part of the multivector
+!   desc_a    -  type(psb_desc_type).        The communication descriptor.
+!   info      -  integer.                      Return code
+!   jx        -  integer(optional).            The starting column of the global matrix. 
+!   ik        -  integer(optional).            The number of columns to gather. 
+!   work      -  complex(optional).             Work  area.
+!   tran      -  character(optional).          Transpose exchange.
+!   mode      -  integer(optional).            Communication mode (see Swapdata)
+!   data     - integer                 Which index list in desc_a should be used
+!                                      to retrieve rows, default psb_comm_halo_
+!                                       psb_comm_halo_    use halo_index
+!                                       psb_comm_ext_     use ext_index 
+!                                       psb_comm_ovrl_    use ovrl_index
+!                                       psb_comm_mov_     use ovr_mst_idx
+!
+!
 subroutine  psb_zhalo_multivect(x,desc_a,info,work,tran,mode,data)
   use psb_base_mod, psb_protect_name => psb_zhalo_multivect
   use psi_mod
diff --git a/base/comm/psb_zovrl.f90 b/base/comm/psb_zovrl.f90
index 66b42303..c7463e19 100644
--- a/base/comm/psb_zovrl.f90
+++ b/base/comm/psb_zovrl.f90
@@ -32,12 +32,12 @@
 !
 ! File:  psb_zovrl.f90
 !
-! Subroutine: psb_zovrlm
+! Subroutine: psb_zovrl_vect
 !   This subroutine performs the exchange of the overlap elements in a 
-!    distributed dense matrix between all the processes.
+!    distributed dense vector between all the processes.
 !
 ! Arguments:
-!   x(:,:)      -  complex                   The local part of the dense matrix.
+!   x           -  type(psb_z_vect_type)  The local part of the vector
 !   desc_a      -  type(psb_desc_type).    The communication descriptor.
 !   info        -  integer.                  Return code.
 !   jx          -  integer(optional).        The starting column of the global matrix
@@ -180,7 +180,38 @@ subroutine  psb_zovrl_vect(x,desc_a,info,work,update,mode)
     return
 end subroutine psb_zovrl_vect
 
-
+!
+! Subroutine: psb_zovrl_multivect
+!   This subroutine performs the exchange of the overlap elements in a 
+!    distributed multivector between all the processes.
+!
+! Arguments:
+!   x           -  type(psb_z_vect_type)  The local part of the multivector
+!   desc_a      -  type(psb_desc_type).    The communication descriptor.
+!   info        -  integer.                  Return code.
+!   jx          -  integer(optional).        The starting column of the global matrix
+!   ik          -  integer(optional).        The number of columns to gather. 
+!   work        -  complex(optional).           A work area.
+!   update      -  integer(optional).        Type of update:
+!                                            psb_none_   do nothing
+!                                            psb_sum_    sum of overlaps
+!                                            psb_avg_    average of overlaps
+!   mode        -  integer(optional).        Choose the algorithm for data exchange: 
+!                                       this is chosen through bit fields. 
+!                                       - swap_mpi  = iand(flag,psb_swap_mpi_)  /= 0
+!                                       - swap_sync = iand(flag,psb_swap_sync_) /= 0
+!                                       - swap_send = iand(flag,psb_swap_send_) /= 0
+!                                       - swap_recv = iand(flag,psb_swap_recv_) /= 0
+!                                       - if (swap_mpi):  use underlying MPI_ALLTOALLV.
+!                                       - if (swap_sync): use PSB_SND and PSB_RCV in 
+!                                                       synchronized pairs
+!                                       - if (swap_send .and. swap_recv): use mpi_irecv 
+!                                                       and mpi_send
+!                                       - if (swap_send): use psb_snd (but need another 
+!                                                       call with swap_recv to complete)
+!                                       - if (swap_recv): use psb_rcv (completing a 
+!                                                       previous call with swap_send)
+!
 subroutine  psb_zovrl_multivect(x,desc_a,info,work,update,mode)
   use psb_base_mod, psb_protect_name => psb_zovrl_multivect
   use psi_mod
diff --git a/base/comm/psb_zscatter.F90 b/base/comm/psb_zscatter.F90
index 008f8084..10c2c78d 100644
--- a/base/comm/psb_zscatter.F90
+++ b/base/comm/psb_zscatter.F90
@@ -31,13 +31,13 @@
 !    
 ! File:  psb_zscatter.f90
 !
-! Subroutine: psb_zscatterm
-!   This subroutine scatters a global matrix locally owned by one process
-!   into pieces that are local to alle the processes.
+! Subroutine: psb_zscatter_vect
+!   This subroutine scatters a global vector locally owned by one process
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
-!   globx     -  complex,dimension(:,:).       The global matrix to scatter.
-!   locx      -  complex,dimension(:,:).       The local piece of the distributed matrix.
+!   globx     -  complex,dimension(:)          The global matrix to scatter.
+!   locx      -  type(psb_z_vect_type)      The local piece of the distributed matrix.
 !   desc_a    -  type(psb_desc_type).        The communication descriptor.
 !   info      -  integer.                      Error code.
 !   iroot     -  integer(optional).            The process that owns the global matrix. 
diff --git a/base/comm/psb_zscatter_a.F90 b/base/comm/psb_zscatter_a.F90
index 69e70d87..b206d8d3 100644
--- a/base/comm/psb_zscatter_a.F90
+++ b/base/comm/psb_zscatter_a.F90
@@ -33,7 +33,7 @@
 !
 ! Subroutine: psb_zscatterm
 !   This subroutine scatters a global matrix locally owned by one process
-!   into pieces that are local to alle the processes.
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
 !   globx     -  complex,dimension(:,:).       The global matrix to scatter.
@@ -278,7 +278,7 @@ end subroutine psb_zscatterm
 
 ! Subroutine: psb_zscatterv
 !   This subroutine scatters a global vector locally owned by one process
-!   into pieces that are local to alle the processes.
+!   into pieces that are local to all the processes.
 !
 ! Arguments:
 !   globx     -  complex,dimension(:).         The global vector to scatter.
diff --git a/base/comm/psb_zspgather.F90 b/base/comm/psb_zspgather.F90
index fbcae1f3..f4a27681 100644
--- a/base/comm/psb_zspgather.F90
+++ b/base/comm/psb_zspgather.F90
@@ -30,6 +30,17 @@
 !   
 !    
 ! File:  psb_zspgather.f90
+!
+! Gathers a sparse matrix onto a single process.
+! Two variants:
+! 1. Gathers to PSB_z_SPARSE_MAT   (i.e. to matrix with IPK_ indices)
+! 2. Gathers to PSB_lz_SPARSE_MAT  (i.e. to matrix with LPK_ indices)
+!
+! Note: this function uses MPI_ALLGATHERV. At this time, the size of the
+! resulting matrix must be within the range of 4 bytes because of the
+! restriction on MPI displacements to be 4 bytes. 
+! 
+!
 subroutine  psb_zsp_allgather(globa, loca, desc_a, info, root, dupl,keepnum,keeploc)
 #if defined(HAVE_ISO_FORTRAN_ENV)
   use iso_fortran_env
diff --git a/base/psblas/psb_camax.f90 b/base/psblas/psb_camax.f90
index 526c5e40..0de31a0c 100644
--- a/base/psblas/psb_camax.f90
+++ b/base/psblas/psb_camax.f90
@@ -32,7 +32,7 @@
 ! File: psb_camax.f90
 !
 ! Function: psb_camax
-!    Searches the absolute max of X.
+!    Computes the maximum absolute value of X
 !
 !    normi := max(abs(sub(X)(i))  
 !
@@ -164,7 +164,7 @@ end function psb_camax
 !!$
 !
 ! Function: psb_camaxv
-!    Searches the absolute max of X.
+!    Computes the maximum absolute value  of X.
 !
 !    normi := max(abs(X(i))  
 !
@@ -252,6 +252,17 @@ function psb_camaxv (x,desc_a, info,global) result(res)
   return
 end function psb_camaxv
 
+!
+! Function: psb_camax_vect
+!    Computes the maximum absolute value  of X.
+!
+!    normi := max(abs(X(i))  
+!
+! Arguments:
+!    x      - type(psb_c_vect_type) The input vector.
+!    desc_a -  type(psb_desc_type).  The communication descriptor.
+!    info   -  integer.              Return code
+!
 
 function psb_camax_vect(x, desc_a, info,global) result(res)
   use psb_penv_mod
@@ -374,17 +385,16 @@ end function psb_camax_vect
 !!$  
 !
 ! Subroutine: psb_camaxvs
-!    Searches the absolute max of X.
+!    Computes the maximum absolute value of X, subroutine version
 !
 !    normi := max(abs(sub(X)(i))  
 !
-!    where sub( X ) denotes X(1:N,JX:).
+!    where sub( X ) denotes X(1:N).
 !
 ! Arguments:
 !    res    -  real                 The result.
-!    x(:,:) -  complex              The input vector.
+!    x(:)   -  complex              The input vector.
 !    desc_a -  type(psb_desc_type). The communication descriptor.
-!    info   -  integer.             Return code
 !    jx     -  integer(optional).   The column offset.
 !
 subroutine psb_camaxvs(res,x,desc_a, info,global)
diff --git a/base/psblas/psb_casum.f90 b/base/psblas/psb_casum.f90
index 2b0beda5..a0bf5262 100644
--- a/base/psblas/psb_casum.f90
+++ b/base/psblas/psb_casum.f90
@@ -135,6 +135,19 @@ function psb_casum (x,desc_a, info, jx,global) result(res)
   return
 end function psb_casum
 
+! Function: psb_casum_vect
+!    Computes norm1 of X
+!
+!    norm1 := sum(sub( X )(i))
+!
+!    where sub( X ) denotes X(1:N,JX:).
+!
+! Arguments:
+!    x      - type(psb_c_vect_type) The input vector.
+!    desc_a -  type(psb_desc_type).   The communication descriptor.
+!    info   -  integer.               Return code
+!    jx     -  integer(optional).     The column offset.
+!
 
 function psb_casum_vect(x, desc_a, info,global) result(res)
   use psb_base_mod, psb_protect_name => psb_casum_vect
diff --git a/base/psblas/psb_caxpby.f90 b/base/psblas/psb_caxpby.f90
index ab00bd60..daa47829 100644
--- a/base/psblas/psb_caxpby.f90
+++ b/base/psblas/psb_caxpby.f90
@@ -31,6 +31,23 @@
 !    
 ! File: psb_caxpby.f90
 
+!
+! Subroutine: psb_caxpby_vect
+!    Adds one distributed vector to another,
+!
+!    Y := beta * Y + alpha * X
+!
+! Arguments:
+!    alpha  -  complex,input        The scalar used to multiply each component of X
+!    x      - type(psb_c_vect_type) The input vector containing the entries of X
+!    beta   -  complex,input        The scalar used to multiply each component of Y
+!    y      - type(psb_c_vect_type)  The input/output vector Y
+!    desc_a -  type(psb_desc_type)  The communication descriptor.
+!    info   -  integer              Return code
+!
+!  Note: from a functional point of view, X is input, but here
+!        it's declared INOUT because of the sync() methods. 
+!
 subroutine psb_caxpby_vect(alpha, x, beta, y,&
      & desc_a, info)
   use psb_base_mod, psb_protect_name => psb_caxpby_vect
@@ -269,7 +286,7 @@ end subroutine psb_caxpby
 !!$  
 !
 ! Subroutine: psb_caxpbyv
-!    Adds one distributed matrix to another,
+!    Adds one distributed vector to another,
 !
 !    Y := beta * Y + alpha * X
 !
diff --git a/base/psblas/psb_cdot.f90 b/base/psblas/psb_cdot.f90
index 39cda431..5432eb32 100644
--- a/base/psblas/psb_cdot.f90
+++ b/base/psblas/psb_cdot.f90
@@ -31,22 +31,22 @@
 !    
 ! File: psb_cdot.f90
 !
-! Function: psb_cdot
-!    psb_cdot forms the dot product of two distributed vectors,
+! Function: psb_cdot_vect
+!    psb_cdot computes the dot product of two distributed vectors,
 !
-!    dot := sub( X )**C * sub( Y )
+!    dot := ( X )**C * ( Y )
 !
-!    where sub( X ) denotes X(:,JX)
-!
-!    sub( Y ) denotes Y(:,JY).
 !
 ! Arguments:
-!    x(:,:) -  complex               The input vector containing the entries of sub( X ).
-!    y(:,:) -  complex               The input vector containing the entries of sub( Y ).
+!    x      -  type(psb_c_vect_type) The input vector containing the entries of sub( X ).
+!    y      -  type(psb_c_vect_type) The input vector containing the entries of sub( Y ).
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
-!    jx     -  integer(optional).    The column offset for sub( X ).
-!    jy     -  integer(optional).    The column offset for sub( Y ).
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
+!
+!  Note: from a functional point of view, X and Y are input, but here
+!        they are declared INOUT because of the sync() methods. 
+!
 !
 function psb_cdot_vect(x, y, desc_a,info,global) result(res)
   use psb_desc_mod
@@ -156,7 +156,25 @@ function psb_cdot_vect(x, y, desc_a,info,global) result(res)
   return
 
 end function psb_cdot_vect
-
+!
+! Function: psb_cdot
+!    psb_cdot computes the dot product of two distributed vectors,
+!
+!    dot := sub( X )**C * sub( Y )
+!
+!    where sub( X ) denotes X(:,JX)
+!
+!    sub( Y ) denotes Y(:,JY).
+!
+! Arguments:
+!    x(:,:) -  complex                The input vector containing the entries of sub( X ).
+!    y(:,:) -  complex                The input vector containing the entries of sub( Y ).
+!    desc_a -  type(psb_desc_type).  The communication descriptor.
+!    info   -  integer.              Return code
+!    jx     -  integer(optional).    The column offset for sub( X ).
+!    jy     -  integer(optional).    The column offset for sub( Y ).
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
+!
 function psb_cdot(x, y,desc_a, info, jx, jy,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_cdot
   implicit none
@@ -298,7 +316,7 @@ end function psb_cdot
 !!$
 ! 
 ! Function: psb_cdotv
-!    psb_cdotv forms the dot product of two distributed vectors,
+!    psb_cdotv computes the dot product of two distributed vectors,
 !
 !    dot := X**C * Y
 !
@@ -307,6 +325,7 @@ end function psb_cdot
 !    y(:)   -  complex               The input vector containing the entries of Y.
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
 !
 function psb_cdotv(x, y,desc_a, info,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_cdotv
@@ -432,7 +451,7 @@ end function psb_cdotv
 !!$
 !  
 ! Subroutine: psb_cdotvs
-!    psb_cdotvs forms the dot product of two distributed vectors,
+!    psb_cdotvs computes the dot product of two distributed vectors,
 !
 !    res := X**C * Y
 !
@@ -442,6 +461,7 @@ end function psb_cdotv
 !    y(:)   -  complex              The input vector containing the entries of Y.
 !    desc_a -  type(psb_desc_type). The communication descriptor.
 !    info   -  integer.             Return code
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
 !
 subroutine psb_cdotvs(res, x, y,desc_a, info,global)  
   use psb_base_mod, psb_protect_name => psb_cdotvs
@@ -565,7 +585,7 @@ end subroutine psb_cdotvs
 !!$
 !
 ! Subroutine: psb_cmdots
-!    psb_cmdots forms the dot product of multiple distributed vectors,
+!    psb_cmdots computes the dot product of multiple distributed vectors,
 !
 !    res(i) := ( X(:,i) )**C * ( Y(:,i) )
 !
@@ -575,6 +595,7 @@ end subroutine psb_cdotvs
 !    y(:)   -  complex              The input vector containing the entries of sub( Y ).
 !    desc_a -  type(psb_desc_type). The communication descriptor.
 !    info   -  integer.             Return code
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
 !
 subroutine psb_cmdots(res, x, y, desc_a, info,global)  
   use psb_base_mod, psb_protect_name => psb_cmdots
diff --git a/base/psblas/psb_cnrm2.f90 b/base/psblas/psb_cnrm2.f90
index 43ab876c..1db5773a 100644
--- a/base/psblas/psb_cnrm2.f90
+++ b/base/psblas/psb_cnrm2.f90
@@ -32,9 +32,9 @@
 ! File: psb_cnrm2.f90
 !
 ! Function: psb_cnrm2
-!    Forms the norm2 of a distributed vector,
+!    Computes the norm2 of a distributed vector,
 !
-!    norm2 := sqrt ( sub( X )**T * sub( X ) )
+!    norm2 := sqrt ( sub( X )**C * sub( X ) )
 !
 !    where sub( X ) denotes X(:,JX).
 !
@@ -43,6 +43,7 @@
 !    desc_a -  type(psb_desc_type). The communication descriptor.
 !    info   -  integer.             Return code
 !    jx     -  integer(optional).   The column offset for sub( X ).
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_cnrm2(x, desc_a, info, jx,global)  result(res)
   use psb_desc_mod
@@ -170,14 +171,15 @@ end function psb_cnrm2
 !!$
 !
 ! Function: psb_cnrm2
-!    Forms the norm2 of a distributed vector,
+!    Computes the norm2 of a distributed vector,
 !
-!    norm2 := sqrt ( X**T * X)
+!    norm2 := sqrt ( X**C * X)
 !
 ! Arguments:
 !    x(:)   -  complex               The input vector containing the entries of X.
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_cnrm2v(x, desc_a, info,global)  result(res)
   use psb_desc_mod
@@ -263,7 +265,17 @@ function psb_cnrm2v(x, desc_a, info,global)  result(res)
 end function psb_cnrm2v
 
 
-
+! Function: psb_cnrm2_vect
+!    Computes the norm2 of a distributed vector,
+!
+!    norm2 := sqrt ( X**C * X)
+!
+! Arguments:
+!    x      -  type(psb_c_vect_type) The input vector containing the entries of X.
+!    desc_a -  type(psb_desc_type).  The communication descriptor.
+!    info   -  integer.              Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
+!
 function psb_cnrm2_vect(x, desc_a, info,global)  result(res)
   use psb_desc_mod
   use psb_check_mod
@@ -344,7 +356,7 @@ function psb_cnrm2_vect(x, desc_a, info,global)  result(res)
         res = res - sqrt(cone - dd*(abs(x%v%v(idx))/res)**2)
       end do
     end if
-    else 	    
+  else 	    
     res = szero
   end if
 
@@ -392,15 +404,16 @@ end function psb_cnrm2_vect
 !!$
 !
 ! Subroutine: psb_cnrm2vs
-!    Forms the norm2 of a distributed vector,
+!    Computes the norm2 of a distributed vector, subroutine version
 !
-!    norm2 := sqrt ( X**T * X)
+!    norm2 := sqrt ( X**C * X)
 !
 ! Arguments:
 !    res    -  real                  The result.
 !    x(:)   -  complex               The input vector containing the entries of X.
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 subroutine psb_cnrm2vs(res, x, desc_a, info,global)
   use psb_desc_mod
diff --git a/base/psblas/psb_cnrmi.f90 b/base/psblas/psb_cnrmi.f90
index 0ac0a04b..0abece53 100644
--- a/base/psblas/psb_cnrmi.f90
+++ b/base/psblas/psb_cnrmi.f90
@@ -32,14 +32,15 @@
 ! File: psb_cnrmi.f90
 !
 ! Function: psb_cnrmi
-!    Forms the infinity norm of a sparse matrix,                           
+!    Computes the infinity norm of a sparse matrix,                           
 !
-!    nrmi := max_i(abs(sum(A(i,:))))                                              
+!    nrmi := max_i(sum(abs(A(i,:))))                                              
 !
 ! Arguments:
 !    a      -  type(psb_cspmat_type).   The sparse matrix containing A.
 !    desc_a -  type(psb_desc_type).     The communication descriptor.
 !    info   -  integer.                   Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_cnrmi(a,desc_a,info,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_cnrmi
diff --git a/base/psblas/psb_cspmm.f90 b/base/psblas/psb_cspmm.f90
index 3d1d3b7d..5a059647 100644
--- a/base/psblas/psb_cspmm.f90
+++ b/base/psblas/psb_cspmm.f90
@@ -31,6 +31,235 @@
 !    
 ! File: psb_cspmm.f90
 !
+!
+! Subroutine: psb_cspm_vect
+!     Performs one of the distributed matrix-vector operations
+!
+!     Y := alpha * Pr * A * Pc * X  + beta * Y,  or
+!
+!     Y := alpha * Pr * A' * Pr * X  + beta * Y,
+!
+!  alpha and beta are scalars, X and Y are distributed
+!  vectors and A is a M-by-N distributed matrix.
+!
+! Arguments:   
+!    alpha   -  complex                The scalar alpha.
+!    a       -  type(psb_cspmat_type). The sparse matrix containing A.
+!    x       -  type(psb_c_vect_type) The input vector containing the entries of ( X ).
+!    beta    -  complex                The scalar beta.
+!    y       -  type(psb_c_vect_type) The input vector containing the entries of ( Y ).
+!    desc_a  -  type(psb_desc_type).   The communication descriptor.
+!    info    -  integer.               Return code
+!    trans   -  character(optional).   Whether A or A'. Default:  'N' 
+!    work(:) -  complex,(optional).    Working area.
+!    doswap  -  logical(optional).     Whether to performe halo updates.
+! 
+subroutine  psb_cspmv_vect(alpha,a,x,beta,y,desc_a,info,&
+     & trans, work, doswap)   
+  use psb_base_mod, psb_protect_name => psb_cspmv_vect
+  use psi_mod
+  implicit none
+
+  complex(psb_spk_), intent(in)            :: alpha, beta
+  type(psb_c_vect_type), intent(inout)     :: x
+  type(psb_c_vect_type), intent(inout)     :: y
+  type(psb_cspmat_type), intent(in)        :: a
+  type(psb_desc_type), intent(in)          :: desc_a
+  integer(psb_ipk_), intent(out)                     :: info
+  complex(psb_spk_), optional, target, intent(inout) :: work(:)
+  character, intent(in), optional          :: trans
+  logical, intent(in), optional            :: doswap
+
+  ! locals
+  integer(psb_ipk_) :: ictxt, np, me,&
+       & err_act, iix, jjx, iia, jja,  nrow, ncol, lldx, lldy, &
+       & liwork, iiy, jjy, ib, ip, idx
+  integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n, ia, ja
+  integer(psb_ipk_), parameter       :: nb=4
+  complex(psb_spk_), pointer :: iwork(:), xp(:), yp(:)
+  complex(psb_spk_), allocatable :: xvsave(:)
+  character                :: trans_
+  character(len=20)        :: name, ch_err
+  logical                  :: aliw, doswap_
+  integer(psb_ipk_) :: debug_level, debug_unit
+
+  name='psb_cspmv'
+  info=psb_success_
+  call psb_erractionsave(err_act)
+  if  (psb_errstatus_fatal()) then
+    info = psb_err_internal_error_ ;    goto 9999
+  end if
+  debug_unit  = psb_get_debug_unit()
+  debug_level = psb_get_debug_level()
+
+  ictxt=desc_a%get_context()
+  call psb_info(ictxt, me, np)
+  if (np == -1) then
+    info = psb_err_context_error_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  if (.not.allocated(x%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+  if (.not.allocated(y%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  if (present(doswap)) then
+    doswap_ = doswap
+  else
+    doswap_ = .true.
+  endif
+
+  if (present(trans)) then     
+    trans_ = psb_toupper(trans)
+  else
+    trans_ = 'N'
+  endif
+  if ( (trans_ == 'N').or.(trans_ == 'T')&
+       & .or.(trans_ == 'C')) then
+  else
+    info = psb_err_iarg_invalid_value_
+    call psb_errpush(info,name)
+    goto 9999
+  end if
+
+  m    = desc_a%get_global_rows()
+  n    = desc_a%get_global_cols()
+  nrow = desc_a%get_local_rows()
+  ncol = desc_a%get_local_cols()
+  lldx = x%get_nrows()
+  lldy = y%get_nrows()
+  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
+  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
+
+  if (psb_errstatus_fatal()) then 
+    info=psb_err_from_subroutine_
+    ch_err='reall'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  iwork => null()
+  ! check for presence/size of a work area
+  liwork= 2*ncol
+
+  if (present(work)) then
+    if (size(work) >= liwork) then
+      aliw =.false.
+    else
+      aliw=.true.
+    endif
+  else
+    aliw=.true.
+  end if
+
+  if (aliw) then
+    allocate(iwork(liwork),stat=info)
+    if(info /= psb_success_) then
+      info=psb_err_from_subroutine_
+      ch_err='Allocate'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+    end if
+  else
+    iwork => work
+  endif
+
+  if (debug_level >= psb_debug_comp_) &
+       & write(debug_unit,*) me,' ',trim(name),' Allocated work ', info
+
+  if (trans_ == 'N') then
+    !  Matrix is not transposed
+
+    if (doswap_) then
+      call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
+           & czero,x%v,desc_a,iwork,info,data=psb_comm_halo_)
+    end if
+
+    call psb_csmm(alpha,a,x,beta,y,info)
+
+    if(info /= psb_success_) then
+      info = psb_err_from_subroutine_non_
+      call psb_errpush(info,name)
+      goto 9999
+    end if
+
+  else
+    !  Matrix is transposed
+
+    !
+    ! Non-empty overlap, need a buffer to hold
+    ! the entries updated with average operator.
+    ! Why the average? because in this way they will contribute
+    ! with a proper scale factor (1/np) to the overall product.
+    ! 
+    call psi_ovrl_save(x%v,xvsave,desc_a,info)
+    if (info == psb_success_) call psi_ovrl_upd(x%v,desc_a,psb_avg_,info)
+
+    if (beta /= czero) call y%set(czero,nrow+1,ncol)
+    !  local Matrix-vector product
+    if (info == psb_success_) call psb_csmm(alpha,a,x,beta,y,info,trans=trans_)
+
+    if (debug_level >= psb_debug_comp_) &
+         & write(debug_unit,*) me,' ',trim(name),' csmm ', info
+
+    if (info == psb_success_) call psi_ovrl_restore(x%v,xvsave,desc_a,info)
+    if (info /= psb_success_) then
+      info = psb_err_from_subroutine_
+      ch_err='psb_csmm'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+    end if
+
+    if (doswap_) then
+      call psi_swaptran(ior(psb_swap_send_,psb_swap_recv_),&
+           & cone,y%v,desc_a,iwork,info)
+      if (info == psb_success_) call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
+           & cone,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
+
+      if (debug_level >= psb_debug_comp_) &
+           & write(debug_unit,*) me,' ',trim(name),' swaptran ', info
+      if(info /= psb_success_) then
+        info = psb_err_from_subroutine_
+        ch_err='PSI_SwapTran'
+        call psb_errpush(info,name,a_err=ch_err)
+        goto 9999
+      end if
+    end if
+
+  end if
+
+  if (aliw) deallocate(iwork,stat=info)
+  if (debug_level >= psb_debug_comp_) &
+       & write(debug_unit,*) me,' ',trim(name),' deallocat ',aliw, info
+  if(info /= psb_success_) then
+    info = psb_err_from_subroutine_
+    ch_err='Deallocate iwork'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  nullify(iwork)
+
+  call psb_erractionrestore(err_act)
+  if (debug_level >= psb_debug_comp_) then 
+    call psb_barrier(ictxt)
+    write(debug_unit,*) me,' ',trim(name),' Returning '
+  endif
+  return  
+
+9999 call psb_error_handler(ictxt,err_act)
+
+  return
+end subroutine psb_cspmv_vect
+!
 ! Subroutine: psb_cspmm
 !     Performs one of the distributed matrix-vector operations
 !
@@ -356,10 +585,6 @@ subroutine  psb_cspmm(alpha,a,x,beta,y,desc_a,info,&
 
   return
 end subroutine psb_cspmm
-
-
-
-
 !!$ 
 !!$              Parallel Sparse BLAS  version 3.5
 !!$    (C) Copyright 2006-2018
@@ -670,211 +895,3 @@ subroutine  psb_cspmv(alpha,a,x,beta,y,desc_a,info,&
 
   return
 end subroutine psb_cspmv
-
-
-
-subroutine  psb_cspmv_vect(alpha,a,x,beta,y,desc_a,info,&
-     & trans, work, doswap)   
-  use psb_base_mod, psb_protect_name => psb_cspmv_vect
-  use psi_mod
-  implicit none
-
-  complex(psb_spk_), intent(in)            :: alpha, beta
-  type(psb_c_vect_type), intent(inout)     :: x
-  type(psb_c_vect_type), intent(inout)     :: y
-  type(psb_cspmat_type), intent(in)        :: a
-  type(psb_desc_type), intent(in)          :: desc_a
-  integer(psb_ipk_), intent(out)                     :: info
-  complex(psb_spk_), optional, target, intent(inout) :: work(:)
-  character, intent(in), optional          :: trans
-  logical, intent(in), optional            :: doswap
-
-  ! locals
-  integer(psb_ipk_) :: ictxt, np, me,&
-       & err_act, iix, jjx, iia, jja,  nrow, ncol, lldx, lldy, &
-       & liwork, iiy, jjy, ib, ip, idx
-  integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n, ia, ja
-  integer(psb_ipk_), parameter       :: nb=4
-  complex(psb_spk_), pointer :: iwork(:), xp(:), yp(:)
-  complex(psb_spk_), allocatable :: xvsave(:)
-  character                :: trans_
-  character(len=20)        :: name, ch_err
-  logical                  :: aliw, doswap_
-  integer(psb_ipk_) :: debug_level, debug_unit
-
-  name='psb_cspmv'
-  info=psb_success_
-  call psb_erractionsave(err_act)
-  if  (psb_errstatus_fatal()) then
-    info = psb_err_internal_error_ ;    goto 9999
-  end if
-  debug_unit  = psb_get_debug_unit()
-  debug_level = psb_get_debug_level()
-
-  ictxt=desc_a%get_context()
-  call psb_info(ictxt, me, np)
-  if (np == -1) then
-    info = psb_err_context_error_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (.not.allocated(x%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-  if (.not.allocated(y%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (present(doswap)) then
-    doswap_ = doswap
-  else
-    doswap_ = .true.
-  endif
-
-  if (present(trans)) then     
-    trans_ = psb_toupper(trans)
-  else
-    trans_ = 'N'
-  endif
-  if ( (trans_ == 'N').or.(trans_ == 'T')&
-       & .or.(trans_ == 'C')) then
-  else
-    info = psb_err_iarg_invalid_value_
-    call psb_errpush(info,name)
-    goto 9999
-  end if
-
-  m    = desc_a%get_global_rows()
-  n    = desc_a%get_global_cols()
-  nrow = desc_a%get_local_rows()
-  ncol = desc_a%get_local_cols()
-  lldx = x%get_nrows()
-  lldy = y%get_nrows()
-  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
-  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
-
-  if (psb_errstatus_fatal()) then 
-    info=psb_err_from_subroutine_
-    ch_err='reall'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  iwork => null()
-  ! check for presence/size of a work area
-  liwork= 2*ncol
-
-  if (present(work)) then
-    if (size(work) >= liwork) then
-      aliw =.false.
-    else
-      aliw=.true.
-    endif
-  else
-    aliw=.true.
-  end if
-
-  if (aliw) then
-    allocate(iwork(liwork),stat=info)
-    if(info /= psb_success_) then
-      info=psb_err_from_subroutine_
-      ch_err='Allocate'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-    end if
-  else
-    iwork => work
-  endif
-
-  if (debug_level >= psb_debug_comp_) &
-       & write(debug_unit,*) me,' ',trim(name),' Allocated work ', info
-
-  if (trans_ == 'N') then
-    !  Matrix is not transposed
-
-    if (doswap_) then
-      call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
-           & czero,x%v,desc_a,iwork,info,data=psb_comm_halo_)
-    end if
-
-    call psb_csmm(alpha,a,x,beta,y,info)
-
-    if(info /= psb_success_) then
-      info = psb_err_from_subroutine_non_
-      call psb_errpush(info,name)
-      goto 9999
-    end if
-
-  else
-    !  Matrix is transposed
-
-    !
-    ! Non-empty overlap, need a buffer to hold
-    ! the entries updated with average operator.
-    ! Why the average? because in this way they will contribute
-    ! with a proper scale factor (1/np) to the overall product.
-    ! 
-    call psi_ovrl_save(x%v,xvsave,desc_a,info)
-    if (info == psb_success_) call psi_ovrl_upd(x%v,desc_a,psb_avg_,info)
-
-    if (beta /= czero) call y%set(czero,nrow+1,ncol)
-    !  local Matrix-vector product
-    if (info == psb_success_) call psb_csmm(alpha,a,x,beta,y,info,trans=trans_)
-
-    if (debug_level >= psb_debug_comp_) &
-         & write(debug_unit,*) me,' ',trim(name),' csmm ', info
-
-    if (info == psb_success_) call psi_ovrl_restore(x%v,xvsave,desc_a,info)
-    if (info /= psb_success_) then
-      info = psb_err_from_subroutine_
-      ch_err='psb_csmm'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-    end if
-
-    if (doswap_) then
-      call psi_swaptran(ior(psb_swap_send_,psb_swap_recv_),&
-           & cone,y%v,desc_a,iwork,info)
-      if (info == psb_success_) call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
-           & cone,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
-
-      if (debug_level >= psb_debug_comp_) &
-           & write(debug_unit,*) me,' ',trim(name),' swaptran ', info
-      if(info /= psb_success_) then
-        info = psb_err_from_subroutine_
-        ch_err='PSI_SwapTran'
-        call psb_errpush(info,name,a_err=ch_err)
-        goto 9999
-      end if
-    end if
-
-  end if
-
-  if (aliw) deallocate(iwork,stat=info)
-  if (debug_level >= psb_debug_comp_) &
-       & write(debug_unit,*) me,' ',trim(name),' deallocat ',aliw, info
-  if(info /= psb_success_) then
-    info = psb_err_from_subroutine_
-    ch_err='Deallocate iwork'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  nullify(iwork)
-
-  call psb_erractionrestore(err_act)
-  if (debug_level >= psb_debug_comp_) then 
-    call psb_barrier(ictxt)
-    write(debug_unit,*) me,' ',trim(name),' Returning '
-  endif
-  return  
-
-9999 call psb_error_handler(ictxt,err_act)
-
-  return
-end subroutine psb_cspmv_vect
diff --git a/base/psblas/psb_cspnrm1.f90 b/base/psblas/psb_cspnrm1.f90
index a5076a0a..c49fdcc1 100644
--- a/base/psblas/psb_cspnrm1.f90
+++ b/base/psblas/psb_cspnrm1.f90
@@ -40,6 +40,7 @@
 !    a      -  type(psb_cspmat_type).   The sparse matrix containing A.
 !    desc_a -  type(psb_desc_type).     The communication descriptor.
 !    info   -  integer.                   Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_cspnrm1(a,desc_a,info,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_cspnrm1
diff --git a/base/psblas/psb_cspsm.f90 b/base/psblas/psb_cspsm.f90
index a8e0c570..3fc8138c 100644
--- a/base/psblas/psb_cspsm.f90
+++ b/base/psblas/psb_cspsm.f90
@@ -31,6 +31,203 @@
 !    
 ! File: psb_cspsm.f90
 !
+!
+! Subroutine: psb_csps_vect
+!  Performs one of the distributed matrix-vector operations
+!
+!     Y := alpha * Pr * A-1 * Pc * X + beta * Y,   or
+!
+!     Y := alpha * D * Pr * A-1 * Pc * X + beta * Y, or
+!
+!     Y := alpha * Pr * A-1 * Pc * D * X + beta * Y,  or
+!
+!     Y := alpha * Pr * A-T * Pc * X + beta * Y,   or
+!
+!     Y := alpha * D * Pr * A-T * Pc * X + beta * Y, or
+!
+!     Y := alpha * Pr * A-T * Pc * D * X + beta * Y,  or
+!
+!  X is a distributed
+!  vector and T is a M-by-M distributed triangular matrix.
+!
+!
+! Arguments:   
+!    alpha   -  complex.               The scalar alpha.
+!    a       -  type(psb_cspmat_type). The sparse matrix containing A.
+!    x       -  type(psb_c_vect_type) The input vector containing the entries of ( X ).
+!    beta    -  complex                The scalar beta.
+!    y       -  type(psb_c_vect_type) The input vector containing the entries of ( Y ).
+!    desc_a  -  type(psb_desc_type).   The communication descriptor.
+!    info    -  integer.               Return code
+!    trans   -  character(optional).   Whether A or A'. If not present 'N' is assumed.
+!    scale   -  character(optional).   Specify some type of operation with
+!                                      the diagonal matrix D.
+!    choice  -  integer(optional).     The kind of update to perform on overlap elements.
+!    d(:)    -  complex, optional      Matrix for diagonal scaling.
+!    work(:) -  complex, optional      Working area.
+! 
+subroutine  psb_cspsv_vect(alpha,a,x,beta,y,desc_a,info,&
+     & trans, scale, choice, diag, work)   
+  use psb_base_mod, psb_protect_name => psb_cspsv_vect
+  use psi_mod
+  implicit none 
+
+  complex(psb_spk_), intent(in)           :: alpha, beta
+  type(psb_c_vect_type), intent(inout)    :: x
+  type(psb_c_vect_type), intent(inout)    :: y
+  type(psb_cspmat_type), intent(inout)    :: a
+  type(psb_desc_type), intent(in)         :: desc_a
+  integer(psb_ipk_), intent(out)                    :: info
+  type(psb_c_vect_type), intent(inout), optional  :: diag
+  complex(psb_spk_), optional, target, intent(inout) :: work(:)
+  character, intent(in), optional         :: trans, scale
+  integer(psb_ipk_), intent(in), optional           :: choice
+
+  ! locals
+  integer(psb_ipk_) :: ictxt, np, me, &
+       & err_act, iix, jjx, ia, ja, iia, jja, lldx,lldy, choice_,&
+       & ix, iy, ik, jx, jy, i, lld,&
+       & m, nrow, ncol, liwork, llwork, iiy, jjy, idx, ndm
+
+  character                :: lscale
+  integer(psb_ipk_), parameter       :: nb=4
+  complex(psb_spk_),pointer :: iwork(:), xp(:), yp(:)
+  character                :: itrans
+  character(len=20)        :: name, ch_err
+  logical                  :: aliw
+
+  name='psb_cspsv_vect'
+  info=psb_success_
+  call psb_erractionsave(err_act)
+  if  (psb_errstatus_fatal()) then
+    info = psb_err_internal_error_ ;    goto 9999
+  end if
+
+  ictxt=desc_a%get_context()
+
+  call psb_info(ictxt, me, np)
+  if (np == -1) then
+    info = psb_err_context_error_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  if (.not.allocated(x%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+  if (.not.allocated(y%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+
+  if (present(choice)) then     
+    choice_ = choice
+  else
+    choice_ = psb_avg_
+  endif
+
+  if (present(scale)) then     
+    lscale = psb_toupper(scale)
+  else
+    lscale = 'U'
+  endif
+
+  if (present(trans)) then     
+    itrans = psb_toupper(trans)
+    if((itrans == 'N').or.(itrans == 'T').or.(itrans == 'C')) then
+      ! Ok
+    else
+      info = psb_err_iarg_invalid_value_
+      call psb_errpush(info,name)
+      goto 9999
+    end if
+  else
+    itrans = 'N'
+  endif
+
+  m    = desc_a%get_global_rows()
+  nrow = desc_a%get_local_rows()
+  ncol = desc_a%get_local_cols()
+  lldx = x%get_nrows()
+  lldy = y%get_nrows()
+  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
+  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
+
+  if (psb_errstatus_fatal()) then 
+    info=psb_err_from_subroutine_
+    ch_err='reall'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  iwork => null()
+  ! check for presence/size of a work area
+  liwork= 2*ncol
+
+  if (present(work)) then     
+    if (size(work) >= liwork) then 
+      aliw =.false.
+    else 
+      aliw=.true.
+    endif
+  else
+    aliw=.true.
+  end if
+
+  if (aliw) then 
+    allocate(iwork(liwork),stat=info)
+    if(info /= psb_success_) then
+      info=psb_err_from_subroutine_
+      ch_err='psb_realloc'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+    end if
+  else
+    iwork => work
+  endif
+
+  iwork(1)=0.d0
+
+  ! Perform local triangular system solve
+  if (present(diag)) then 
+    call a%spsm(alpha,x,beta,y,info,scale=scale,d=diag,trans=trans)    
+  else
+    call a%spsm(alpha,x,beta,y,info,scale=scale,trans=trans)    
+  end if
+  if(info /= psb_success_) then
+    info = psb_err_from_subroutine_
+    ch_err='dcssm'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  ! update overlap elements
+  if (choice_ > 0) then
+    call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
+         & cone,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
+
+
+    if (info == psb_success_) call psi_ovrl_upd(y%v,desc_a,choice_,info)
+    if (info /= psb_success_) then
+      call psb_errpush(psb_err_from_subroutine_,name,a_err='Inner updates')
+      goto 9999
+    end if
+  end if
+
+  if (aliw) deallocate(iwork)
+
+  call psb_erractionrestore(err_act)
+  return  
+
+9999 call psb_error_handler(ictxt,err_act)
+
+  return
+end subroutine psb_cspsv_vect
+!
 ! Subroutine: psb_cspsm
 !  Performs one of the distributed matrix-vector operations
 !
@@ -283,38 +480,6 @@ subroutine  psb_cspsm(alpha,a,x,beta,y,desc_a,info,&
 
   return
 end subroutine psb_cspsm
-
-!!$ 
-!!$              Parallel Sparse BLAS  version 3.5
-!!$    (C) Copyright 2006-2018
-!!$                       Salvatore Filippone    University of Rome Tor Vergata
-!!$                       Alfredo Buttari      
-!!$ 
-!!$  Redistribution and use in source and binary forms, with or without
-!!$  modification, are permitted provided that the following conditions
-!!$  are met:
-!!$    1. Redistributions of source code must retain the above copyright
-!!$       notice, this list of conditions and the following disclaimer.
-!!$    2. Redistributions in binary form must reproduce the above copyright
-!!$       notice, this list of conditions, and the following disclaimer in the
-!!$       documentation and/or other materials provided with the distribution.
-!!$    3. The name of the PSBLAS group or the names of its contributors may
-!!$       not be used to endorse or promote products derived from this
-!!$       software without specific written permission.
-!!$ 
-!!$  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-!!$  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-!!$  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-!!$  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
-!!$  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-!!$  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-!!$  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-!!$  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-!!$  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-!!$  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-!!$  POSSIBILITY OF SUCH DAMAGE.
-!!$ 
-!!$
 !
 ! Subroutine: psb_cspsv
 !  Performs one of the distributed matrix-vector operations
@@ -545,166 +710,3 @@ subroutine  psb_cspsv(alpha,a,x,beta,y,desc_a,info,&
   return
 end subroutine psb_cspsv
 
-
-subroutine  psb_cspsv_vect(alpha,a,x,beta,y,desc_a,info,&
-     & trans, scale, choice, diag, work)   
-  use psb_base_mod, psb_protect_name => psb_cspsv_vect
-  use psi_mod
-  implicit none 
-
-  complex(psb_spk_), intent(in)           :: alpha, beta
-  type(psb_c_vect_type), intent(inout)    :: x
-  type(psb_c_vect_type), intent(inout)    :: y
-  type(psb_cspmat_type), intent(inout)    :: a
-  type(psb_desc_type), intent(in)         :: desc_a
-  integer(psb_ipk_), intent(out)                    :: info
-  type(psb_c_vect_type), intent(inout), optional  :: diag
-  complex(psb_spk_), optional, target, intent(inout) :: work(:)
-  character, intent(in), optional         :: trans, scale
-  integer(psb_ipk_), intent(in), optional           :: choice
-
-  ! locals
-  integer(psb_ipk_) :: ictxt, np, me, &
-       & err_act, iix, jjx, ia, ja, iia, jja, lldx,lldy, choice_,&
-       & ix, iy, ik, jx, jy, i, lld,&
-       & m, nrow, ncol, liwork, llwork, iiy, jjy, idx, ndm
-
-  character                :: lscale
-  integer(psb_ipk_), parameter       :: nb=4
-  complex(psb_spk_),pointer :: iwork(:), xp(:), yp(:)
-  character                :: itrans
-  character(len=20)        :: name, ch_err
-  logical                  :: aliw
-
-  name='psb_sspsv'
-  info=psb_success_
-  call psb_erractionsave(err_act)
-  if  (psb_errstatus_fatal()) then
-    info = psb_err_internal_error_ ;    goto 9999
-  end if
-
-  ictxt=desc_a%get_context()
-
-  call psb_info(ictxt, me, np)
-  if (np == -1) then
-    info = psb_err_context_error_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (.not.allocated(x%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-  if (.not.allocated(y%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-
-  if (present(choice)) then     
-    choice_ = choice
-  else
-    choice_ = psb_avg_
-  endif
-
-  if (present(scale)) then     
-    lscale = psb_toupper(scale)
-  else
-    lscale = 'U'
-  endif
-
-  if (present(trans)) then     
-    itrans = psb_toupper(trans)
-    if((itrans == 'N').or.(itrans == 'T').or.(itrans == 'C')) then
-      ! Ok
-    else
-      info = psb_err_iarg_invalid_value_
-      call psb_errpush(info,name)
-      goto 9999
-    end if
-  else
-    itrans = 'N'
-  endif
-
-  m    = desc_a%get_global_rows()
-  nrow = desc_a%get_local_rows()
-  ncol = desc_a%get_local_cols()
-  lldx = x%get_nrows()
-  lldy = y%get_nrows()
-  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
-  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
-
-  if (psb_errstatus_fatal()) then 
-    info=psb_err_from_subroutine_
-    ch_err='reall'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  iwork => null()
-  ! check for presence/size of a work area
-  liwork= 2*ncol
-
-  if (present(work)) then     
-    if (size(work) >= liwork) then 
-      aliw =.false.
-    else 
-      aliw=.true.
-    endif
-  else
-    aliw=.true.
-  end if
-
-  if (aliw) then 
-    allocate(iwork(liwork),stat=info)
-    if(info /= psb_success_) then
-      info=psb_err_from_subroutine_
-      ch_err='psb_realloc'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-    end if
-  else
-    iwork => work
-  endif
-
-  iwork(1)=0.d0
-
-  ! Perform local triangular system solve
-  if (present(diag)) then 
-    call a%spsm(alpha,x,beta,y,info,scale=scale,d=diag,trans=trans)    
-  else
-    call a%spsm(alpha,x,beta,y,info,scale=scale,trans=trans)    
-  end if
-  if(info /= psb_success_) then
-    info = psb_err_from_subroutine_
-    ch_err='dcssm'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  ! update overlap elements
-  if (choice_ > 0) then
-    call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
-         & cone,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
-
-
-    if (info == psb_success_) call psi_ovrl_upd(y%v,desc_a,choice_,info)
-    if (info /= psb_success_) then
-      call psb_errpush(psb_err_from_subroutine_,name,a_err='Inner updates')
-      goto 9999
-    end if
-  end if
-
-  if (aliw) deallocate(iwork)
-
-  call psb_erractionrestore(err_act)
-  return  
-
-9999 call psb_error_handler(ictxt,err_act)
-
-  return
-end subroutine psb_cspsv_vect
-
diff --git a/base/psblas/psb_damax.f90 b/base/psblas/psb_damax.f90
index 33dd2dc9..0f3ea0b6 100644
--- a/base/psblas/psb_damax.f90
+++ b/base/psblas/psb_damax.f90
@@ -32,7 +32,7 @@
 ! File: psb_damax.f90
 !
 ! Function: psb_damax
-!    Searches the absolute max of X.
+!    Computes the maximum absolute value of X
 !
 !    normi := max(abs(sub(X)(i))  
 !
@@ -164,7 +164,7 @@ end function psb_damax
 !!$
 !
 ! Function: psb_damaxv
-!    Searches the absolute max of X.
+!    Computes the maximum absolute value  of X.
 !
 !    normi := max(abs(X(i))  
 !
@@ -252,6 +252,17 @@ function psb_damaxv (x,desc_a, info,global) result(res)
   return
 end function psb_damaxv
 
+!
+! Function: psb_damax_vect
+!    Computes the maximum absolute value  of X.
+!
+!    normi := max(abs(X(i))  
+!
+! Arguments:
+!    x      - type(psb_d_vect_type) The input vector.
+!    desc_a -  type(psb_desc_type).  The communication descriptor.
+!    info   -  integer.              Return code
+!
 
 function psb_damax_vect(x, desc_a, info,global) result(res)
   use psb_penv_mod
@@ -374,17 +385,16 @@ end function psb_damax_vect
 !!$  
 !
 ! Subroutine: psb_damaxvs
-!    Searches the absolute max of X.
+!    Computes the maximum absolute value of X, subroutine version
 !
 !    normi := max(abs(sub(X)(i))  
 !
-!    where sub( X ) denotes X(1:N,JX:).
+!    where sub( X ) denotes X(1:N).
 !
 ! Arguments:
 !    res    -  real                 The result.
-!    x(:,:) -  real              The input vector.
+!    x(:)   -  real              The input vector.
 !    desc_a -  type(psb_desc_type). The communication descriptor.
-!    info   -  integer.             Return code
 !    jx     -  integer(optional).   The column offset.
 !
 subroutine psb_damaxvs(res,x,desc_a, info,global)
diff --git a/base/psblas/psb_dasum.f90 b/base/psblas/psb_dasum.f90
index cf2d8fe3..5de367f7 100644
--- a/base/psblas/psb_dasum.f90
+++ b/base/psblas/psb_dasum.f90
@@ -135,6 +135,19 @@ function psb_dasum (x,desc_a, info, jx,global) result(res)
   return
 end function psb_dasum
 
+! Function: psb_dasum_vect
+!    Computes norm1 of X
+!
+!    norm1 := sum(sub( X )(i))
+!
+!    where sub( X ) denotes X(1:N,JX:).
+!
+! Arguments:
+!    x      - type(psb_d_vect_type) The input vector.
+!    desc_a -  type(psb_desc_type).   The communication descriptor.
+!    info   -  integer.               Return code
+!    jx     -  integer(optional).     The column offset.
+!
 
 function psb_dasum_vect(x, desc_a, info,global) result(res)
   use psb_base_mod, psb_protect_name => psb_dasum_vect
diff --git a/base/psblas/psb_daxpby.f90 b/base/psblas/psb_daxpby.f90
index b1c83d3a..73f62881 100644
--- a/base/psblas/psb_daxpby.f90
+++ b/base/psblas/psb_daxpby.f90
@@ -31,6 +31,23 @@
 !    
 ! File: psb_daxpby.f90
 
+!
+! Subroutine: psb_daxpby_vect
+!    Adds one distributed vector to another,
+!
+!    Y := beta * Y + alpha * X
+!
+! Arguments:
+!    alpha  -  real,input        The scalar used to multiply each component of X
+!    x      - type(psb_d_vect_type) The input vector containing the entries of X
+!    beta   -  real,input        The scalar used to multiply each component of Y
+!    y      - type(psb_d_vect_type)  The input/output vector Y
+!    desc_a -  type(psb_desc_type)  The communication descriptor.
+!    info   -  integer              Return code
+!
+!  Note: from a functional point of view, X is input, but here
+!        it's declared INOUT because of the sync() methods. 
+!
 subroutine psb_daxpby_vect(alpha, x, beta, y,&
      & desc_a, info)
   use psb_base_mod, psb_protect_name => psb_daxpby_vect
@@ -269,7 +286,7 @@ end subroutine psb_daxpby
 !!$  
 !
 ! Subroutine: psb_daxpbyv
-!    Adds one distributed matrix to another,
+!    Adds one distributed vector to another,
 !
 !    Y := beta * Y + alpha * X
 !
diff --git a/base/psblas/psb_ddot.f90 b/base/psblas/psb_ddot.f90
index c2231d8a..ba1d9619 100644
--- a/base/psblas/psb_ddot.f90
+++ b/base/psblas/psb_ddot.f90
@@ -31,22 +31,22 @@
 !    
 ! File: psb_ddot.f90
 !
-! Function: psb_ddot
-!    psb_ddot forms the dot product of two distributed vectors,
+! Function: psb_ddot_vect
+!    psb_ddot computes the dot product of two distributed vectors,
 !
-!    dot := sub( X )**C * sub( Y )
+!    dot := ( X )**C * ( Y )
 !
-!    where sub( X ) denotes X(:,JX)
-!
-!    sub( Y ) denotes Y(:,JY).
 !
 ! Arguments:
-!    x(:,:) -  complex               The input vector containing the entries of sub( X ).
-!    y(:,:) -  complex               The input vector containing the entries of sub( Y ).
+!    x      -  type(psb_d_vect_type) The input vector containing the entries of sub( X ).
+!    y      -  type(psb_d_vect_type) The input vector containing the entries of sub( Y ).
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
-!    jx     -  integer(optional).    The column offset for sub( X ).
-!    jy     -  integer(optional).    The column offset for sub( Y ).
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
+!
+!  Note: from a functional point of view, X and Y are input, but here
+!        they are declared INOUT because of the sync() methods. 
+!
 !
 function psb_ddot_vect(x, y, desc_a,info,global) result(res)
   use psb_desc_mod
@@ -156,7 +156,25 @@ function psb_ddot_vect(x, y, desc_a,info,global) result(res)
   return
 
 end function psb_ddot_vect
-
+!
+! Function: psb_ddot
+!    psb_ddot computes the dot product of two distributed vectors,
+!
+!    dot := sub( X )**C * sub( Y )
+!
+!    where sub( X ) denotes X(:,JX)
+!
+!    sub( Y ) denotes Y(:,JY).
+!
+! Arguments:
+!    x(:,:) -  real                The input vector containing the entries of sub( X ).
+!    y(:,:) -  real                The input vector containing the entries of sub( Y ).
+!    desc_a -  type(psb_desc_type).  The communication descriptor.
+!    info   -  integer.              Return code
+!    jx     -  integer(optional).    The column offset for sub( X ).
+!    jy     -  integer(optional).    The column offset for sub( Y ).
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
+!
 function psb_ddot(x, y,desc_a, info, jx, jy,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_ddot
   implicit none
@@ -298,7 +316,7 @@ end function psb_ddot
 !!$
 ! 
 ! Function: psb_ddotv
-!    psb_ddotv forms the dot product of two distributed vectors,
+!    psb_ddotv computes the dot product of two distributed vectors,
 !
 !    dot := X**C * Y
 !
@@ -307,6 +325,7 @@ end function psb_ddot
 !    y(:)   -  real               The input vector containing the entries of Y.
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
 !
 function psb_ddotv(x, y,desc_a, info,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_ddotv
@@ -432,7 +451,7 @@ end function psb_ddotv
 !!$
 !  
 ! Subroutine: psb_ddotvs
-!    psb_ddotvs forms the dot product of two distributed vectors,
+!    psb_ddotvs computes the dot product of two distributed vectors,
 !
 !    res := X**C * Y
 !
@@ -442,6 +461,7 @@ end function psb_ddotv
 !    y(:)   -  real              The input vector containing the entries of Y.
 !    desc_a -  type(psb_desc_type). The communication descriptor.
 !    info   -  integer.             Return code
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
 !
 subroutine psb_ddotvs(res, x, y,desc_a, info,global)  
   use psb_base_mod, psb_protect_name => psb_ddotvs
@@ -565,7 +585,7 @@ end subroutine psb_ddotvs
 !!$
 !
 ! Subroutine: psb_dmdots
-!    psb_dmdots forms the dot product of multiple distributed vectors,
+!    psb_dmdots computes the dot product of multiple distributed vectors,
 !
 !    res(i) := ( X(:,i) )**C * ( Y(:,i) )
 !
@@ -575,6 +595,7 @@ end subroutine psb_ddotvs
 !    y(:)   -  real              The input vector containing the entries of sub( Y ).
 !    desc_a -  type(psb_desc_type). The communication descriptor.
 !    info   -  integer.             Return code
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
 !
 subroutine psb_dmdots(res, x, y, desc_a, info,global)  
   use psb_base_mod, psb_protect_name => psb_dmdots
diff --git a/base/psblas/psb_dnrm2.f90 b/base/psblas/psb_dnrm2.f90
index b72a72ee..0845aac4 100644
--- a/base/psblas/psb_dnrm2.f90
+++ b/base/psblas/psb_dnrm2.f90
@@ -32,9 +32,9 @@
 ! File: psb_dnrm2.f90
 !
 ! Function: psb_dnrm2
-!    Forms the norm2 of a distributed vector,
+!    Computes the norm2 of a distributed vector,
 !
-!    norm2 := sqrt ( sub( X )**T * sub( X ) )
+!    norm2 := sqrt ( sub( X )**C * sub( X ) )
 !
 !    where sub( X ) denotes X(:,JX).
 !
@@ -43,6 +43,7 @@
 !    desc_a -  type(psb_desc_type). The communication descriptor.
 !    info   -  integer.             Return code
 !    jx     -  integer(optional).   The column offset for sub( X ).
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_dnrm2(x, desc_a, info, jx,global)  result(res)
   use psb_desc_mod
@@ -170,14 +171,15 @@ end function psb_dnrm2
 !!$
 !
 ! Function: psb_dnrm2
-!    Forms the norm2 of a distributed vector,
+!    Computes the norm2 of a distributed vector,
 !
-!    norm2 := sqrt ( X**T * X)
+!    norm2 := sqrt ( X**C * X)
 !
 ! Arguments:
 !    x(:)   -  real               The input vector containing the entries of X.
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_dnrm2v(x, desc_a, info,global)  result(res)
   use psb_desc_mod
@@ -263,7 +265,17 @@ function psb_dnrm2v(x, desc_a, info,global)  result(res)
 end function psb_dnrm2v
 
 
-
+! Function: psb_dnrm2_vect
+!    Computes the norm2 of a distributed vector,
+!
+!    norm2 := sqrt ( X**C * X)
+!
+! Arguments:
+!    x      -  type(psb_d_vect_type) The input vector containing the entries of X.
+!    desc_a -  type(psb_desc_type).  The communication descriptor.
+!    info   -  integer.              Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
+!
 function psb_dnrm2_vect(x, desc_a, info,global)  result(res)
   use psb_desc_mod
   use psb_check_mod
@@ -344,7 +356,7 @@ function psb_dnrm2_vect(x, desc_a, info,global)  result(res)
         res = res - sqrt(done - dd*(abs(x%v%v(idx))/res)**2)
       end do
     end if
-    else 	    
+  else 	    
     res = dzero
   end if
 
@@ -392,15 +404,16 @@ end function psb_dnrm2_vect
 !!$
 !
 ! Subroutine: psb_dnrm2vs
-!    Forms the norm2 of a distributed vector,
+!    Computes the norm2 of a distributed vector, subroutine version
 !
-!    norm2 := sqrt ( X**T * X)
+!    norm2 := sqrt ( X**C * X)
 !
 ! Arguments:
 !    res    -  real                  The result.
 !    x(:)   -  real               The input vector containing the entries of X.
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 subroutine psb_dnrm2vs(res, x, desc_a, info,global)
   use psb_desc_mod
diff --git a/base/psblas/psb_dnrmi.f90 b/base/psblas/psb_dnrmi.f90
index 6d585981..a6a97751 100644
--- a/base/psblas/psb_dnrmi.f90
+++ b/base/psblas/psb_dnrmi.f90
@@ -32,14 +32,15 @@
 ! File: psb_dnrmi.f90
 !
 ! Function: psb_dnrmi
-!    Forms the infinity norm of a sparse matrix,                           
+!    Computes the infinity norm of a sparse matrix,                           
 !
-!    nrmi := max_i(abs(sum(A(i,:))))                                              
+!    nrmi := max_i(sum(abs(A(i,:))))                                              
 !
 ! Arguments:
 !    a      -  type(psb_dspmat_type).   The sparse matrix containing A.
 !    desc_a -  type(psb_desc_type).     The communication descriptor.
 !    info   -  integer.                   Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_dnrmi(a,desc_a,info,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_dnrmi
diff --git a/base/psblas/psb_dspmm.f90 b/base/psblas/psb_dspmm.f90
index bf262541..1fdf6171 100644
--- a/base/psblas/psb_dspmm.f90
+++ b/base/psblas/psb_dspmm.f90
@@ -31,6 +31,235 @@
 !    
 ! File: psb_dspmm.f90
 !
+!
+! Subroutine: psb_dspm_vect
+!     Performs one of the distributed matrix-vector operations
+!
+!     Y := alpha * Pr * A * Pc * X  + beta * Y,  or
+!
+!     Y := alpha * Pr * A' * Pr * X  + beta * Y,
+!
+!  alpha and beta are scalars, X and Y are distributed
+!  vectors and A is a M-by-N distributed matrix.
+!
+! Arguments:   
+!    alpha   -  real                The scalar alpha.
+!    a       -  type(psb_dspmat_type). The sparse matrix containing A.
+!    x       -  type(psb_d_vect_type) The input vector containing the entries of ( X ).
+!    beta    -  real                The scalar beta.
+!    y       -  type(psb_d_vect_type) The input vector containing the entries of ( Y ).
+!    desc_a  -  type(psb_desc_type).   The communication descriptor.
+!    info    -  integer.               Return code
+!    trans   -  character(optional).   Whether A or A'. Default:  'N' 
+!    work(:) -  real,(optional).    Working area.
+!    doswap  -  logical(optional).     Whether to performe halo updates.
+! 
+subroutine  psb_dspmv_vect(alpha,a,x,beta,y,desc_a,info,&
+     & trans, work, doswap)   
+  use psb_base_mod, psb_protect_name => psb_dspmv_vect
+  use psi_mod
+  implicit none
+
+  real(psb_dpk_), intent(in)            :: alpha, beta
+  type(psb_d_vect_type), intent(inout)     :: x
+  type(psb_d_vect_type), intent(inout)     :: y
+  type(psb_dspmat_type), intent(in)        :: a
+  type(psb_desc_type), intent(in)          :: desc_a
+  integer(psb_ipk_), intent(out)                     :: info
+  real(psb_dpk_), optional, target, intent(inout) :: work(:)
+  character, intent(in), optional          :: trans
+  logical, intent(in), optional            :: doswap
+
+  ! locals
+  integer(psb_ipk_) :: ictxt, np, me,&
+       & err_act, iix, jjx, iia, jja,  nrow, ncol, lldx, lldy, &
+       & liwork, iiy, jjy, ib, ip, idx
+  integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n, ia, ja
+  integer(psb_ipk_), parameter       :: nb=4
+  real(psb_dpk_), pointer :: iwork(:), xp(:), yp(:)
+  real(psb_dpk_), allocatable :: xvsave(:)
+  character                :: trans_
+  character(len=20)        :: name, ch_err
+  logical                  :: aliw, doswap_
+  integer(psb_ipk_) :: debug_level, debug_unit
+
+  name='psb_dspmv'
+  info=psb_success_
+  call psb_erractionsave(err_act)
+  if  (psb_errstatus_fatal()) then
+    info = psb_err_internal_error_ ;    goto 9999
+  end if
+  debug_unit  = psb_get_debug_unit()
+  debug_level = psb_get_debug_level()
+
+  ictxt=desc_a%get_context()
+  call psb_info(ictxt, me, np)
+  if (np == -1) then
+    info = psb_err_context_error_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  if (.not.allocated(x%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+  if (.not.allocated(y%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  if (present(doswap)) then
+    doswap_ = doswap
+  else
+    doswap_ = .true.
+  endif
+
+  if (present(trans)) then     
+    trans_ = psb_toupper(trans)
+  else
+    trans_ = 'N'
+  endif
+  if ( (trans_ == 'N').or.(trans_ == 'T')&
+       & .or.(trans_ == 'C')) then
+  else
+    info = psb_err_iarg_invalid_value_
+    call psb_errpush(info,name)
+    goto 9999
+  end if
+
+  m    = desc_a%get_global_rows()
+  n    = desc_a%get_global_cols()
+  nrow = desc_a%get_local_rows()
+  ncol = desc_a%get_local_cols()
+  lldx = x%get_nrows()
+  lldy = y%get_nrows()
+  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
+  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
+
+  if (psb_errstatus_fatal()) then 
+    info=psb_err_from_subroutine_
+    ch_err='reall'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  iwork => null()
+  ! check for presence/size of a work area
+  liwork= 2*ncol
+
+  if (present(work)) then
+    if (size(work) >= liwork) then
+      aliw =.false.
+    else
+      aliw=.true.
+    endif
+  else
+    aliw=.true.
+  end if
+
+  if (aliw) then
+    allocate(iwork(liwork),stat=info)
+    if(info /= psb_success_) then
+      info=psb_err_from_subroutine_
+      ch_err='Allocate'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+    end if
+  else
+    iwork => work
+  endif
+
+  if (debug_level >= psb_debug_comp_) &
+       & write(debug_unit,*) me,' ',trim(name),' Allocated work ', info
+
+  if (trans_ == 'N') then
+    !  Matrix is not transposed
+
+    if (doswap_) then
+      call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
+           & dzero,x%v,desc_a,iwork,info,data=psb_comm_halo_)
+    end if
+
+    call psb_csmm(alpha,a,x,beta,y,info)
+
+    if(info /= psb_success_) then
+      info = psb_err_from_subroutine_non_
+      call psb_errpush(info,name)
+      goto 9999
+    end if
+
+  else
+    !  Matrix is transposed
+
+    !
+    ! Non-empty overlap, need a buffer to hold
+    ! the entries updated with average operator.
+    ! Why the average? because in this way they will contribute
+    ! with a proper scale factor (1/np) to the overall product.
+    ! 
+    call psi_ovrl_save(x%v,xvsave,desc_a,info)
+    if (info == psb_success_) call psi_ovrl_upd(x%v,desc_a,psb_avg_,info)
+
+    if (beta /= dzero) call y%set(dzero,nrow+1,ncol)
+    !  local Matrix-vector product
+    if (info == psb_success_) call psb_csmm(alpha,a,x,beta,y,info,trans=trans_)
+
+    if (debug_level >= psb_debug_comp_) &
+         & write(debug_unit,*) me,' ',trim(name),' csmm ', info
+
+    if (info == psb_success_) call psi_ovrl_restore(x%v,xvsave,desc_a,info)
+    if (info /= psb_success_) then
+      info = psb_err_from_subroutine_
+      ch_err='psb_csmm'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+    end if
+
+    if (doswap_) then
+      call psi_swaptran(ior(psb_swap_send_,psb_swap_recv_),&
+           & done,y%v,desc_a,iwork,info)
+      if (info == psb_success_) call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
+           & done,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
+
+      if (debug_level >= psb_debug_comp_) &
+           & write(debug_unit,*) me,' ',trim(name),' swaptran ', info
+      if(info /= psb_success_) then
+        info = psb_err_from_subroutine_
+        ch_err='PSI_SwapTran'
+        call psb_errpush(info,name,a_err=ch_err)
+        goto 9999
+      end if
+    end if
+
+  end if
+
+  if (aliw) deallocate(iwork,stat=info)
+  if (debug_level >= psb_debug_comp_) &
+       & write(debug_unit,*) me,' ',trim(name),' deallocat ',aliw, info
+  if(info /= psb_success_) then
+    info = psb_err_from_subroutine_
+    ch_err='Deallocate iwork'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  nullify(iwork)
+
+  call psb_erractionrestore(err_act)
+  if (debug_level >= psb_debug_comp_) then 
+    call psb_barrier(ictxt)
+    write(debug_unit,*) me,' ',trim(name),' Returning '
+  endif
+  return  
+
+9999 call psb_error_handler(ictxt,err_act)
+
+  return
+end subroutine psb_dspmv_vect
+!
 ! Subroutine: psb_dspmm
 !     Performs one of the distributed matrix-vector operations
 !
@@ -356,10 +585,6 @@ subroutine  psb_dspmm(alpha,a,x,beta,y,desc_a,info,&
 
   return
 end subroutine psb_dspmm
-
-
-
-
 !!$ 
 !!$              Parallel Sparse BLAS  version 3.5
 !!$    (C) Copyright 2006-2018
@@ -670,211 +895,3 @@ subroutine  psb_dspmv(alpha,a,x,beta,y,desc_a,info,&
 
   return
 end subroutine psb_dspmv
-
-
-
-subroutine  psb_dspmv_vect(alpha,a,x,beta,y,desc_a,info,&
-     & trans, work, doswap)   
-  use psb_base_mod, psb_protect_name => psb_dspmv_vect
-  use psi_mod
-  implicit none
-
-  real(psb_dpk_), intent(in)            :: alpha, beta
-  type(psb_d_vect_type), intent(inout)     :: x
-  type(psb_d_vect_type), intent(inout)     :: y
-  type(psb_dspmat_type), intent(in)        :: a
-  type(psb_desc_type), intent(in)          :: desc_a
-  integer(psb_ipk_), intent(out)                     :: info
-  real(psb_dpk_), optional, target, intent(inout) :: work(:)
-  character, intent(in), optional          :: trans
-  logical, intent(in), optional            :: doswap
-
-  ! locals
-  integer(psb_ipk_) :: ictxt, np, me,&
-       & err_act, iix, jjx, iia, jja,  nrow, ncol, lldx, lldy, &
-       & liwork, iiy, jjy, ib, ip, idx
-  integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n, ia, ja
-  integer(psb_ipk_), parameter       :: nb=4
-  real(psb_dpk_), pointer :: iwork(:), xp(:), yp(:)
-  real(psb_dpk_), allocatable :: xvsave(:)
-  character                :: trans_
-  character(len=20)        :: name, ch_err
-  logical                  :: aliw, doswap_
-  integer(psb_ipk_) :: debug_level, debug_unit
-
-  name='psb_dspmv'
-  info=psb_success_
-  call psb_erractionsave(err_act)
-  if  (psb_errstatus_fatal()) then
-    info = psb_err_internal_error_ ;    goto 9999
-  end if
-  debug_unit  = psb_get_debug_unit()
-  debug_level = psb_get_debug_level()
-
-  ictxt=desc_a%get_context()
-  call psb_info(ictxt, me, np)
-  if (np == -1) then
-    info = psb_err_context_error_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (.not.allocated(x%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-  if (.not.allocated(y%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (present(doswap)) then
-    doswap_ = doswap
-  else
-    doswap_ = .true.
-  endif
-
-  if (present(trans)) then     
-    trans_ = psb_toupper(trans)
-  else
-    trans_ = 'N'
-  endif
-  if ( (trans_ == 'N').or.(trans_ == 'T')&
-       & .or.(trans_ == 'C')) then
-  else
-    info = psb_err_iarg_invalid_value_
-    call psb_errpush(info,name)
-    goto 9999
-  end if
-
-  m    = desc_a%get_global_rows()
-  n    = desc_a%get_global_cols()
-  nrow = desc_a%get_local_rows()
-  ncol = desc_a%get_local_cols()
-  lldx = x%get_nrows()
-  lldy = y%get_nrows()
-  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
-  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
-
-  if (psb_errstatus_fatal()) then 
-    info=psb_err_from_subroutine_
-    ch_err='reall'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  iwork => null()
-  ! check for presence/size of a work area
-  liwork= 2*ncol
-
-  if (present(work)) then
-    if (size(work) >= liwork) then
-      aliw =.false.
-    else
-      aliw=.true.
-    endif
-  else
-    aliw=.true.
-  end if
-
-  if (aliw) then
-    allocate(iwork(liwork),stat=info)
-    if(info /= psb_success_) then
-      info=psb_err_from_subroutine_
-      ch_err='Allocate'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-    end if
-  else
-    iwork => work
-  endif
-
-  if (debug_level >= psb_debug_comp_) &
-       & write(debug_unit,*) me,' ',trim(name),' Allocated work ', info
-
-  if (trans_ == 'N') then
-    !  Matrix is not transposed
-
-    if (doswap_) then
-      call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
-           & dzero,x%v,desc_a,iwork,info,data=psb_comm_halo_)
-    end if
-
-    call psb_csmm(alpha,a,x,beta,y,info)
-
-    if(info /= psb_success_) then
-      info = psb_err_from_subroutine_non_
-      call psb_errpush(info,name)
-      goto 9999
-    end if
-
-  else
-    !  Matrix is transposed
-
-    !
-    ! Non-empty overlap, need a buffer to hold
-    ! the entries updated with average operator.
-    ! Why the average? because in this way they will contribute
-    ! with a proper scale factor (1/np) to the overall product.
-    ! 
-    call psi_ovrl_save(x%v,xvsave,desc_a,info)
-    if (info == psb_success_) call psi_ovrl_upd(x%v,desc_a,psb_avg_,info)
-
-    if (beta /= dzero) call y%set(dzero,nrow+1,ncol)
-    !  local Matrix-vector product
-    if (info == psb_success_) call psb_csmm(alpha,a,x,beta,y,info,trans=trans_)
-
-    if (debug_level >= psb_debug_comp_) &
-         & write(debug_unit,*) me,' ',trim(name),' csmm ', info
-
-    if (info == psb_success_) call psi_ovrl_restore(x%v,xvsave,desc_a,info)
-    if (info /= psb_success_) then
-      info = psb_err_from_subroutine_
-      ch_err='psb_csmm'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-    end if
-
-    if (doswap_) then
-      call psi_swaptran(ior(psb_swap_send_,psb_swap_recv_),&
-           & done,y%v,desc_a,iwork,info)
-      if (info == psb_success_) call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
-           & done,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
-
-      if (debug_level >= psb_debug_comp_) &
-           & write(debug_unit,*) me,' ',trim(name),' swaptran ', info
-      if(info /= psb_success_) then
-        info = psb_err_from_subroutine_
-        ch_err='PSI_SwapTran'
-        call psb_errpush(info,name,a_err=ch_err)
-        goto 9999
-      end if
-    end if
-
-  end if
-
-  if (aliw) deallocate(iwork,stat=info)
-  if (debug_level >= psb_debug_comp_) &
-       & write(debug_unit,*) me,' ',trim(name),' deallocat ',aliw, info
-  if(info /= psb_success_) then
-    info = psb_err_from_subroutine_
-    ch_err='Deallocate iwork'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  nullify(iwork)
-
-  call psb_erractionrestore(err_act)
-  if (debug_level >= psb_debug_comp_) then 
-    call psb_barrier(ictxt)
-    write(debug_unit,*) me,' ',trim(name),' Returning '
-  endif
-  return  
-
-9999 call psb_error_handler(ictxt,err_act)
-
-  return
-end subroutine psb_dspmv_vect
diff --git a/base/psblas/psb_dspnrm1.f90 b/base/psblas/psb_dspnrm1.f90
index 01a7de96..9d367615 100644
--- a/base/psblas/psb_dspnrm1.f90
+++ b/base/psblas/psb_dspnrm1.f90
@@ -40,6 +40,7 @@
 !    a      -  type(psb_dspmat_type).   The sparse matrix containing A.
 !    desc_a -  type(psb_desc_type).     The communication descriptor.
 !    info   -  integer.                   Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_dspnrm1(a,desc_a,info,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_dspnrm1
diff --git a/base/psblas/psb_dspsm.f90 b/base/psblas/psb_dspsm.f90
index 4efad6b7..f1a019ad 100644
--- a/base/psblas/psb_dspsm.f90
+++ b/base/psblas/psb_dspsm.f90
@@ -31,6 +31,203 @@
 !    
 ! File: psb_dspsm.f90
 !
+!
+! Subroutine: psb_dsps_vect
+!  Performs one of the distributed matrix-vector operations
+!
+!     Y := alpha * Pr * A-1 * Pc * X + beta * Y,   or
+!
+!     Y := alpha * D * Pr * A-1 * Pc * X + beta * Y, or
+!
+!     Y := alpha * Pr * A-1 * Pc * D * X + beta * Y,  or
+!
+!     Y := alpha * Pr * A-T * Pc * X + beta * Y,   or
+!
+!     Y := alpha * D * Pr * A-T * Pc * X + beta * Y, or
+!
+!     Y := alpha * Pr * A-T * Pc * D * X + beta * Y,  or
+!
+!  X is a distributed
+!  vector and T is a M-by-M distributed triangular matrix.
+!
+!
+! Arguments:   
+!    alpha   -  real.               The scalar alpha.
+!    a       -  type(psb_dspmat_type). The sparse matrix containing A.
+!    x       -  type(psb_d_vect_type) The input vector containing the entries of ( X ).
+!    beta    -  real                The scalar beta.
+!    y       -  type(psb_d_vect_type) The input vector containing the entries of ( Y ).
+!    desc_a  -  type(psb_desc_type).   The communication descriptor.
+!    info    -  integer.               Return code
+!    trans   -  character(optional).   Whether A or A'. If not present 'N' is assumed.
+!    scale   -  character(optional).   Specify some type of operation with
+!                                      the diagonal matrix D.
+!    choice  -  integer(optional).     The kind of update to perform on overlap elements.
+!    d(:)    -  real, optional      Matrix for diagonal scaling.
+!    work(:) -  real, optional      Working area.
+! 
+subroutine  psb_dspsv_vect(alpha,a,x,beta,y,desc_a,info,&
+     & trans, scale, choice, diag, work)   
+  use psb_base_mod, psb_protect_name => psb_dspsv_vect
+  use psi_mod
+  implicit none 
+
+  real(psb_dpk_), intent(in)           :: alpha, beta
+  type(psb_d_vect_type), intent(inout)    :: x
+  type(psb_d_vect_type), intent(inout)    :: y
+  type(psb_dspmat_type), intent(inout)    :: a
+  type(psb_desc_type), intent(in)         :: desc_a
+  integer(psb_ipk_), intent(out)                    :: info
+  type(psb_d_vect_type), intent(inout), optional  :: diag
+  real(psb_dpk_), optional, target, intent(inout) :: work(:)
+  character, intent(in), optional         :: trans, scale
+  integer(psb_ipk_), intent(in), optional           :: choice
+
+  ! locals
+  integer(psb_ipk_) :: ictxt, np, me, &
+       & err_act, iix, jjx, ia, ja, iia, jja, lldx,lldy, choice_,&
+       & ix, iy, ik, jx, jy, i, lld,&
+       & m, nrow, ncol, liwork, llwork, iiy, jjy, idx, ndm
+
+  character                :: lscale
+  integer(psb_ipk_), parameter       :: nb=4
+  real(psb_dpk_),pointer :: iwork(:), xp(:), yp(:)
+  character                :: itrans
+  character(len=20)        :: name, ch_err
+  logical                  :: aliw
+
+  name='psb_dspsv_vect'
+  info=psb_success_
+  call psb_erractionsave(err_act)
+  if  (psb_errstatus_fatal()) then
+    info = psb_err_internal_error_ ;    goto 9999
+  end if
+
+  ictxt=desc_a%get_context()
+
+  call psb_info(ictxt, me, np)
+  if (np == -1) then
+    info = psb_err_context_error_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  if (.not.allocated(x%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+  if (.not.allocated(y%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+
+  if (present(choice)) then     
+    choice_ = choice
+  else
+    choice_ = psb_avg_
+  endif
+
+  if (present(scale)) then     
+    lscale = psb_toupper(scale)
+  else
+    lscale = 'U'
+  endif
+
+  if (present(trans)) then     
+    itrans = psb_toupper(trans)
+    if((itrans == 'N').or.(itrans == 'T').or.(itrans == 'C')) then
+      ! Ok
+    else
+      info = psb_err_iarg_invalid_value_
+      call psb_errpush(info,name)
+      goto 9999
+    end if
+  else
+    itrans = 'N'
+  endif
+
+  m    = desc_a%get_global_rows()
+  nrow = desc_a%get_local_rows()
+  ncol = desc_a%get_local_cols()
+  lldx = x%get_nrows()
+  lldy = y%get_nrows()
+  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
+  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
+
+  if (psb_errstatus_fatal()) then 
+    info=psb_err_from_subroutine_
+    ch_err='reall'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  iwork => null()
+  ! check for presence/size of a work area
+  liwork= 2*ncol
+
+  if (present(work)) then     
+    if (size(work) >= liwork) then 
+      aliw =.false.
+    else 
+      aliw=.true.
+    endif
+  else
+    aliw=.true.
+  end if
+
+  if (aliw) then 
+    allocate(iwork(liwork),stat=info)
+    if(info /= psb_success_) then
+      info=psb_err_from_subroutine_
+      ch_err='psb_realloc'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+    end if
+  else
+    iwork => work
+  endif
+
+  iwork(1)=0.d0
+
+  ! Perform local triangular system solve
+  if (present(diag)) then 
+    call a%spsm(alpha,x,beta,y,info,scale=scale,d=diag,trans=trans)    
+  else
+    call a%spsm(alpha,x,beta,y,info,scale=scale,trans=trans)    
+  end if
+  if(info /= psb_success_) then
+    info = psb_err_from_subroutine_
+    ch_err='dcssm'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  ! update overlap elements
+  if (choice_ > 0) then
+    call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
+         & done,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
+
+
+    if (info == psb_success_) call psi_ovrl_upd(y%v,desc_a,choice_,info)
+    if (info /= psb_success_) then
+      call psb_errpush(psb_err_from_subroutine_,name,a_err='Inner updates')
+      goto 9999
+    end if
+  end if
+
+  if (aliw) deallocate(iwork)
+
+  call psb_erractionrestore(err_act)
+  return  
+
+9999 call psb_error_handler(ictxt,err_act)
+
+  return
+end subroutine psb_dspsv_vect
+!
 ! Subroutine: psb_dspsm
 !  Performs one of the distributed matrix-vector operations
 !
@@ -283,38 +480,6 @@ subroutine  psb_dspsm(alpha,a,x,beta,y,desc_a,info,&
 
   return
 end subroutine psb_dspsm
-
-!!$ 
-!!$              Parallel Sparse BLAS  version 3.5
-!!$    (C) Copyright 2006-2018
-!!$                       Salvatore Filippone    University of Rome Tor Vergata
-!!$                       Alfredo Buttari      
-!!$ 
-!!$  Redistribution and use in source and binary forms, with or without
-!!$  modification, are permitted provided that the following conditions
-!!$  are met:
-!!$    1. Redistributions of source code must retain the above copyright
-!!$       notice, this list of conditions and the following disclaimer.
-!!$    2. Redistributions in binary form must reproduce the above copyright
-!!$       notice, this list of conditions, and the following disclaimer in the
-!!$       documentation and/or other materials provided with the distribution.
-!!$    3. The name of the PSBLAS group or the names of its contributors may
-!!$       not be used to endorse or promote products derived from this
-!!$       software without specific written permission.
-!!$ 
-!!$  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-!!$  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-!!$  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-!!$  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
-!!$  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-!!$  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-!!$  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-!!$  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-!!$  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-!!$  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-!!$  POSSIBILITY OF SUCH DAMAGE.
-!!$ 
-!!$
 !
 ! Subroutine: psb_dspsv
 !  Performs one of the distributed matrix-vector operations
@@ -545,166 +710,3 @@ subroutine  psb_dspsv(alpha,a,x,beta,y,desc_a,info,&
   return
 end subroutine psb_dspsv
 
-
-subroutine  psb_dspsv_vect(alpha,a,x,beta,y,desc_a,info,&
-     & trans, scale, choice, diag, work)   
-  use psb_base_mod, psb_protect_name => psb_dspsv_vect
-  use psi_mod
-  implicit none 
-
-  real(psb_dpk_), intent(in)           :: alpha, beta
-  type(psb_d_vect_type), intent(inout)    :: x
-  type(psb_d_vect_type), intent(inout)    :: y
-  type(psb_dspmat_type), intent(inout)    :: a
-  type(psb_desc_type), intent(in)         :: desc_a
-  integer(psb_ipk_), intent(out)                    :: info
-  type(psb_d_vect_type), intent(inout), optional  :: diag
-  real(psb_dpk_), optional, target, intent(inout) :: work(:)
-  character, intent(in), optional         :: trans, scale
-  integer(psb_ipk_), intent(in), optional           :: choice
-
-  ! locals
-  integer(psb_ipk_) :: ictxt, np, me, &
-       & err_act, iix, jjx, ia, ja, iia, jja, lldx,lldy, choice_,&
-       & ix, iy, ik, jx, jy, i, lld,&
-       & m, nrow, ncol, liwork, llwork, iiy, jjy, idx, ndm
-
-  character                :: lscale
-  integer(psb_ipk_), parameter       :: nb=4
-  real(psb_dpk_),pointer :: iwork(:), xp(:), yp(:)
-  character                :: itrans
-  character(len=20)        :: name, ch_err
-  logical                  :: aliw
-
-  name='psb_sspsv'
-  info=psb_success_
-  call psb_erractionsave(err_act)
-  if  (psb_errstatus_fatal()) then
-    info = psb_err_internal_error_ ;    goto 9999
-  end if
-
-  ictxt=desc_a%get_context()
-
-  call psb_info(ictxt, me, np)
-  if (np == -1) then
-    info = psb_err_context_error_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (.not.allocated(x%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-  if (.not.allocated(y%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-
-  if (present(choice)) then     
-    choice_ = choice
-  else
-    choice_ = psb_avg_
-  endif
-
-  if (present(scale)) then     
-    lscale = psb_toupper(scale)
-  else
-    lscale = 'U'
-  endif
-
-  if (present(trans)) then     
-    itrans = psb_toupper(trans)
-    if((itrans == 'N').or.(itrans == 'T').or.(itrans == 'C')) then
-      ! Ok
-    else
-      info = psb_err_iarg_invalid_value_
-      call psb_errpush(info,name)
-      goto 9999
-    end if
-  else
-    itrans = 'N'
-  endif
-
-  m    = desc_a%get_global_rows()
-  nrow = desc_a%get_local_rows()
-  ncol = desc_a%get_local_cols()
-  lldx = x%get_nrows()
-  lldy = y%get_nrows()
-  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
-  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
-
-  if (psb_errstatus_fatal()) then 
-    info=psb_err_from_subroutine_
-    ch_err='reall'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  iwork => null()
-  ! check for presence/size of a work area
-  liwork= 2*ncol
-
-  if (present(work)) then     
-    if (size(work) >= liwork) then 
-      aliw =.false.
-    else 
-      aliw=.true.
-    endif
-  else
-    aliw=.true.
-  end if
-
-  if (aliw) then 
-    allocate(iwork(liwork),stat=info)
-    if(info /= psb_success_) then
-      info=psb_err_from_subroutine_
-      ch_err='psb_realloc'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-    end if
-  else
-    iwork => work
-  endif
-
-  iwork(1)=0.d0
-
-  ! Perform local triangular system solve
-  if (present(diag)) then 
-    call a%spsm(alpha,x,beta,y,info,scale=scale,d=diag,trans=trans)    
-  else
-    call a%spsm(alpha,x,beta,y,info,scale=scale,trans=trans)    
-  end if
-  if(info /= psb_success_) then
-    info = psb_err_from_subroutine_
-    ch_err='dcssm'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  ! update overlap elements
-  if (choice_ > 0) then
-    call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
-         & done,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
-
-
-    if (info == psb_success_) call psi_ovrl_upd(y%v,desc_a,choice_,info)
-    if (info /= psb_success_) then
-      call psb_errpush(psb_err_from_subroutine_,name,a_err='Inner updates')
-      goto 9999
-    end if
-  end if
-
-  if (aliw) deallocate(iwork)
-
-  call psb_erractionrestore(err_act)
-  return  
-
-9999 call psb_error_handler(ictxt,err_act)
-
-  return
-end subroutine psb_dspsv_vect
-
diff --git a/base/psblas/psb_samax.f90 b/base/psblas/psb_samax.f90
index ee70314f..174c7a28 100644
--- a/base/psblas/psb_samax.f90
+++ b/base/psblas/psb_samax.f90
@@ -32,7 +32,7 @@
 ! File: psb_samax.f90
 !
 ! Function: psb_samax
-!    Searches the absolute max of X.
+!    Computes the maximum absolute value of X
 !
 !    normi := max(abs(sub(X)(i))  
 !
@@ -164,7 +164,7 @@ end function psb_samax
 !!$
 !
 ! Function: psb_samaxv
-!    Searches the absolute max of X.
+!    Computes the maximum absolute value  of X.
 !
 !    normi := max(abs(X(i))  
 !
@@ -252,6 +252,17 @@ function psb_samaxv (x,desc_a, info,global) result(res)
   return
 end function psb_samaxv
 
+!
+! Function: psb_samax_vect
+!    Computes the maximum absolute value  of X.
+!
+!    normi := max(abs(X(i))  
+!
+! Arguments:
+!    x      - type(psb_s_vect_type) The input vector.
+!    desc_a -  type(psb_desc_type).  The communication descriptor.
+!    info   -  integer.              Return code
+!
 
 function psb_samax_vect(x, desc_a, info,global) result(res)
   use psb_penv_mod
@@ -374,17 +385,16 @@ end function psb_samax_vect
 !!$  
 !
 ! Subroutine: psb_samaxvs
-!    Searches the absolute max of X.
+!    Computes the maximum absolute value of X, subroutine version
 !
 !    normi := max(abs(sub(X)(i))  
 !
-!    where sub( X ) denotes X(1:N,JX:).
+!    where sub( X ) denotes X(1:N).
 !
 ! Arguments:
 !    res    -  real                 The result.
-!    x(:,:) -  real              The input vector.
+!    x(:)   -  real              The input vector.
 !    desc_a -  type(psb_desc_type). The communication descriptor.
-!    info   -  integer.             Return code
 !    jx     -  integer(optional).   The column offset.
 !
 subroutine psb_samaxvs(res,x,desc_a, info,global)
diff --git a/base/psblas/psb_sasum.f90 b/base/psblas/psb_sasum.f90
index 2abf254d..a61f1851 100644
--- a/base/psblas/psb_sasum.f90
+++ b/base/psblas/psb_sasum.f90
@@ -135,6 +135,19 @@ function psb_sasum (x,desc_a, info, jx,global) result(res)
   return
 end function psb_sasum
 
+! Function: psb_sasum_vect
+!    Computes norm1 of X
+!
+!    norm1 := sum(sub( X )(i))
+!
+!    where sub( X ) denotes X(1:N,JX:).
+!
+! Arguments:
+!    x      - type(psb_s_vect_type) The input vector.
+!    desc_a -  type(psb_desc_type).   The communication descriptor.
+!    info   -  integer.               Return code
+!    jx     -  integer(optional).     The column offset.
+!
 
 function psb_sasum_vect(x, desc_a, info,global) result(res)
   use psb_base_mod, psb_protect_name => psb_sasum_vect
diff --git a/base/psblas/psb_saxpby.f90 b/base/psblas/psb_saxpby.f90
index 26cecd30..d3f573dc 100644
--- a/base/psblas/psb_saxpby.f90
+++ b/base/psblas/psb_saxpby.f90
@@ -31,6 +31,23 @@
 !    
 ! File: psb_saxpby.f90
 
+!
+! Subroutine: psb_saxpby_vect
+!    Adds one distributed vector to another,
+!
+!    Y := beta * Y + alpha * X
+!
+! Arguments:
+!    alpha  -  real,input        The scalar used to multiply each component of X
+!    x      - type(psb_s_vect_type) The input vector containing the entries of X
+!    beta   -  real,input        The scalar used to multiply each component of Y
+!    y      - type(psb_s_vect_type)  The input/output vector Y
+!    desc_a -  type(psb_desc_type)  The communication descriptor.
+!    info   -  integer              Return code
+!
+!  Note: from a functional point of view, X is input, but here
+!        it's declared INOUT because of the sync() methods. 
+!
 subroutine psb_saxpby_vect(alpha, x, beta, y,&
      & desc_a, info)
   use psb_base_mod, psb_protect_name => psb_saxpby_vect
@@ -269,7 +286,7 @@ end subroutine psb_saxpby
 !!$  
 !
 ! Subroutine: psb_saxpbyv
-!    Adds one distributed matrix to another,
+!    Adds one distributed vector to another,
 !
 !    Y := beta * Y + alpha * X
 !
diff --git a/base/psblas/psb_sdot.f90 b/base/psblas/psb_sdot.f90
index 2c8bea51..cce9e15c 100644
--- a/base/psblas/psb_sdot.f90
+++ b/base/psblas/psb_sdot.f90
@@ -31,22 +31,22 @@
 !    
 ! File: psb_sdot.f90
 !
-! Function: psb_sdot
-!    psb_sdot forms the dot product of two distributed vectors,
+! Function: psb_sdot_vect
+!    psb_sdot computes the dot product of two distributed vectors,
 !
-!    dot := sub( X )**C * sub( Y )
+!    dot := ( X )**C * ( Y )
 !
-!    where sub( X ) denotes X(:,JX)
-!
-!    sub( Y ) denotes Y(:,JY).
 !
 ! Arguments:
-!    x(:,:) -  complex               The input vector containing the entries of sub( X ).
-!    y(:,:) -  complex               The input vector containing the entries of sub( Y ).
+!    x      -  type(psb_s_vect_type) The input vector containing the entries of sub( X ).
+!    y      -  type(psb_s_vect_type) The input vector containing the entries of sub( Y ).
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
-!    jx     -  integer(optional).    The column offset for sub( X ).
-!    jy     -  integer(optional).    The column offset for sub( Y ).
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
+!
+!  Note: from a functional point of view, X and Y are input, but here
+!        they are declared INOUT because of the sync() methods. 
+!
 !
 function psb_sdot_vect(x, y, desc_a,info,global) result(res)
   use psb_desc_mod
@@ -156,7 +156,25 @@ function psb_sdot_vect(x, y, desc_a,info,global) result(res)
   return
 
 end function psb_sdot_vect
-
+!
+! Function: psb_sdot
+!    psb_sdot computes the dot product of two distributed vectors,
+!
+!    dot := sub( X )**C * sub( Y )
+!
+!    where sub( X ) denotes X(:,JX)
+!
+!    sub( Y ) denotes Y(:,JY).
+!
+! Arguments:
+!    x(:,:) -  real                The input vector containing the entries of sub( X ).
+!    y(:,:) -  real                The input vector containing the entries of sub( Y ).
+!    desc_a -  type(psb_desc_type).  The communication descriptor.
+!    info   -  integer.              Return code
+!    jx     -  integer(optional).    The column offset for sub( X ).
+!    jy     -  integer(optional).    The column offset for sub( Y ).
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
+!
 function psb_sdot(x, y,desc_a, info, jx, jy,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_sdot
   implicit none
@@ -298,7 +316,7 @@ end function psb_sdot
 !!$
 ! 
 ! Function: psb_sdotv
-!    psb_sdotv forms the dot product of two distributed vectors,
+!    psb_sdotv computes the dot product of two distributed vectors,
 !
 !    dot := X**C * Y
 !
@@ -307,6 +325,7 @@ end function psb_sdot
 !    y(:)   -  real               The input vector containing the entries of Y.
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
 !
 function psb_sdotv(x, y,desc_a, info,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_sdotv
@@ -432,7 +451,7 @@ end function psb_sdotv
 !!$
 !  
 ! Subroutine: psb_sdotvs
-!    psb_sdotvs forms the dot product of two distributed vectors,
+!    psb_sdotvs computes the dot product of two distributed vectors,
 !
 !    res := X**C * Y
 !
@@ -442,6 +461,7 @@ end function psb_sdotv
 !    y(:)   -  real              The input vector containing the entries of Y.
 !    desc_a -  type(psb_desc_type). The communication descriptor.
 !    info   -  integer.             Return code
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
 !
 subroutine psb_sdotvs(res, x, y,desc_a, info,global)  
   use psb_base_mod, psb_protect_name => psb_sdotvs
@@ -565,7 +585,7 @@ end subroutine psb_sdotvs
 !!$
 !
 ! Subroutine: psb_smdots
-!    psb_smdots forms the dot product of multiple distributed vectors,
+!    psb_smdots computes the dot product of multiple distributed vectors,
 !
 !    res(i) := ( X(:,i) )**C * ( Y(:,i) )
 !
@@ -575,6 +595,7 @@ end subroutine psb_sdotvs
 !    y(:)   -  real              The input vector containing the entries of sub( Y ).
 !    desc_a -  type(psb_desc_type). The communication descriptor.
 !    info   -  integer.             Return code
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
 !
 subroutine psb_smdots(res, x, y, desc_a, info,global)  
   use psb_base_mod, psb_protect_name => psb_smdots
diff --git a/base/psblas/psb_snrm2.f90 b/base/psblas/psb_snrm2.f90
index d182a2cd..35260ef7 100644
--- a/base/psblas/psb_snrm2.f90
+++ b/base/psblas/psb_snrm2.f90
@@ -32,9 +32,9 @@
 ! File: psb_snrm2.f90
 !
 ! Function: psb_snrm2
-!    Forms the norm2 of a distributed vector,
+!    Computes the norm2 of a distributed vector,
 !
-!    norm2 := sqrt ( sub( X )**T * sub( X ) )
+!    norm2 := sqrt ( sub( X )**C * sub( X ) )
 !
 !    where sub( X ) denotes X(:,JX).
 !
@@ -43,6 +43,7 @@
 !    desc_a -  type(psb_desc_type). The communication descriptor.
 !    info   -  integer.             Return code
 !    jx     -  integer(optional).   The column offset for sub( X ).
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_snrm2(x, desc_a, info, jx,global)  result(res)
   use psb_desc_mod
@@ -170,14 +171,15 @@ end function psb_snrm2
 !!$
 !
 ! Function: psb_snrm2
-!    Forms the norm2 of a distributed vector,
+!    Computes the norm2 of a distributed vector,
 !
-!    norm2 := sqrt ( X**T * X)
+!    norm2 := sqrt ( X**C * X)
 !
 ! Arguments:
 !    x(:)   -  real               The input vector containing the entries of X.
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_snrm2v(x, desc_a, info,global)  result(res)
   use psb_desc_mod
@@ -263,7 +265,17 @@ function psb_snrm2v(x, desc_a, info,global)  result(res)
 end function psb_snrm2v
 
 
-
+! Function: psb_snrm2_vect
+!    Computes the norm2 of a distributed vector,
+!
+!    norm2 := sqrt ( X**C * X)
+!
+! Arguments:
+!    x      -  type(psb_s_vect_type) The input vector containing the entries of X.
+!    desc_a -  type(psb_desc_type).  The communication descriptor.
+!    info   -  integer.              Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
+!
 function psb_snrm2_vect(x, desc_a, info,global)  result(res)
   use psb_desc_mod
   use psb_check_mod
@@ -344,7 +356,7 @@ function psb_snrm2_vect(x, desc_a, info,global)  result(res)
         res = res - sqrt(sone - dd*(abs(x%v%v(idx))/res)**2)
       end do
     end if
-    else 	    
+  else 	    
     res = szero
   end if
 
@@ -392,15 +404,16 @@ end function psb_snrm2_vect
 !!$
 !
 ! Subroutine: psb_snrm2vs
-!    Forms the norm2 of a distributed vector,
+!    Computes the norm2 of a distributed vector, subroutine version
 !
-!    norm2 := sqrt ( X**T * X)
+!    norm2 := sqrt ( X**C * X)
 !
 ! Arguments:
 !    res    -  real                  The result.
 !    x(:)   -  real               The input vector containing the entries of X.
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 subroutine psb_snrm2vs(res, x, desc_a, info,global)
   use psb_desc_mod
diff --git a/base/psblas/psb_snrmi.f90 b/base/psblas/psb_snrmi.f90
index 07b88de2..9fc41073 100644
--- a/base/psblas/psb_snrmi.f90
+++ b/base/psblas/psb_snrmi.f90
@@ -32,14 +32,15 @@
 ! File: psb_snrmi.f90
 !
 ! Function: psb_snrmi
-!    Forms the infinity norm of a sparse matrix,                           
+!    Computes the infinity norm of a sparse matrix,                           
 !
-!    nrmi := max_i(abs(sum(A(i,:))))                                              
+!    nrmi := max_i(sum(abs(A(i,:))))                                              
 !
 ! Arguments:
 !    a      -  type(psb_sspmat_type).   The sparse matrix containing A.
 !    desc_a -  type(psb_desc_type).     The communication descriptor.
 !    info   -  integer.                   Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_snrmi(a,desc_a,info,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_snrmi
diff --git a/base/psblas/psb_sspmm.f90 b/base/psblas/psb_sspmm.f90
index a84962dc..56b881b8 100644
--- a/base/psblas/psb_sspmm.f90
+++ b/base/psblas/psb_sspmm.f90
@@ -31,6 +31,235 @@
 !    
 ! File: psb_sspmm.f90
 !
+!
+! Subroutine: psb_sspm_vect
+!     Performs one of the distributed matrix-vector operations
+!
+!     Y := alpha * Pr * A * Pc * X  + beta * Y,  or
+!
+!     Y := alpha * Pr * A' * Pr * X  + beta * Y,
+!
+!  alpha and beta are scalars, X and Y are distributed
+!  vectors and A is a M-by-N distributed matrix.
+!
+! Arguments:   
+!    alpha   -  real                The scalar alpha.
+!    a       -  type(psb_sspmat_type). The sparse matrix containing A.
+!    x       -  type(psb_s_vect_type) The input vector containing the entries of ( X ).
+!    beta    -  real                The scalar beta.
+!    y       -  type(psb_s_vect_type) The input vector containing the entries of ( Y ).
+!    desc_a  -  type(psb_desc_type).   The communication descriptor.
+!    info    -  integer.               Return code
+!    trans   -  character(optional).   Whether A or A'. Default:  'N' 
+!    work(:) -  real,(optional).    Working area.
+!    doswap  -  logical(optional).     Whether to performe halo updates.
+! 
+subroutine  psb_sspmv_vect(alpha,a,x,beta,y,desc_a,info,&
+     & trans, work, doswap)   
+  use psb_base_mod, psb_protect_name => psb_sspmv_vect
+  use psi_mod
+  implicit none
+
+  real(psb_spk_), intent(in)            :: alpha, beta
+  type(psb_s_vect_type), intent(inout)     :: x
+  type(psb_s_vect_type), intent(inout)     :: y
+  type(psb_sspmat_type), intent(in)        :: a
+  type(psb_desc_type), intent(in)          :: desc_a
+  integer(psb_ipk_), intent(out)                     :: info
+  real(psb_spk_), optional, target, intent(inout) :: work(:)
+  character, intent(in), optional          :: trans
+  logical, intent(in), optional            :: doswap
+
+  ! locals
+  integer(psb_ipk_) :: ictxt, np, me,&
+       & err_act, iix, jjx, iia, jja,  nrow, ncol, lldx, lldy, &
+       & liwork, iiy, jjy, ib, ip, idx
+  integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n, ia, ja
+  integer(psb_ipk_), parameter       :: nb=4
+  real(psb_spk_), pointer :: iwork(:), xp(:), yp(:)
+  real(psb_spk_), allocatable :: xvsave(:)
+  character                :: trans_
+  character(len=20)        :: name, ch_err
+  logical                  :: aliw, doswap_
+  integer(psb_ipk_) :: debug_level, debug_unit
+
+  name='psb_sspmv'
+  info=psb_success_
+  call psb_erractionsave(err_act)
+  if  (psb_errstatus_fatal()) then
+    info = psb_err_internal_error_ ;    goto 9999
+  end if
+  debug_unit  = psb_get_debug_unit()
+  debug_level = psb_get_debug_level()
+
+  ictxt=desc_a%get_context()
+  call psb_info(ictxt, me, np)
+  if (np == -1) then
+    info = psb_err_context_error_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  if (.not.allocated(x%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+  if (.not.allocated(y%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  if (present(doswap)) then
+    doswap_ = doswap
+  else
+    doswap_ = .true.
+  endif
+
+  if (present(trans)) then     
+    trans_ = psb_toupper(trans)
+  else
+    trans_ = 'N'
+  endif
+  if ( (trans_ == 'N').or.(trans_ == 'T')&
+       & .or.(trans_ == 'C')) then
+  else
+    info = psb_err_iarg_invalid_value_
+    call psb_errpush(info,name)
+    goto 9999
+  end if
+
+  m    = desc_a%get_global_rows()
+  n    = desc_a%get_global_cols()
+  nrow = desc_a%get_local_rows()
+  ncol = desc_a%get_local_cols()
+  lldx = x%get_nrows()
+  lldy = y%get_nrows()
+  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
+  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
+
+  if (psb_errstatus_fatal()) then 
+    info=psb_err_from_subroutine_
+    ch_err='reall'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  iwork => null()
+  ! check for presence/size of a work area
+  liwork= 2*ncol
+
+  if (present(work)) then
+    if (size(work) >= liwork) then
+      aliw =.false.
+    else
+      aliw=.true.
+    endif
+  else
+    aliw=.true.
+  end if
+
+  if (aliw) then
+    allocate(iwork(liwork),stat=info)
+    if(info /= psb_success_) then
+      info=psb_err_from_subroutine_
+      ch_err='Allocate'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+    end if
+  else
+    iwork => work
+  endif
+
+  if (debug_level >= psb_debug_comp_) &
+       & write(debug_unit,*) me,' ',trim(name),' Allocated work ', info
+
+  if (trans_ == 'N') then
+    !  Matrix is not transposed
+
+    if (doswap_) then
+      call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
+           & szero,x%v,desc_a,iwork,info,data=psb_comm_halo_)
+    end if
+
+    call psb_csmm(alpha,a,x,beta,y,info)
+
+    if(info /= psb_success_) then
+      info = psb_err_from_subroutine_non_
+      call psb_errpush(info,name)
+      goto 9999
+    end if
+
+  else
+    !  Matrix is transposed
+
+    !
+    ! Non-empty overlap, need a buffer to hold
+    ! the entries updated with average operator.
+    ! Why the average? because in this way they will contribute
+    ! with a proper scale factor (1/np) to the overall product.
+    ! 
+    call psi_ovrl_save(x%v,xvsave,desc_a,info)
+    if (info == psb_success_) call psi_ovrl_upd(x%v,desc_a,psb_avg_,info)
+
+    if (beta /= szero) call y%set(szero,nrow+1,ncol)
+    !  local Matrix-vector product
+    if (info == psb_success_) call psb_csmm(alpha,a,x,beta,y,info,trans=trans_)
+
+    if (debug_level >= psb_debug_comp_) &
+         & write(debug_unit,*) me,' ',trim(name),' csmm ', info
+
+    if (info == psb_success_) call psi_ovrl_restore(x%v,xvsave,desc_a,info)
+    if (info /= psb_success_) then
+      info = psb_err_from_subroutine_
+      ch_err='psb_csmm'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+    end if
+
+    if (doswap_) then
+      call psi_swaptran(ior(psb_swap_send_,psb_swap_recv_),&
+           & sone,y%v,desc_a,iwork,info)
+      if (info == psb_success_) call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
+           & sone,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
+
+      if (debug_level >= psb_debug_comp_) &
+           & write(debug_unit,*) me,' ',trim(name),' swaptran ', info
+      if(info /= psb_success_) then
+        info = psb_err_from_subroutine_
+        ch_err='PSI_SwapTran'
+        call psb_errpush(info,name,a_err=ch_err)
+        goto 9999
+      end if
+    end if
+
+  end if
+
+  if (aliw) deallocate(iwork,stat=info)
+  if (debug_level >= psb_debug_comp_) &
+       & write(debug_unit,*) me,' ',trim(name),' deallocat ',aliw, info
+  if(info /= psb_success_) then
+    info = psb_err_from_subroutine_
+    ch_err='Deallocate iwork'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  nullify(iwork)
+
+  call psb_erractionrestore(err_act)
+  if (debug_level >= psb_debug_comp_) then 
+    call psb_barrier(ictxt)
+    write(debug_unit,*) me,' ',trim(name),' Returning '
+  endif
+  return  
+
+9999 call psb_error_handler(ictxt,err_act)
+
+  return
+end subroutine psb_sspmv_vect
+!
 ! Subroutine: psb_sspmm
 !     Performs one of the distributed matrix-vector operations
 !
@@ -356,10 +585,6 @@ subroutine  psb_sspmm(alpha,a,x,beta,y,desc_a,info,&
 
   return
 end subroutine psb_sspmm
-
-
-
-
 !!$ 
 !!$              Parallel Sparse BLAS  version 3.5
 !!$    (C) Copyright 2006-2018
@@ -670,211 +895,3 @@ subroutine  psb_sspmv(alpha,a,x,beta,y,desc_a,info,&
 
   return
 end subroutine psb_sspmv
-
-
-
-subroutine  psb_sspmv_vect(alpha,a,x,beta,y,desc_a,info,&
-     & trans, work, doswap)   
-  use psb_base_mod, psb_protect_name => psb_sspmv_vect
-  use psi_mod
-  implicit none
-
-  real(psb_spk_), intent(in)            :: alpha, beta
-  type(psb_s_vect_type), intent(inout)     :: x
-  type(psb_s_vect_type), intent(inout)     :: y
-  type(psb_sspmat_type), intent(in)        :: a
-  type(psb_desc_type), intent(in)          :: desc_a
-  integer(psb_ipk_), intent(out)                     :: info
-  real(psb_spk_), optional, target, intent(inout) :: work(:)
-  character, intent(in), optional          :: trans
-  logical, intent(in), optional            :: doswap
-
-  ! locals
-  integer(psb_ipk_) :: ictxt, np, me,&
-       & err_act, iix, jjx, iia, jja,  nrow, ncol, lldx, lldy, &
-       & liwork, iiy, jjy, ib, ip, idx
-  integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n, ia, ja
-  integer(psb_ipk_), parameter       :: nb=4
-  real(psb_spk_), pointer :: iwork(:), xp(:), yp(:)
-  real(psb_spk_), allocatable :: xvsave(:)
-  character                :: trans_
-  character(len=20)        :: name, ch_err
-  logical                  :: aliw, doswap_
-  integer(psb_ipk_) :: debug_level, debug_unit
-
-  name='psb_sspmv'
-  info=psb_success_
-  call psb_erractionsave(err_act)
-  if  (psb_errstatus_fatal()) then
-    info = psb_err_internal_error_ ;    goto 9999
-  end if
-  debug_unit  = psb_get_debug_unit()
-  debug_level = psb_get_debug_level()
-
-  ictxt=desc_a%get_context()
-  call psb_info(ictxt, me, np)
-  if (np == -1) then
-    info = psb_err_context_error_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (.not.allocated(x%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-  if (.not.allocated(y%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (present(doswap)) then
-    doswap_ = doswap
-  else
-    doswap_ = .true.
-  endif
-
-  if (present(trans)) then     
-    trans_ = psb_toupper(trans)
-  else
-    trans_ = 'N'
-  endif
-  if ( (trans_ == 'N').or.(trans_ == 'T')&
-       & .or.(trans_ == 'C')) then
-  else
-    info = psb_err_iarg_invalid_value_
-    call psb_errpush(info,name)
-    goto 9999
-  end if
-
-  m    = desc_a%get_global_rows()
-  n    = desc_a%get_global_cols()
-  nrow = desc_a%get_local_rows()
-  ncol = desc_a%get_local_cols()
-  lldx = x%get_nrows()
-  lldy = y%get_nrows()
-  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
-  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
-
-  if (psb_errstatus_fatal()) then 
-    info=psb_err_from_subroutine_
-    ch_err='reall'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  iwork => null()
-  ! check for presence/size of a work area
-  liwork= 2*ncol
-
-  if (present(work)) then
-    if (size(work) >= liwork) then
-      aliw =.false.
-    else
-      aliw=.true.
-    endif
-  else
-    aliw=.true.
-  end if
-
-  if (aliw) then
-    allocate(iwork(liwork),stat=info)
-    if(info /= psb_success_) then
-      info=psb_err_from_subroutine_
-      ch_err='Allocate'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-    end if
-  else
-    iwork => work
-  endif
-
-  if (debug_level >= psb_debug_comp_) &
-       & write(debug_unit,*) me,' ',trim(name),' Allocated work ', info
-
-  if (trans_ == 'N') then
-    !  Matrix is not transposed
-
-    if (doswap_) then
-      call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
-           & szero,x%v,desc_a,iwork,info,data=psb_comm_halo_)
-    end if
-
-    call psb_csmm(alpha,a,x,beta,y,info)
-
-    if(info /= psb_success_) then
-      info = psb_err_from_subroutine_non_
-      call psb_errpush(info,name)
-      goto 9999
-    end if
-
-  else
-    !  Matrix is transposed
-
-    !
-    ! Non-empty overlap, need a buffer to hold
-    ! the entries updated with average operator.
-    ! Why the average? because in this way they will contribute
-    ! with a proper scale factor (1/np) to the overall product.
-    ! 
-    call psi_ovrl_save(x%v,xvsave,desc_a,info)
-    if (info == psb_success_) call psi_ovrl_upd(x%v,desc_a,psb_avg_,info)
-
-    if (beta /= szero) call y%set(szero,nrow+1,ncol)
-    !  local Matrix-vector product
-    if (info == psb_success_) call psb_csmm(alpha,a,x,beta,y,info,trans=trans_)
-
-    if (debug_level >= psb_debug_comp_) &
-         & write(debug_unit,*) me,' ',trim(name),' csmm ', info
-
-    if (info == psb_success_) call psi_ovrl_restore(x%v,xvsave,desc_a,info)
-    if (info /= psb_success_) then
-      info = psb_err_from_subroutine_
-      ch_err='psb_csmm'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-    end if
-
-    if (doswap_) then
-      call psi_swaptran(ior(psb_swap_send_,psb_swap_recv_),&
-           & sone,y%v,desc_a,iwork,info)
-      if (info == psb_success_) call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
-           & sone,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
-
-      if (debug_level >= psb_debug_comp_) &
-           & write(debug_unit,*) me,' ',trim(name),' swaptran ', info
-      if(info /= psb_success_) then
-        info = psb_err_from_subroutine_
-        ch_err='PSI_SwapTran'
-        call psb_errpush(info,name,a_err=ch_err)
-        goto 9999
-      end if
-    end if
-
-  end if
-
-  if (aliw) deallocate(iwork,stat=info)
-  if (debug_level >= psb_debug_comp_) &
-       & write(debug_unit,*) me,' ',trim(name),' deallocat ',aliw, info
-  if(info /= psb_success_) then
-    info = psb_err_from_subroutine_
-    ch_err='Deallocate iwork'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  nullify(iwork)
-
-  call psb_erractionrestore(err_act)
-  if (debug_level >= psb_debug_comp_) then 
-    call psb_barrier(ictxt)
-    write(debug_unit,*) me,' ',trim(name),' Returning '
-  endif
-  return  
-
-9999 call psb_error_handler(ictxt,err_act)
-
-  return
-end subroutine psb_sspmv_vect
diff --git a/base/psblas/psb_sspnrm1.f90 b/base/psblas/psb_sspnrm1.f90
index 391d60ee..9d2afeb8 100644
--- a/base/psblas/psb_sspnrm1.f90
+++ b/base/psblas/psb_sspnrm1.f90
@@ -40,6 +40,7 @@
 !    a      -  type(psb_sspmat_type).   The sparse matrix containing A.
 !    desc_a -  type(psb_desc_type).     The communication descriptor.
 !    info   -  integer.                   Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_sspnrm1(a,desc_a,info,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_sspnrm1
diff --git a/base/psblas/psb_sspsm.f90 b/base/psblas/psb_sspsm.f90
index 8bcfdcad..1cbbd6d2 100644
--- a/base/psblas/psb_sspsm.f90
+++ b/base/psblas/psb_sspsm.f90
@@ -31,6 +31,203 @@
 !    
 ! File: psb_sspsm.f90
 !
+!
+! Subroutine: psb_ssps_vect
+!  Performs one of the distributed matrix-vector operations
+!
+!     Y := alpha * Pr * A-1 * Pc * X + beta * Y,   or
+!
+!     Y := alpha * D * Pr * A-1 * Pc * X + beta * Y, or
+!
+!     Y := alpha * Pr * A-1 * Pc * D * X + beta * Y,  or
+!
+!     Y := alpha * Pr * A-T * Pc * X + beta * Y,   or
+!
+!     Y := alpha * D * Pr * A-T * Pc * X + beta * Y, or
+!
+!     Y := alpha * Pr * A-T * Pc * D * X + beta * Y,  or
+!
+!  X is a distributed
+!  vector and T is a M-by-M distributed triangular matrix.
+!
+!
+! Arguments:   
+!    alpha   -  real.               The scalar alpha.
+!    a       -  type(psb_sspmat_type). The sparse matrix containing A.
+!    x       -  type(psb_s_vect_type) The input vector containing the entries of ( X ).
+!    beta    -  real                The scalar beta.
+!    y       -  type(psb_s_vect_type) The input vector containing the entries of ( Y ).
+!    desc_a  -  type(psb_desc_type).   The communication descriptor.
+!    info    -  integer.               Return code
+!    trans   -  character(optional).   Whether A or A'. If not present 'N' is assumed.
+!    scale   -  character(optional).   Specify some type of operation with
+!                                      the diagonal matrix D.
+!    choice  -  integer(optional).     The kind of update to perform on overlap elements.
+!    d(:)    -  real, optional      Matrix for diagonal scaling.
+!    work(:) -  real, optional      Working area.
+! 
+subroutine  psb_sspsv_vect(alpha,a,x,beta,y,desc_a,info,&
+     & trans, scale, choice, diag, work)   
+  use psb_base_mod, psb_protect_name => psb_sspsv_vect
+  use psi_mod
+  implicit none 
+
+  real(psb_spk_), intent(in)           :: alpha, beta
+  type(psb_s_vect_type), intent(inout)    :: x
+  type(psb_s_vect_type), intent(inout)    :: y
+  type(psb_sspmat_type), intent(inout)    :: a
+  type(psb_desc_type), intent(in)         :: desc_a
+  integer(psb_ipk_), intent(out)                    :: info
+  type(psb_s_vect_type), intent(inout), optional  :: diag
+  real(psb_spk_), optional, target, intent(inout) :: work(:)
+  character, intent(in), optional         :: trans, scale
+  integer(psb_ipk_), intent(in), optional           :: choice
+
+  ! locals
+  integer(psb_ipk_) :: ictxt, np, me, &
+       & err_act, iix, jjx, ia, ja, iia, jja, lldx,lldy, choice_,&
+       & ix, iy, ik, jx, jy, i, lld,&
+       & m, nrow, ncol, liwork, llwork, iiy, jjy, idx, ndm
+
+  character                :: lscale
+  integer(psb_ipk_), parameter       :: nb=4
+  real(psb_spk_),pointer :: iwork(:), xp(:), yp(:)
+  character                :: itrans
+  character(len=20)        :: name, ch_err
+  logical                  :: aliw
+
+  name='psb_sspsv_vect'
+  info=psb_success_
+  call psb_erractionsave(err_act)
+  if  (psb_errstatus_fatal()) then
+    info = psb_err_internal_error_ ;    goto 9999
+  end if
+
+  ictxt=desc_a%get_context()
+
+  call psb_info(ictxt, me, np)
+  if (np == -1) then
+    info = psb_err_context_error_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  if (.not.allocated(x%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+  if (.not.allocated(y%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+
+  if (present(choice)) then     
+    choice_ = choice
+  else
+    choice_ = psb_avg_
+  endif
+
+  if (present(scale)) then     
+    lscale = psb_toupper(scale)
+  else
+    lscale = 'U'
+  endif
+
+  if (present(trans)) then     
+    itrans = psb_toupper(trans)
+    if((itrans == 'N').or.(itrans == 'T').or.(itrans == 'C')) then
+      ! Ok
+    else
+      info = psb_err_iarg_invalid_value_
+      call psb_errpush(info,name)
+      goto 9999
+    end if
+  else
+    itrans = 'N'
+  endif
+
+  m    = desc_a%get_global_rows()
+  nrow = desc_a%get_local_rows()
+  ncol = desc_a%get_local_cols()
+  lldx = x%get_nrows()
+  lldy = y%get_nrows()
+  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
+  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
+
+  if (psb_errstatus_fatal()) then 
+    info=psb_err_from_subroutine_
+    ch_err='reall'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  iwork => null()
+  ! check for presence/size of a work area
+  liwork= 2*ncol
+
+  if (present(work)) then     
+    if (size(work) >= liwork) then 
+      aliw =.false.
+    else 
+      aliw=.true.
+    endif
+  else
+    aliw=.true.
+  end if
+
+  if (aliw) then 
+    allocate(iwork(liwork),stat=info)
+    if(info /= psb_success_) then
+      info=psb_err_from_subroutine_
+      ch_err='psb_realloc'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+    end if
+  else
+    iwork => work
+  endif
+
+  iwork(1)=0.d0
+
+  ! Perform local triangular system solve
+  if (present(diag)) then 
+    call a%spsm(alpha,x,beta,y,info,scale=scale,d=diag,trans=trans)    
+  else
+    call a%spsm(alpha,x,beta,y,info,scale=scale,trans=trans)    
+  end if
+  if(info /= psb_success_) then
+    info = psb_err_from_subroutine_
+    ch_err='dcssm'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  ! update overlap elements
+  if (choice_ > 0) then
+    call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
+         & sone,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
+
+
+    if (info == psb_success_) call psi_ovrl_upd(y%v,desc_a,choice_,info)
+    if (info /= psb_success_) then
+      call psb_errpush(psb_err_from_subroutine_,name,a_err='Inner updates')
+      goto 9999
+    end if
+  end if
+
+  if (aliw) deallocate(iwork)
+
+  call psb_erractionrestore(err_act)
+  return  
+
+9999 call psb_error_handler(ictxt,err_act)
+
+  return
+end subroutine psb_sspsv_vect
+!
 ! Subroutine: psb_sspsm
 !  Performs one of the distributed matrix-vector operations
 !
@@ -283,38 +480,6 @@ subroutine  psb_sspsm(alpha,a,x,beta,y,desc_a,info,&
 
   return
 end subroutine psb_sspsm
-
-!!$ 
-!!$              Parallel Sparse BLAS  version 3.5
-!!$    (C) Copyright 2006-2018
-!!$                       Salvatore Filippone    University of Rome Tor Vergata
-!!$                       Alfredo Buttari      
-!!$ 
-!!$  Redistribution and use in source and binary forms, with or without
-!!$  modification, are permitted provided that the following conditions
-!!$  are met:
-!!$    1. Redistributions of source code must retain the above copyright
-!!$       notice, this list of conditions and the following disclaimer.
-!!$    2. Redistributions in binary form must reproduce the above copyright
-!!$       notice, this list of conditions, and the following disclaimer in the
-!!$       documentation and/or other materials provided with the distribution.
-!!$    3. The name of the PSBLAS group or the names of its contributors may
-!!$       not be used to endorse or promote products derived from this
-!!$       software without specific written permission.
-!!$ 
-!!$  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-!!$  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-!!$  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-!!$  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
-!!$  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-!!$  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-!!$  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-!!$  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-!!$  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-!!$  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-!!$  POSSIBILITY OF SUCH DAMAGE.
-!!$ 
-!!$
 !
 ! Subroutine: psb_sspsv
 !  Performs one of the distributed matrix-vector operations
@@ -545,166 +710,3 @@ subroutine  psb_sspsv(alpha,a,x,beta,y,desc_a,info,&
   return
 end subroutine psb_sspsv
 
-
-subroutine  psb_sspsv_vect(alpha,a,x,beta,y,desc_a,info,&
-     & trans, scale, choice, diag, work)   
-  use psb_base_mod, psb_protect_name => psb_sspsv_vect
-  use psi_mod
-  implicit none 
-
-  real(psb_spk_), intent(in)           :: alpha, beta
-  type(psb_s_vect_type), intent(inout)    :: x
-  type(psb_s_vect_type), intent(inout)    :: y
-  type(psb_sspmat_type), intent(inout)    :: a
-  type(psb_desc_type), intent(in)         :: desc_a
-  integer(psb_ipk_), intent(out)                    :: info
-  type(psb_s_vect_type), intent(inout), optional  :: diag
-  real(psb_spk_), optional, target, intent(inout) :: work(:)
-  character, intent(in), optional         :: trans, scale
-  integer(psb_ipk_), intent(in), optional           :: choice
-
-  ! locals
-  integer(psb_ipk_) :: ictxt, np, me, &
-       & err_act, iix, jjx, ia, ja, iia, jja, lldx,lldy, choice_,&
-       & ix, iy, ik, jx, jy, i, lld,&
-       & m, nrow, ncol, liwork, llwork, iiy, jjy, idx, ndm
-
-  character                :: lscale
-  integer(psb_ipk_), parameter       :: nb=4
-  real(psb_spk_),pointer :: iwork(:), xp(:), yp(:)
-  character                :: itrans
-  character(len=20)        :: name, ch_err
-  logical                  :: aliw
-
-  name='psb_sspsv'
-  info=psb_success_
-  call psb_erractionsave(err_act)
-  if  (psb_errstatus_fatal()) then
-    info = psb_err_internal_error_ ;    goto 9999
-  end if
-
-  ictxt=desc_a%get_context()
-
-  call psb_info(ictxt, me, np)
-  if (np == -1) then
-    info = psb_err_context_error_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (.not.allocated(x%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-  if (.not.allocated(y%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-
-  if (present(choice)) then     
-    choice_ = choice
-  else
-    choice_ = psb_avg_
-  endif
-
-  if (present(scale)) then     
-    lscale = psb_toupper(scale)
-  else
-    lscale = 'U'
-  endif
-
-  if (present(trans)) then     
-    itrans = psb_toupper(trans)
-    if((itrans == 'N').or.(itrans == 'T').or.(itrans == 'C')) then
-      ! Ok
-    else
-      info = psb_err_iarg_invalid_value_
-      call psb_errpush(info,name)
-      goto 9999
-    end if
-  else
-    itrans = 'N'
-  endif
-
-  m    = desc_a%get_global_rows()
-  nrow = desc_a%get_local_rows()
-  ncol = desc_a%get_local_cols()
-  lldx = x%get_nrows()
-  lldy = y%get_nrows()
-  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
-  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
-
-  if (psb_errstatus_fatal()) then 
-    info=psb_err_from_subroutine_
-    ch_err='reall'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  iwork => null()
-  ! check for presence/size of a work area
-  liwork= 2*ncol
-
-  if (present(work)) then     
-    if (size(work) >= liwork) then 
-      aliw =.false.
-    else 
-      aliw=.true.
-    endif
-  else
-    aliw=.true.
-  end if
-
-  if (aliw) then 
-    allocate(iwork(liwork),stat=info)
-    if(info /= psb_success_) then
-      info=psb_err_from_subroutine_
-      ch_err='psb_realloc'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-    end if
-  else
-    iwork => work
-  endif
-
-  iwork(1)=0.d0
-
-  ! Perform local triangular system solve
-  if (present(diag)) then 
-    call a%spsm(alpha,x,beta,y,info,scale=scale,d=diag,trans=trans)    
-  else
-    call a%spsm(alpha,x,beta,y,info,scale=scale,trans=trans)    
-  end if
-  if(info /= psb_success_) then
-    info = psb_err_from_subroutine_
-    ch_err='dcssm'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  ! update overlap elements
-  if (choice_ > 0) then
-    call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
-         & sone,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
-
-
-    if (info == psb_success_) call psi_ovrl_upd(y%v,desc_a,choice_,info)
-    if (info /= psb_success_) then
-      call psb_errpush(psb_err_from_subroutine_,name,a_err='Inner updates')
-      goto 9999
-    end if
-  end if
-
-  if (aliw) deallocate(iwork)
-
-  call psb_erractionrestore(err_act)
-  return  
-
-9999 call psb_error_handler(ictxt,err_act)
-
-  return
-end subroutine psb_sspsv_vect
-
diff --git a/base/psblas/psb_zamax.f90 b/base/psblas/psb_zamax.f90
index 21a82b39..5e768023 100644
--- a/base/psblas/psb_zamax.f90
+++ b/base/psblas/psb_zamax.f90
@@ -32,7 +32,7 @@
 ! File: psb_zamax.f90
 !
 ! Function: psb_zamax
-!    Searches the absolute max of X.
+!    Computes the maximum absolute value of X
 !
 !    normi := max(abs(sub(X)(i))  
 !
@@ -164,7 +164,7 @@ end function psb_zamax
 !!$
 !
 ! Function: psb_zamaxv
-!    Searches the absolute max of X.
+!    Computes the maximum absolute value  of X.
 !
 !    normi := max(abs(X(i))  
 !
@@ -252,6 +252,17 @@ function psb_zamaxv (x,desc_a, info,global) result(res)
   return
 end function psb_zamaxv
 
+!
+! Function: psb_zamax_vect
+!    Computes the maximum absolute value  of X.
+!
+!    normi := max(abs(X(i))  
+!
+! Arguments:
+!    x      - type(psb_z_vect_type) The input vector.
+!    desc_a -  type(psb_desc_type).  The communication descriptor.
+!    info   -  integer.              Return code
+!
 
 function psb_zamax_vect(x, desc_a, info,global) result(res)
   use psb_penv_mod
@@ -374,17 +385,16 @@ end function psb_zamax_vect
 !!$  
 !
 ! Subroutine: psb_zamaxvs
-!    Searches the absolute max of X.
+!    Computes the maximum absolute value of X, subroutine version
 !
 !    normi := max(abs(sub(X)(i))  
 !
-!    where sub( X ) denotes X(1:N,JX:).
+!    where sub( X ) denotes X(1:N).
 !
 ! Arguments:
 !    res    -  real                 The result.
-!    x(:,:) -  complex              The input vector.
+!    x(:)   -  complex              The input vector.
 !    desc_a -  type(psb_desc_type). The communication descriptor.
-!    info   -  integer.             Return code
 !    jx     -  integer(optional).   The column offset.
 !
 subroutine psb_zamaxvs(res,x,desc_a, info,global)
diff --git a/base/psblas/psb_zasum.f90 b/base/psblas/psb_zasum.f90
index 6bfd01ca..9b6fddd7 100644
--- a/base/psblas/psb_zasum.f90
+++ b/base/psblas/psb_zasum.f90
@@ -135,6 +135,19 @@ function psb_zasum (x,desc_a, info, jx,global) result(res)
   return
 end function psb_zasum
 
+! Function: psb_zasum_vect
+!    Computes norm1 of X
+!
+!    norm1 := sum(sub( X )(i))
+!
+!    where sub( X ) denotes X(1:N,JX:).
+!
+! Arguments:
+!    x      - type(psb_z_vect_type) The input vector.
+!    desc_a -  type(psb_desc_type).   The communication descriptor.
+!    info   -  integer.               Return code
+!    jx     -  integer(optional).     The column offset.
+!
 
 function psb_zasum_vect(x, desc_a, info,global) result(res)
   use psb_base_mod, psb_protect_name => psb_zasum_vect
diff --git a/base/psblas/psb_zaxpby.f90 b/base/psblas/psb_zaxpby.f90
index 273aa882..b48bb5ff 100644
--- a/base/psblas/psb_zaxpby.f90
+++ b/base/psblas/psb_zaxpby.f90
@@ -31,6 +31,23 @@
 !    
 ! File: psb_zaxpby.f90
 
+!
+! Subroutine: psb_zaxpby_vect
+!    Adds one distributed vector to another,
+!
+!    Y := beta * Y + alpha * X
+!
+! Arguments:
+!    alpha  -  complex,input        The scalar used to multiply each component of X
+!    x      - type(psb_z_vect_type) The input vector containing the entries of X
+!    beta   -  complex,input        The scalar used to multiply each component of Y
+!    y      - type(psb_z_vect_type)  The input/output vector Y
+!    desc_a -  type(psb_desc_type)  The communication descriptor.
+!    info   -  integer              Return code
+!
+!  Note: from a functional point of view, X is input, but here
+!        it's declared INOUT because of the sync() methods. 
+!
 subroutine psb_zaxpby_vect(alpha, x, beta, y,&
      & desc_a, info)
   use psb_base_mod, psb_protect_name => psb_zaxpby_vect
@@ -269,7 +286,7 @@ end subroutine psb_zaxpby
 !!$  
 !
 ! Subroutine: psb_zaxpbyv
-!    Adds one distributed matrix to another,
+!    Adds one distributed vector to another,
 !
 !    Y := beta * Y + alpha * X
 !
diff --git a/base/psblas/psb_zdot.f90 b/base/psblas/psb_zdot.f90
index 8f673c02..ad21b1d8 100644
--- a/base/psblas/psb_zdot.f90
+++ b/base/psblas/psb_zdot.f90
@@ -31,22 +31,22 @@
 !    
 ! File: psb_zdot.f90
 !
-! Function: psb_zdot
-!    psb_zdot forms the dot product of two distributed vectors,
+! Function: psb_zdot_vect
+!    psb_zdot computes the dot product of two distributed vectors,
 !
-!    dot := sub( X )**C * sub( Y )
+!    dot := ( X )**C * ( Y )
 !
-!    where sub( X ) denotes X(:,JX)
-!
-!    sub( Y ) denotes Y(:,JY).
 !
 ! Arguments:
-!    x(:,:) -  complex               The input vector containing the entries of sub( X ).
-!    y(:,:) -  complex               The input vector containing the entries of sub( Y ).
+!    x      -  type(psb_z_vect_type) The input vector containing the entries of sub( X ).
+!    y      -  type(psb_z_vect_type) The input vector containing the entries of sub( Y ).
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
-!    jx     -  integer(optional).    The column offset for sub( X ).
-!    jy     -  integer(optional).    The column offset for sub( Y ).
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
+!
+!  Note: from a functional point of view, X and Y are input, but here
+!        they are declared INOUT because of the sync() methods. 
+!
 !
 function psb_zdot_vect(x, y, desc_a,info,global) result(res)
   use psb_desc_mod
@@ -156,7 +156,25 @@ function psb_zdot_vect(x, y, desc_a,info,global) result(res)
   return
 
 end function psb_zdot_vect
-
+!
+! Function: psb_zdot
+!    psb_zdot computes the dot product of two distributed vectors,
+!
+!    dot := sub( X )**C * sub( Y )
+!
+!    where sub( X ) denotes X(:,JX)
+!
+!    sub( Y ) denotes Y(:,JY).
+!
+! Arguments:
+!    x(:,:) -  complex                The input vector containing the entries of sub( X ).
+!    y(:,:) -  complex                The input vector containing the entries of sub( Y ).
+!    desc_a -  type(psb_desc_type).  The communication descriptor.
+!    info   -  integer.              Return code
+!    jx     -  integer(optional).    The column offset for sub( X ).
+!    jy     -  integer(optional).    The column offset for sub( Y ).
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
+!
 function psb_zdot(x, y,desc_a, info, jx, jy,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_zdot
   implicit none
@@ -298,7 +316,7 @@ end function psb_zdot
 !!$
 ! 
 ! Function: psb_zdotv
-!    psb_zdotv forms the dot product of two distributed vectors,
+!    psb_zdotv computes the dot product of two distributed vectors,
 !
 !    dot := X**C * Y
 !
@@ -307,6 +325,7 @@ end function psb_zdot
 !    y(:)   -  complex               The input vector containing the entries of Y.
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
 !
 function psb_zdotv(x, y,desc_a, info,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_zdotv
@@ -432,7 +451,7 @@ end function psb_zdotv
 !!$
 !  
 ! Subroutine: psb_zdotvs
-!    psb_zdotvs forms the dot product of two distributed vectors,
+!    psb_zdotvs computes the dot product of two distributed vectors,
 !
 !    res := X**C * Y
 !
@@ -442,6 +461,7 @@ end function psb_zdotv
 !    y(:)   -  complex              The input vector containing the entries of Y.
 !    desc_a -  type(psb_desc_type). The communication descriptor.
 !    info   -  integer.             Return code
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
 !
 subroutine psb_zdotvs(res, x, y,desc_a, info,global)  
   use psb_base_mod, psb_protect_name => psb_zdotvs
@@ -565,7 +585,7 @@ end subroutine psb_zdotvs
 !!$
 !
 ! Subroutine: psb_zmdots
-!    psb_zmdots forms the dot product of multiple distributed vectors,
+!    psb_zmdots computes the dot product of multiple distributed vectors,
 !
 !    res(i) := ( X(:,i) )**C * ( Y(:,i) )
 !
@@ -575,6 +595,7 @@ end subroutine psb_zdotvs
 !    y(:)   -  complex              The input vector containing the entries of sub( Y ).
 !    desc_a -  type(psb_desc_type). The communication descriptor.
 !    info   -  integer.             Return code
+!    global -  logical(optional)     Whether to perform the global sum, default: .true.
 !
 subroutine psb_zmdots(res, x, y, desc_a, info,global)  
   use psb_base_mod, psb_protect_name => psb_zmdots
diff --git a/base/psblas/psb_znrm2.f90 b/base/psblas/psb_znrm2.f90
index 1e1cac26..9f327773 100644
--- a/base/psblas/psb_znrm2.f90
+++ b/base/psblas/psb_znrm2.f90
@@ -32,9 +32,9 @@
 ! File: psb_znrm2.f90
 !
 ! Function: psb_znrm2
-!    Forms the norm2 of a distributed vector,
+!    Computes the norm2 of a distributed vector,
 !
-!    norm2 := sqrt ( sub( X )**T * sub( X ) )
+!    norm2 := sqrt ( sub( X )**C * sub( X ) )
 !
 !    where sub( X ) denotes X(:,JX).
 !
@@ -43,6 +43,7 @@
 !    desc_a -  type(psb_desc_type). The communication descriptor.
 !    info   -  integer.             Return code
 !    jx     -  integer(optional).   The column offset for sub( X ).
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_znrm2(x, desc_a, info, jx,global)  result(res)
   use psb_desc_mod
@@ -170,14 +171,15 @@ end function psb_znrm2
 !!$
 !
 ! Function: psb_znrm2
-!    Forms the norm2 of a distributed vector,
+!    Computes the norm2 of a distributed vector,
 !
-!    norm2 := sqrt ( X**T * X)
+!    norm2 := sqrt ( X**C * X)
 !
 ! Arguments:
 !    x(:)   -  complex               The input vector containing the entries of X.
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_znrm2v(x, desc_a, info,global)  result(res)
   use psb_desc_mod
@@ -263,7 +265,17 @@ function psb_znrm2v(x, desc_a, info,global)  result(res)
 end function psb_znrm2v
 
 
-
+! Function: psb_znrm2_vect
+!    Computes the norm2 of a distributed vector,
+!
+!    norm2 := sqrt ( X**C * X)
+!
+! Arguments:
+!    x      -  type(psb_z_vect_type) The input vector containing the entries of X.
+!    desc_a -  type(psb_desc_type).  The communication descriptor.
+!    info   -  integer.              Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
+!
 function psb_znrm2_vect(x, desc_a, info,global)  result(res)
   use psb_desc_mod
   use psb_check_mod
@@ -344,7 +356,7 @@ function psb_znrm2_vect(x, desc_a, info,global)  result(res)
         res = res - sqrt(zone - dd*(abs(x%v%v(idx))/res)**2)
       end do
     end if
-    else 	    
+  else 	    
     res = dzero
   end if
 
@@ -392,15 +404,16 @@ end function psb_znrm2_vect
 !!$
 !
 ! Subroutine: psb_znrm2vs
-!    Forms the norm2 of a distributed vector,
+!    Computes the norm2 of a distributed vector, subroutine version
 !
-!    norm2 := sqrt ( X**T * X)
+!    norm2 := sqrt ( X**C * X)
 !
 ! Arguments:
 !    res    -  real                  The result.
 !    x(:)   -  complex               The input vector containing the entries of X.
 !    desc_a -  type(psb_desc_type).  The communication descriptor.
 !    info   -  integer.              Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 subroutine psb_znrm2vs(res, x, desc_a, info,global)
   use psb_desc_mod
diff --git a/base/psblas/psb_znrmi.f90 b/base/psblas/psb_znrmi.f90
index 9e0440ff..19b071e4 100644
--- a/base/psblas/psb_znrmi.f90
+++ b/base/psblas/psb_znrmi.f90
@@ -32,14 +32,15 @@
 ! File: psb_znrmi.f90
 !
 ! Function: psb_znrmi
-!    Forms the infinity norm of a sparse matrix,                           
+!    Computes the infinity norm of a sparse matrix,                           
 !
-!    nrmi := max_i(abs(sum(A(i,:))))                                              
+!    nrmi := max_i(sum(abs(A(i,:))))                                              
 !
 ! Arguments:
 !    a      -  type(psb_zspmat_type).   The sparse matrix containing A.
 !    desc_a -  type(psb_desc_type).     The communication descriptor.
 !    info   -  integer.                   Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_znrmi(a,desc_a,info,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_znrmi
diff --git a/base/psblas/psb_zspmm.f90 b/base/psblas/psb_zspmm.f90
index 83c42bc0..add3205b 100644
--- a/base/psblas/psb_zspmm.f90
+++ b/base/psblas/psb_zspmm.f90
@@ -31,6 +31,235 @@
 !    
 ! File: psb_zspmm.f90
 !
+!
+! Subroutine: psb_zspm_vect
+!     Performs one of the distributed matrix-vector operations
+!
+!     Y := alpha * Pr * A * Pc * X  + beta * Y,  or
+!
+!     Y := alpha * Pr * A' * Pr * X  + beta * Y,
+!
+!  alpha and beta are scalars, X and Y are distributed
+!  vectors and A is a M-by-N distributed matrix.
+!
+! Arguments:   
+!    alpha   -  complex                The scalar alpha.
+!    a       -  type(psb_zspmat_type). The sparse matrix containing A.
+!    x       -  type(psb_z_vect_type) The input vector containing the entries of ( X ).
+!    beta    -  complex                The scalar beta.
+!    y       -  type(psb_z_vect_type) The input vector containing the entries of ( Y ).
+!    desc_a  -  type(psb_desc_type).   The communication descriptor.
+!    info    -  integer.               Return code
+!    trans   -  character(optional).   Whether A or A'. Default:  'N' 
+!    work(:) -  complex,(optional).    Working area.
+!    doswap  -  logical(optional).     Whether to performe halo updates.
+! 
+subroutine  psb_zspmv_vect(alpha,a,x,beta,y,desc_a,info,&
+     & trans, work, doswap)   
+  use psb_base_mod, psb_protect_name => psb_zspmv_vect
+  use psi_mod
+  implicit none
+
+  complex(psb_dpk_), intent(in)            :: alpha, beta
+  type(psb_z_vect_type), intent(inout)     :: x
+  type(psb_z_vect_type), intent(inout)     :: y
+  type(psb_zspmat_type), intent(in)        :: a
+  type(psb_desc_type), intent(in)          :: desc_a
+  integer(psb_ipk_), intent(out)                     :: info
+  complex(psb_dpk_), optional, target, intent(inout) :: work(:)
+  character, intent(in), optional          :: trans
+  logical, intent(in), optional            :: doswap
+
+  ! locals
+  integer(psb_ipk_) :: ictxt, np, me,&
+       & err_act, iix, jjx, iia, jja,  nrow, ncol, lldx, lldy, &
+       & liwork, iiy, jjy, ib, ip, idx
+  integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n, ia, ja
+  integer(psb_ipk_), parameter       :: nb=4
+  complex(psb_dpk_), pointer :: iwork(:), xp(:), yp(:)
+  complex(psb_dpk_), allocatable :: xvsave(:)
+  character                :: trans_
+  character(len=20)        :: name, ch_err
+  logical                  :: aliw, doswap_
+  integer(psb_ipk_) :: debug_level, debug_unit
+
+  name='psb_zspmv'
+  info=psb_success_
+  call psb_erractionsave(err_act)
+  if  (psb_errstatus_fatal()) then
+    info = psb_err_internal_error_ ;    goto 9999
+  end if
+  debug_unit  = psb_get_debug_unit()
+  debug_level = psb_get_debug_level()
+
+  ictxt=desc_a%get_context()
+  call psb_info(ictxt, me, np)
+  if (np == -1) then
+    info = psb_err_context_error_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  if (.not.allocated(x%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+  if (.not.allocated(y%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  if (present(doswap)) then
+    doswap_ = doswap
+  else
+    doswap_ = .true.
+  endif
+
+  if (present(trans)) then     
+    trans_ = psb_toupper(trans)
+  else
+    trans_ = 'N'
+  endif
+  if ( (trans_ == 'N').or.(trans_ == 'T')&
+       & .or.(trans_ == 'C')) then
+  else
+    info = psb_err_iarg_invalid_value_
+    call psb_errpush(info,name)
+    goto 9999
+  end if
+
+  m    = desc_a%get_global_rows()
+  n    = desc_a%get_global_cols()
+  nrow = desc_a%get_local_rows()
+  ncol = desc_a%get_local_cols()
+  lldx = x%get_nrows()
+  lldy = y%get_nrows()
+  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
+  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
+
+  if (psb_errstatus_fatal()) then 
+    info=psb_err_from_subroutine_
+    ch_err='reall'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  iwork => null()
+  ! check for presence/size of a work area
+  liwork= 2*ncol
+
+  if (present(work)) then
+    if (size(work) >= liwork) then
+      aliw =.false.
+    else
+      aliw=.true.
+    endif
+  else
+    aliw=.true.
+  end if
+
+  if (aliw) then
+    allocate(iwork(liwork),stat=info)
+    if(info /= psb_success_) then
+      info=psb_err_from_subroutine_
+      ch_err='Allocate'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+    end if
+  else
+    iwork => work
+  endif
+
+  if (debug_level >= psb_debug_comp_) &
+       & write(debug_unit,*) me,' ',trim(name),' Allocated work ', info
+
+  if (trans_ == 'N') then
+    !  Matrix is not transposed
+
+    if (doswap_) then
+      call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
+           & zzero,x%v,desc_a,iwork,info,data=psb_comm_halo_)
+    end if
+
+    call psb_csmm(alpha,a,x,beta,y,info)
+
+    if(info /= psb_success_) then
+      info = psb_err_from_subroutine_non_
+      call psb_errpush(info,name)
+      goto 9999
+    end if
+
+  else
+    !  Matrix is transposed
+
+    !
+    ! Non-empty overlap, need a buffer to hold
+    ! the entries updated with average operator.
+    ! Why the average? because in this way they will contribute
+    ! with a proper scale factor (1/np) to the overall product.
+    ! 
+    call psi_ovrl_save(x%v,xvsave,desc_a,info)
+    if (info == psb_success_) call psi_ovrl_upd(x%v,desc_a,psb_avg_,info)
+
+    if (beta /= zzero) call y%set(zzero,nrow+1,ncol)
+    !  local Matrix-vector product
+    if (info == psb_success_) call psb_csmm(alpha,a,x,beta,y,info,trans=trans_)
+
+    if (debug_level >= psb_debug_comp_) &
+         & write(debug_unit,*) me,' ',trim(name),' csmm ', info
+
+    if (info == psb_success_) call psi_ovrl_restore(x%v,xvsave,desc_a,info)
+    if (info /= psb_success_) then
+      info = psb_err_from_subroutine_
+      ch_err='psb_csmm'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+    end if
+
+    if (doswap_) then
+      call psi_swaptran(ior(psb_swap_send_,psb_swap_recv_),&
+           & zone,y%v,desc_a,iwork,info)
+      if (info == psb_success_) call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
+           & zone,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
+
+      if (debug_level >= psb_debug_comp_) &
+           & write(debug_unit,*) me,' ',trim(name),' swaptran ', info
+      if(info /= psb_success_) then
+        info = psb_err_from_subroutine_
+        ch_err='PSI_SwapTran'
+        call psb_errpush(info,name,a_err=ch_err)
+        goto 9999
+      end if
+    end if
+
+  end if
+
+  if (aliw) deallocate(iwork,stat=info)
+  if (debug_level >= psb_debug_comp_) &
+       & write(debug_unit,*) me,' ',trim(name),' deallocat ',aliw, info
+  if(info /= psb_success_) then
+    info = psb_err_from_subroutine_
+    ch_err='Deallocate iwork'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  nullify(iwork)
+
+  call psb_erractionrestore(err_act)
+  if (debug_level >= psb_debug_comp_) then 
+    call psb_barrier(ictxt)
+    write(debug_unit,*) me,' ',trim(name),' Returning '
+  endif
+  return  
+
+9999 call psb_error_handler(ictxt,err_act)
+
+  return
+end subroutine psb_zspmv_vect
+!
 ! Subroutine: psb_zspmm
 !     Performs one of the distributed matrix-vector operations
 !
@@ -356,10 +585,6 @@ subroutine  psb_zspmm(alpha,a,x,beta,y,desc_a,info,&
 
   return
 end subroutine psb_zspmm
-
-
-
-
 !!$ 
 !!$              Parallel Sparse BLAS  version 3.5
 !!$    (C) Copyright 2006-2018
@@ -670,211 +895,3 @@ subroutine  psb_zspmv(alpha,a,x,beta,y,desc_a,info,&
 
   return
 end subroutine psb_zspmv
-
-
-
-subroutine  psb_zspmv_vect(alpha,a,x,beta,y,desc_a,info,&
-     & trans, work, doswap)   
-  use psb_base_mod, psb_protect_name => psb_zspmv_vect
-  use psi_mod
-  implicit none
-
-  complex(psb_dpk_), intent(in)            :: alpha, beta
-  type(psb_z_vect_type), intent(inout)     :: x
-  type(psb_z_vect_type), intent(inout)     :: y
-  type(psb_zspmat_type), intent(in)        :: a
-  type(psb_desc_type), intent(in)          :: desc_a
-  integer(psb_ipk_), intent(out)                     :: info
-  complex(psb_dpk_), optional, target, intent(inout) :: work(:)
-  character, intent(in), optional          :: trans
-  logical, intent(in), optional            :: doswap
-
-  ! locals
-  integer(psb_ipk_) :: ictxt, np, me,&
-       & err_act, iix, jjx, iia, jja,  nrow, ncol, lldx, lldy, &
-       & liwork, iiy, jjy, ib, ip, idx
-  integer(psb_lpk_) :: ix, ijx, iy, ijy, m, n, ia, ja
-  integer(psb_ipk_), parameter       :: nb=4
-  complex(psb_dpk_), pointer :: iwork(:), xp(:), yp(:)
-  complex(psb_dpk_), allocatable :: xvsave(:)
-  character                :: trans_
-  character(len=20)        :: name, ch_err
-  logical                  :: aliw, doswap_
-  integer(psb_ipk_) :: debug_level, debug_unit
-
-  name='psb_zspmv'
-  info=psb_success_
-  call psb_erractionsave(err_act)
-  if  (psb_errstatus_fatal()) then
-    info = psb_err_internal_error_ ;    goto 9999
-  end if
-  debug_unit  = psb_get_debug_unit()
-  debug_level = psb_get_debug_level()
-
-  ictxt=desc_a%get_context()
-  call psb_info(ictxt, me, np)
-  if (np == -1) then
-    info = psb_err_context_error_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (.not.allocated(x%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-  if (.not.allocated(y%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (present(doswap)) then
-    doswap_ = doswap
-  else
-    doswap_ = .true.
-  endif
-
-  if (present(trans)) then     
-    trans_ = psb_toupper(trans)
-  else
-    trans_ = 'N'
-  endif
-  if ( (trans_ == 'N').or.(trans_ == 'T')&
-       & .or.(trans_ == 'C')) then
-  else
-    info = psb_err_iarg_invalid_value_
-    call psb_errpush(info,name)
-    goto 9999
-  end if
-
-  m    = desc_a%get_global_rows()
-  n    = desc_a%get_global_cols()
-  nrow = desc_a%get_local_rows()
-  ncol = desc_a%get_local_cols()
-  lldx = x%get_nrows()
-  lldy = y%get_nrows()
-  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
-  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
-
-  if (psb_errstatus_fatal()) then 
-    info=psb_err_from_subroutine_
-    ch_err='reall'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  iwork => null()
-  ! check for presence/size of a work area
-  liwork= 2*ncol
-
-  if (present(work)) then
-    if (size(work) >= liwork) then
-      aliw =.false.
-    else
-      aliw=.true.
-    endif
-  else
-    aliw=.true.
-  end if
-
-  if (aliw) then
-    allocate(iwork(liwork),stat=info)
-    if(info /= psb_success_) then
-      info=psb_err_from_subroutine_
-      ch_err='Allocate'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-    end if
-  else
-    iwork => work
-  endif
-
-  if (debug_level >= psb_debug_comp_) &
-       & write(debug_unit,*) me,' ',trim(name),' Allocated work ', info
-
-  if (trans_ == 'N') then
-    !  Matrix is not transposed
-
-    if (doswap_) then
-      call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
-           & zzero,x%v,desc_a,iwork,info,data=psb_comm_halo_)
-    end if
-
-    call psb_csmm(alpha,a,x,beta,y,info)
-
-    if(info /= psb_success_) then
-      info = psb_err_from_subroutine_non_
-      call psb_errpush(info,name)
-      goto 9999
-    end if
-
-  else
-    !  Matrix is transposed
-
-    !
-    ! Non-empty overlap, need a buffer to hold
-    ! the entries updated with average operator.
-    ! Why the average? because in this way they will contribute
-    ! with a proper scale factor (1/np) to the overall product.
-    ! 
-    call psi_ovrl_save(x%v,xvsave,desc_a,info)
-    if (info == psb_success_) call psi_ovrl_upd(x%v,desc_a,psb_avg_,info)
-
-    if (beta /= zzero) call y%set(zzero,nrow+1,ncol)
-    !  local Matrix-vector product
-    if (info == psb_success_) call psb_csmm(alpha,a,x,beta,y,info,trans=trans_)
-
-    if (debug_level >= psb_debug_comp_) &
-         & write(debug_unit,*) me,' ',trim(name),' csmm ', info
-
-    if (info == psb_success_) call psi_ovrl_restore(x%v,xvsave,desc_a,info)
-    if (info /= psb_success_) then
-      info = psb_err_from_subroutine_
-      ch_err='psb_csmm'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-    end if
-
-    if (doswap_) then
-      call psi_swaptran(ior(psb_swap_send_,psb_swap_recv_),&
-           & zone,y%v,desc_a,iwork,info)
-      if (info == psb_success_) call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
-           & zone,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
-
-      if (debug_level >= psb_debug_comp_) &
-           & write(debug_unit,*) me,' ',trim(name),' swaptran ', info
-      if(info /= psb_success_) then
-        info = psb_err_from_subroutine_
-        ch_err='PSI_SwapTran'
-        call psb_errpush(info,name,a_err=ch_err)
-        goto 9999
-      end if
-    end if
-
-  end if
-
-  if (aliw) deallocate(iwork,stat=info)
-  if (debug_level >= psb_debug_comp_) &
-       & write(debug_unit,*) me,' ',trim(name),' deallocat ',aliw, info
-  if(info /= psb_success_) then
-    info = psb_err_from_subroutine_
-    ch_err='Deallocate iwork'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  nullify(iwork)
-
-  call psb_erractionrestore(err_act)
-  if (debug_level >= psb_debug_comp_) then 
-    call psb_barrier(ictxt)
-    write(debug_unit,*) me,' ',trim(name),' Returning '
-  endif
-  return  
-
-9999 call psb_error_handler(ictxt,err_act)
-
-  return
-end subroutine psb_zspmv_vect
diff --git a/base/psblas/psb_zspnrm1.f90 b/base/psblas/psb_zspnrm1.f90
index 3c292a74..7bc5fa15 100644
--- a/base/psblas/psb_zspnrm1.f90
+++ b/base/psblas/psb_zspnrm1.f90
@@ -40,6 +40,7 @@
 !    a      -  type(psb_zspmat_type).   The sparse matrix containing A.
 !    desc_a -  type(psb_desc_type).     The communication descriptor.
 !    info   -  integer.                   Return code
+!    global -  logical(optional)    Whether to perform the global reduction, default: .true.
 !
 function psb_zspnrm1(a,desc_a,info,global)  result(res)
   use psb_base_mod, psb_protect_name => psb_zspnrm1
diff --git a/base/psblas/psb_zspsm.f90 b/base/psblas/psb_zspsm.f90
index 13e00d3f..63cbe783 100644
--- a/base/psblas/psb_zspsm.f90
+++ b/base/psblas/psb_zspsm.f90
@@ -31,6 +31,203 @@
 !    
 ! File: psb_zspsm.f90
 !
+!
+! Subroutine: psb_zsps_vect
+!  Performs one of the distributed matrix-vector operations
+!
+!     Y := alpha * Pr * A-1 * Pc * X + beta * Y,   or
+!
+!     Y := alpha * D * Pr * A-1 * Pc * X + beta * Y, or
+!
+!     Y := alpha * Pr * A-1 * Pc * D * X + beta * Y,  or
+!
+!     Y := alpha * Pr * A-T * Pc * X + beta * Y,   or
+!
+!     Y := alpha * D * Pr * A-T * Pc * X + beta * Y, or
+!
+!     Y := alpha * Pr * A-T * Pc * D * X + beta * Y,  or
+!
+!  X is a distributed
+!  vector and T is a M-by-M distributed triangular matrix.
+!
+!
+! Arguments:   
+!    alpha   -  complex.               The scalar alpha.
+!    a       -  type(psb_zspmat_type). The sparse matrix containing A.
+!    x       -  type(psb_z_vect_type) The input vector containing the entries of ( X ).
+!    beta    -  complex                The scalar beta.
+!    y       -  type(psb_z_vect_type) The input vector containing the entries of ( Y ).
+!    desc_a  -  type(psb_desc_type).   The communication descriptor.
+!    info    -  integer.               Return code
+!    trans   -  character(optional).   Whether A or A'. If not present 'N' is assumed.
+!    scale   -  character(optional).   Specify some type of operation with
+!                                      the diagonal matrix D.
+!    choice  -  integer(optional).     The kind of update to perform on overlap elements.
+!    d(:)    -  complex, optional      Matrix for diagonal scaling.
+!    work(:) -  complex, optional      Working area.
+! 
+subroutine  psb_zspsv_vect(alpha,a,x,beta,y,desc_a,info,&
+     & trans, scale, choice, diag, work)   
+  use psb_base_mod, psb_protect_name => psb_zspsv_vect
+  use psi_mod
+  implicit none 
+
+  complex(psb_dpk_), intent(in)           :: alpha, beta
+  type(psb_z_vect_type), intent(inout)    :: x
+  type(psb_z_vect_type), intent(inout)    :: y
+  type(psb_zspmat_type), intent(inout)    :: a
+  type(psb_desc_type), intent(in)         :: desc_a
+  integer(psb_ipk_), intent(out)                    :: info
+  type(psb_z_vect_type), intent(inout), optional  :: diag
+  complex(psb_dpk_), optional, target, intent(inout) :: work(:)
+  character, intent(in), optional         :: trans, scale
+  integer(psb_ipk_), intent(in), optional           :: choice
+
+  ! locals
+  integer(psb_ipk_) :: ictxt, np, me, &
+       & err_act, iix, jjx, ia, ja, iia, jja, lldx,lldy, choice_,&
+       & ix, iy, ik, jx, jy, i, lld,&
+       & m, nrow, ncol, liwork, llwork, iiy, jjy, idx, ndm
+
+  character                :: lscale
+  integer(psb_ipk_), parameter       :: nb=4
+  complex(psb_dpk_),pointer :: iwork(:), xp(:), yp(:)
+  character                :: itrans
+  character(len=20)        :: name, ch_err
+  logical                  :: aliw
+
+  name='psb_zspsv_vect'
+  info=psb_success_
+  call psb_erractionsave(err_act)
+  if  (psb_errstatus_fatal()) then
+    info = psb_err_internal_error_ ;    goto 9999
+  end if
+
+  ictxt=desc_a%get_context()
+
+  call psb_info(ictxt, me, np)
+  if (np == -1) then
+    info = psb_err_context_error_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+  if (.not.allocated(x%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+  if (.not.allocated(y%v)) then 
+    info = psb_err_invalid_vect_state_
+    call psb_errpush(info,name)
+    goto 9999
+  endif
+
+
+  if (present(choice)) then     
+    choice_ = choice
+  else
+    choice_ = psb_avg_
+  endif
+
+  if (present(scale)) then     
+    lscale = psb_toupper(scale)
+  else
+    lscale = 'U'
+  endif
+
+  if (present(trans)) then     
+    itrans = psb_toupper(trans)
+    if((itrans == 'N').or.(itrans == 'T').or.(itrans == 'C')) then
+      ! Ok
+    else
+      info = psb_err_iarg_invalid_value_
+      call psb_errpush(info,name)
+      goto 9999
+    end if
+  else
+    itrans = 'N'
+  endif
+
+  m    = desc_a%get_global_rows()
+  nrow = desc_a%get_local_rows()
+  ncol = desc_a%get_local_cols()
+  lldx = x%get_nrows()
+  lldy = y%get_nrows()
+  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
+  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
+
+  if (psb_errstatus_fatal()) then 
+    info=psb_err_from_subroutine_
+    ch_err='reall'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  iwork => null()
+  ! check for presence/size of a work area
+  liwork= 2*ncol
+
+  if (present(work)) then     
+    if (size(work) >= liwork) then 
+      aliw =.false.
+    else 
+      aliw=.true.
+    endif
+  else
+    aliw=.true.
+  end if
+
+  if (aliw) then 
+    allocate(iwork(liwork),stat=info)
+    if(info /= psb_success_) then
+      info=psb_err_from_subroutine_
+      ch_err='psb_realloc'
+      call psb_errpush(info,name,a_err=ch_err)
+      goto 9999
+    end if
+  else
+    iwork => work
+  endif
+
+  iwork(1)=0.d0
+
+  ! Perform local triangular system solve
+  if (present(diag)) then 
+    call a%spsm(alpha,x,beta,y,info,scale=scale,d=diag,trans=trans)    
+  else
+    call a%spsm(alpha,x,beta,y,info,scale=scale,trans=trans)    
+  end if
+  if(info /= psb_success_) then
+    info = psb_err_from_subroutine_
+    ch_err='dcssm'
+    call psb_errpush(info,name,a_err=ch_err)
+    goto 9999
+  end if
+
+  ! update overlap elements
+  if (choice_ > 0) then
+    call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
+         & zone,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
+
+
+    if (info == psb_success_) call psi_ovrl_upd(y%v,desc_a,choice_,info)
+    if (info /= psb_success_) then
+      call psb_errpush(psb_err_from_subroutine_,name,a_err='Inner updates')
+      goto 9999
+    end if
+  end if
+
+  if (aliw) deallocate(iwork)
+
+  call psb_erractionrestore(err_act)
+  return  
+
+9999 call psb_error_handler(ictxt,err_act)
+
+  return
+end subroutine psb_zspsv_vect
+!
 ! Subroutine: psb_zspsm
 !  Performs one of the distributed matrix-vector operations
 !
@@ -283,38 +480,6 @@ subroutine  psb_zspsm(alpha,a,x,beta,y,desc_a,info,&
 
   return
 end subroutine psb_zspsm
-
-!!$ 
-!!$              Parallel Sparse BLAS  version 3.5
-!!$    (C) Copyright 2006-2018
-!!$                       Salvatore Filippone    University of Rome Tor Vergata
-!!$                       Alfredo Buttari      
-!!$ 
-!!$  Redistribution and use in source and binary forms, with or without
-!!$  modification, are permitted provided that the following conditions
-!!$  are met:
-!!$    1. Redistributions of source code must retain the above copyright
-!!$       notice, this list of conditions and the following disclaimer.
-!!$    2. Redistributions in binary form must reproduce the above copyright
-!!$       notice, this list of conditions, and the following disclaimer in the
-!!$       documentation and/or other materials provided with the distribution.
-!!$    3. The name of the PSBLAS group or the names of its contributors may
-!!$       not be used to endorse or promote products derived from this
-!!$       software without specific written permission.
-!!$ 
-!!$  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-!!$  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-!!$  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-!!$  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
-!!$  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-!!$  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-!!$  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-!!$  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-!!$  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-!!$  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-!!$  POSSIBILITY OF SUCH DAMAGE.
-!!$ 
-!!$
 !
 ! Subroutine: psb_zspsv
 !  Performs one of the distributed matrix-vector operations
@@ -545,166 +710,3 @@ subroutine  psb_zspsv(alpha,a,x,beta,y,desc_a,info,&
   return
 end subroutine psb_zspsv
 
-
-subroutine  psb_zspsv_vect(alpha,a,x,beta,y,desc_a,info,&
-     & trans, scale, choice, diag, work)   
-  use psb_base_mod, psb_protect_name => psb_zspsv_vect
-  use psi_mod
-  implicit none 
-
-  complex(psb_dpk_), intent(in)           :: alpha, beta
-  type(psb_z_vect_type), intent(inout)    :: x
-  type(psb_z_vect_type), intent(inout)    :: y
-  type(psb_zspmat_type), intent(inout)    :: a
-  type(psb_desc_type), intent(in)         :: desc_a
-  integer(psb_ipk_), intent(out)                    :: info
-  type(psb_z_vect_type), intent(inout), optional  :: diag
-  complex(psb_dpk_), optional, target, intent(inout) :: work(:)
-  character, intent(in), optional         :: trans, scale
-  integer(psb_ipk_), intent(in), optional           :: choice
-
-  ! locals
-  integer(psb_ipk_) :: ictxt, np, me, &
-       & err_act, iix, jjx, ia, ja, iia, jja, lldx,lldy, choice_,&
-       & ix, iy, ik, jx, jy, i, lld,&
-       & m, nrow, ncol, liwork, llwork, iiy, jjy, idx, ndm
-
-  character                :: lscale
-  integer(psb_ipk_), parameter       :: nb=4
-  complex(psb_dpk_),pointer :: iwork(:), xp(:), yp(:)
-  character                :: itrans
-  character(len=20)        :: name, ch_err
-  logical                  :: aliw
-
-  name='psb_sspsv'
-  info=psb_success_
-  call psb_erractionsave(err_act)
-  if  (psb_errstatus_fatal()) then
-    info = psb_err_internal_error_ ;    goto 9999
-  end if
-
-  ictxt=desc_a%get_context()
-
-  call psb_info(ictxt, me, np)
-  if (np == -1) then
-    info = psb_err_context_error_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-  if (.not.allocated(x%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-  if (.not.allocated(y%v)) then 
-    info = psb_err_invalid_vect_state_
-    call psb_errpush(info,name)
-    goto 9999
-  endif
-
-
-  if (present(choice)) then     
-    choice_ = choice
-  else
-    choice_ = psb_avg_
-  endif
-
-  if (present(scale)) then     
-    lscale = psb_toupper(scale)
-  else
-    lscale = 'U'
-  endif
-
-  if (present(trans)) then     
-    itrans = psb_toupper(trans)
-    if((itrans == 'N').or.(itrans == 'T').or.(itrans == 'C')) then
-      ! Ok
-    else
-      info = psb_err_iarg_invalid_value_
-      call psb_errpush(info,name)
-      goto 9999
-    end if
-  else
-    itrans = 'N'
-  endif
-
-  m    = desc_a%get_global_rows()
-  nrow = desc_a%get_local_rows()
-  ncol = desc_a%get_local_cols()
-  lldx = x%get_nrows()
-  lldy = y%get_nrows()
-  if ((info == 0).and.(lldx<ncol)) call x%reall(ncol,info)
-  if ((info == 0).and.(lldy<ncol)) call y%reall(ncol,info)
-
-  if (psb_errstatus_fatal()) then 
-    info=psb_err_from_subroutine_
-    ch_err='reall'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  iwork => null()
-  ! check for presence/size of a work area
-  liwork= 2*ncol
-
-  if (present(work)) then     
-    if (size(work) >= liwork) then 
-      aliw =.false.
-    else 
-      aliw=.true.
-    endif
-  else
-    aliw=.true.
-  end if
-
-  if (aliw) then 
-    allocate(iwork(liwork),stat=info)
-    if(info /= psb_success_) then
-      info=psb_err_from_subroutine_
-      ch_err='psb_realloc'
-      call psb_errpush(info,name,a_err=ch_err)
-      goto 9999
-    end if
-  else
-    iwork => work
-  endif
-
-  iwork(1)=0.d0
-
-  ! Perform local triangular system solve
-  if (present(diag)) then 
-    call a%spsm(alpha,x,beta,y,info,scale=scale,d=diag,trans=trans)    
-  else
-    call a%spsm(alpha,x,beta,y,info,scale=scale,trans=trans)    
-  end if
-  if(info /= psb_success_) then
-    info = psb_err_from_subroutine_
-    ch_err='dcssm'
-    call psb_errpush(info,name,a_err=ch_err)
-    goto 9999
-  end if
-
-  ! update overlap elements
-  if (choice_ > 0) then
-    call psi_swapdata(ior(psb_swap_send_,psb_swap_recv_),&
-         & zone,y%v,desc_a,iwork,info,data=psb_comm_ovr_)
-
-
-    if (info == psb_success_) call psi_ovrl_upd(y%v,desc_a,choice_,info)
-    if (info /= psb_success_) then
-      call psb_errpush(psb_err_from_subroutine_,name,a_err='Inner updates')
-      goto 9999
-    end if
-  end if
-
-  if (aliw) deallocate(iwork)
-
-  call psb_erractionrestore(err_act)
-  return  
-
-9999 call psb_error_handler(ictxt,err_act)
-
-  return
-end subroutine psb_zspsv_vect
-
diff --git a/base/serial/psb_camax_s.f90 b/base/serial/psb_camax_s.f90
index 7e608c8e..d9073d0f 100644
--- a/base/serial/psb_camax_s.f90
+++ b/base/serial/psb_camax_s.f90
@@ -31,7 +31,7 @@
 !  
 !
 ! Function: psb_camax_s
-!    Searches the absolute max of X.
+!    Computes the max absolute value of X.
 !
 !    normi := max(abs(X(i))  
 !
diff --git a/base/serial/psb_casum_s.f90 b/base/serial/psb_casum_s.f90
index cf799cdc..1c357cc8 100644
--- a/base/serial/psb_casum_s.f90
+++ b/base/serial/psb_casum_s.f90
@@ -31,9 +31,9 @@
 !  
 !
 ! Function: psb_casum_s
-!    Searches the absolute max of X.
+!    Computes the sum of  absolute values of X.
 !
-!    normi := max(abs(X(i))  
+!    asum := sum(abs(X(:))  
 !
 ! Arguments:
 !    n      - integer  size of X
diff --git a/base/serial/psb_damax_s.f90 b/base/serial/psb_damax_s.f90
index f376f0ba..66aef288 100644
--- a/base/serial/psb_damax_s.f90
+++ b/base/serial/psb_damax_s.f90
@@ -31,7 +31,7 @@
 !  
 !
 ! Function: psb_damax_s
-!    Searches the absolute max of X.
+!    Computes the max absolute value of X.
 !
 !    normi := max(abs(X(i))  
 !
diff --git a/base/serial/psb_dasum_s.f90 b/base/serial/psb_dasum_s.f90
index 6fb6e4f6..1ab42614 100644
--- a/base/serial/psb_dasum_s.f90
+++ b/base/serial/psb_dasum_s.f90
@@ -31,9 +31,9 @@
 !  
 !
 ! Function: psb_dasum_s
-!    Searches the absolute max of X.
+!    Computes the sum of  absolute values of X.
 !
-!    normi := max(abs(X(i))  
+!    asum := sum(abs(X(:))  
 !
 ! Arguments:
 !    n      - integer  size of X
diff --git a/base/serial/psb_samax_s.f90 b/base/serial/psb_samax_s.f90
index da7fc93e..f2b1fee2 100644
--- a/base/serial/psb_samax_s.f90
+++ b/base/serial/psb_samax_s.f90
@@ -31,7 +31,7 @@
 !  
 !
 ! Function: psb_samax_s
-!    Searches the absolute max of X.
+!    Computes the max absolute value of X.
 !
 !    normi := max(abs(X(i))  
 !
diff --git a/base/serial/psb_sasum_s.f90 b/base/serial/psb_sasum_s.f90
index 7a3b44e2..c692cbd4 100644
--- a/base/serial/psb_sasum_s.f90
+++ b/base/serial/psb_sasum_s.f90
@@ -31,9 +31,9 @@
 !  
 !
 ! Function: psb_sasum_s
-!    Searches the absolute max of X.
+!    Computes the sum of  absolute values of X.
 !
-!    normi := max(abs(X(i))  
+!    asum := sum(abs(X(:))  
 !
 ! Arguments:
 !    n      - integer  size of X
diff --git a/base/serial/psb_zamax_s.f90 b/base/serial/psb_zamax_s.f90
index 8cac2e2e..4a408566 100644
--- a/base/serial/psb_zamax_s.f90
+++ b/base/serial/psb_zamax_s.f90
@@ -31,7 +31,7 @@
 !  
 !
 ! Function: psb_zamax_s
-!    Searches the absolute max of X.
+!    Computes the max absolute value of X.
 !
 !    normi := max(abs(X(i))  
 !
diff --git a/base/serial/psb_zasum_s.f90 b/base/serial/psb_zasum_s.f90
index f00e4347..35a043ed 100644
--- a/base/serial/psb_zasum_s.f90
+++ b/base/serial/psb_zasum_s.f90
@@ -31,9 +31,9 @@
 !  
 !
 ! Function: psb_zasum_s
-!    Searches the absolute max of X.
+!    Computes the sum of  absolute values of X.
 !
-!    normi := max(abs(X(i))  
+!    asum := sum(abs(X(:))  
 !
 ! Arguments:
 !    n      - integer  size of X
diff --git a/test/pargen/psb_d_pde3d.f90 b/test/pargen/psb_d_pde3d.f90
index 60cb9e9e..429e9a0e 100644
--- a/test/pargen/psb_d_pde3d.f90
+++ b/test/pargen/psb_d_pde3d.f90
@@ -214,8 +214,7 @@ contains
     ! Note: integer control variables going directly into an MPI call
     ! must be 4 bytes, i.e. psb_mpk_
     integer(psb_mpk_) :: npdims(3), npp, minfo
-    integer(psb_mpk_) :: npx,npy,npz, iamx,iamy,iamz
-    integer(psb_ipk_) :: mynx,myny,mynz
+    integer(psb_ipk_) :: npx,npy,npz, iamx,iamy,iamz,mynx,myny,mynz
     integer(psb_ipk_), allocatable :: bndx(:),bndy(:),bndz(:)
     ! Process grid
     integer(psb_ipk_) :: np, iam
@@ -236,9 +235,8 @@ contains
     call psb_erractionsave(err_act)
 
     call psb_info(ictxt, iam, np)
-    call psb_cd_set_large_threshold(1000)
-    call psb_cd_set_maxspace(-1)
-    
+
+
     if (present(f)) then 
       f_ => f
     else
@@ -373,49 +371,7 @@ contains
       ! the set of global indices it owns.
       ! 
       call psb_cdall(ictxt,desc_a,info,vl=myidx)
-
       
-      block
-        !
-        ! Test adjcncy methods 
-        ! 
-        integer(psb_mpk_), allocatable :: neighbours(:)
-        integer(psb_mpk_) :: cnt
-        logical, parameter :: debug_adj=.false.
-        if (debug_adj.and.(np > 1)) then 
-          cnt = 0
-          allocate(neighbours(np))
-          if (iamx < npx-1) then
-            cnt = cnt + 1 
-            call ijk2idx(neighbours(cnt),iamx+1,iamy,iamz,npx,npy,npz,base=0)
-          end if
-          if (iamy < npy-1) then
-            cnt = cnt + 1 
-            call ijk2idx(neighbours(cnt),iamx,iamy+1,iamz,npx,npy,npz,base=0)
-          end if
-          if (iamz < npz-1) then
-            cnt = cnt + 1 
-            call ijk2idx(neighbours(cnt),iamx,iamy,iamz+1,npx,npy,npz,base=0)
-          end if
-          if (iamx >0) then
-            cnt = cnt + 1 
-            call ijk2idx(neighbours(cnt),iamx-1,iamy,iamz,npx,npy,npz,base=0)
-          end if
-          if (iamy >0) then
-            cnt = cnt + 1 
-            call ijk2idx(neighbours(cnt),iamx,iamy-1,iamz,npx,npy,npz,base=0)
-          end if
-          if (iamz >0) then
-            cnt = cnt + 1 
-            call ijk2idx(neighbours(cnt),iamx,iamy,iamz-1,npx,npy,npz,base=0)
-          end if
-          call psb_realloc(cnt, neighbours,info)
-          call desc_a%set_p_adjcncy(neighbours)
-          write(0,*) iam,' Check on neighbours: ',desc_a%get_p_adjcncy()
-        end if
-      end block
-
-
     case default
       write(psb_err_unit,*) iam, 'Initialization error: should not get here'
       info = -1
diff --git a/test/pargen/runs/ppde.inp b/test/pargen/runs/ppde.inp
index 82f0bce6..a0f28e99 100644
--- a/test/pargen/runs/ppde.inp
+++ b/test/pargen/runs/ppde.inp
@@ -2,7 +2,7 @@
 BICGSTAB               Iterative method BICGSTAB CGS  BICG BICGSTABL RGMRES FCG CGR
 BJAC                   Preconditioner NONE  DIAG  BJAC 
 CSR                    Storage format for matrix A:  CSR COO 
-040                    Domain size (acutal system is this**3 (pde3d) or **2 (pde2d) )
+080                    Domain size (acutal system is this**3 (pde3d) or **2 (pde2d) )
 3                      Partition: 1 BLOCK  3 3D
 2                      Stopping criterion 1 2 
 0100                   MAXIT