Merge branch 'merge-paraggr' into merge-paraggr-newops

# Conflicts:
#	docs/html/index.html
#	docs/html/userhtml.css
#	docs/html/userhtml.html
#	docs/psblas-3.7.pdf
#	docs/src/psbrout.tex
merge-paraggr-newops
Salvatore Filippone 5 years ago
commit 77cdfd6cae

@ -1,4 +1,4 @@
PSBLAS library, version 3.6
PSBLAS library, version 3.7
===========================
The architecture of the Fortran 2003 sparse BLAS is described in:
@ -110,6 +110,25 @@ Fix all remaining bugs. Bugs? We don't have any! ;-)
The PSBLAS team.
---------------
Project lead:
Salvatore Filippone
Contributors (roughly reverse chronological order):
Soren Rasmussen
Zaak Beekman
Ambra Abdullahi Hassan
Pasqua D'Ambra
Alfredo Buttari
Daniela di Serafino
Michele Martone
Michele Colajanni
Fabio Cerioni
Stefano Maiolatesi
Dario Pascucci
RELATED SOFTWARE
----------------

@ -1,5 +1,21 @@
WHAT'S NEW
Version 3.7.0
1. Major change at API level: integer kinds reorganization.
Local indices are now of kind PSB_IPK_, whereas global indices
are of kind PSB_LPK_; at configure time it is possible to
choose them independently to be PSB_MPK_ (4 bytes) or PSB_EPK_
(8 bytes), with the constraint that LPK is always at least as
large as IPK (see the declaration sketch after this list).
2. The internals have also been reorganized heavily, with a much
more coherent design and split of functionalities into source
files.
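   A minimal declaration sketch of what the new kind split means in user code
   (psb_ipk_, psb_lpk_, psb_mpk_ and psb_epk_ are the kind constants named in
   item 1; the variable names and comments are illustrative only):

      use psb_base_mod
      integer(psb_ipk_) :: nr_local, info               ! local indices and return codes
      integer(psb_lpk_) :: nr_global                     ! global sizes/indices, at least as wide as ipk
      integer(psb_lpk_), allocatable :: irw(:), icl(:)   ! global coordinates used when building a matrix
      ! at configure time ipk and lpk each resolve to psb_mpk_ (4 bytes) or psb_epk_ (8 bytes)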
Version 3.6.1
1. Multiple improvements to CDASB.
2. Some additional methods required by the latest versions of MLD
Version 3.5.
1. New Krylov methods FCG and GCR are now available.
@ -86,29 +102,9 @@ The solution: either take out the -g option, or, if you really need to
debug, edit Make.inc to force -DMPI_H in place of -DMPI_MOD.
The PSBLAS team.
RELATED SOFTWARE
If you are looking for more sophisticated preconditioners, you may be
interested in the package MLD2P4 from http://github.com/sfilippone/mld2p4-2
Contact: https://github.com/sfilippone/psblas3
Contributors
Salvatore Filippone
Alfredo Buttari
Soren Rasmussen
Ambra Abdullahi Hassan
Pasqua D'Ambra
Daniela di Serafino
Michele Martone
Michele Colajanni
Fabio Cerioni
Stefano Maiolatesi
Dario Pascucci

@ -29,7 +29,12 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_covrl_restr
! These subroutines restore the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
!
subroutine psi_covrl_restr_vect(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_covrl_restr_vect
use psb_c_base_vect_mod

@ -29,6 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_covrl_restr
! These subroutines restore the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
subroutine psi_covrl_restrr1(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_covrl_restrr1

@ -29,6 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_covrl_save
! These subroutines save the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
subroutine psi_covrl_save_vect(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_covrl_save_vect
use psb_realloc_mod

@ -29,7 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! Subroutine: psi_covrl_save
! These subroutines save the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
subroutine psi_covrl_saver1(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_covrl_saver1

@ -29,7 +29,13 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_covrl_update
! These subroutines update the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap,
! or for the application of Additive Schwarz preconditioners.
!
!
!
subroutine psi_covrl_upd_vect(x,desc_a,update,info)
use psi_mod, psi_protect_name => psi_covrl_upd_vect
use psb_realloc_mod

@ -29,6 +29,12 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_covrl_update
! These subroutines update the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap,
! or for the application of Additive Schwarz preconditioners.
!
!
subroutine psi_covrl_updr1(x,desc_a,update,info)
use psi_mod, psi_protect_name => psi_covrl_updr1
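
A hedged sketch of how the save/update/restore families documented in the headers above fit together; the calling sequence is illustrative, using only the interfaces shown in these hunks, with the surrounding operation left as a comment:

      call psi_covrl_save_vect(x, xs, desc_a, info)     ! keep a copy of the overlap entries of x in xs
      ! ... operation that writes into the overlap (transpose product, Schwarz sweep) ...
      call psi_covrl_upd_vect(x, desc_a, update, info)  ! combine the contributions from all sharing processes
      call psi_covrl_restr_vect(x, xs, desc_a, info)    ! restore the saved entries from xs where needed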

@ -32,8 +32,10 @@
!
! File: psi_cswapdata.F90
!
! Subroutine: psi_cswapdatam
! Does the data exchange among processes. Essentially this is doing
!
!
! Subroutine: psi_cswapdata_vect
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
@ -43,12 +45,15 @@
! C complex(psb_spk_)
! Z complex(psb_dpk_)
! Basically the operation is as follows: on each process, we identify
! sections SND(Y) and RCV(Y); then we do a send on (PACK(SND(Y)));
! then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y
! but only on the elements involved in the UNPACK operation.
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
! then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y
! but only on the elements involved in the SCT operation.
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
! This version works on encapsulated vectors, and uses their methods to do GTH and SCT,
! so that special versions (e.g. GPU vectors) can override them.
!
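!
! A hedged per-neighbour sketch (pseudocode) of the above, using the gather/scatter
! methods of the encapsulated vector; the index arrays snd_idx/rcv_idx and the
! trailing MPI arguments are abbreviations, not the actual interfaces:
!    call y%gth(nesd, snd_idx, y%combuf(snd_pt:))        ! GTH: gather the entries to be sent
!    call mpi_isend(y%combuf(snd_pt), nesd, ... )        ! PACK+SEND to the neighbour
!    call mpi_irecv(y%combuf(rcv_pt), nerv, ... )        ! receive its contribution
!    call y%sct(nerv, rcv_idx, y%combuf(rcv_pt:), beta)  ! SCT: y(rcv_idx) = buf + beta*y(rcv_idx)
!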
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -69,10 +74,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - complex Choose overwrite or sum.
! y - type(psb_@x@_vect_type) The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - complex Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -84,14 +89,6 @@
!
!
!
!
! Subroutine: psi_cswapdata_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector.
!
!
!
subroutine psi_cswapdata_vect(flag,beta,y,desc_a,work,info,data)
use psi_mod, psb_protect_name => psi_cswapdata_vect
@ -263,7 +260,7 @@ subroutine psi_cswap_vidx_vect(iictxt,iicomm,flag,beta,y,idx, &
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
rcv_pt = 1+pnti+psb_n_elem_recv_
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
p2ptag = psb_complex_swap_tag
@ -426,7 +423,7 @@ end subroutine psi_cswap_vidx_vect
! Subroutine: psi_cswapdata_multivect
! Data exchange among processes.
!
! Takes care of Y an encaspulated vector.
! Takes care of Y an encapsulated multivector.
!
!
subroutine psi_cswapdata_multivect(flag,beta,y,desc_a,work,info,data)
@ -602,7 +599,7 @@ subroutine psi_cswap_vidx_multivect(iictxt,iicomm,flag,beta,y,idx, &
proc_to_comm = idx%v(pnti+psb_proc_id_)
nerv = idx%v(pnti+psb_n_elem_recv_)
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
p2ptag = psb_complex_swap_tag

@ -33,9 +33,9 @@
! File: psi_cswapdata.F90
!
! Subroutine: psi_cswapdatam
! Does the data exchange among processes. Essentially this is doing
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in out
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
! In all these subroutines X may be: I Integer
! S real(psb_spk_)
@ -49,6 +49,7 @@
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
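!
! A hedged sketch (pseudocode) of that two-level structure; the descriptor accessor
! and the dummy argument list below are illustrative, not the actual interfaces:
!    ! first routine: pick the index list requested through DATA (halo / ovrl / mov)
!    call desc_a%get_list(data_, d_idx, totxch, idxr, idxs, info)
!    ! second routine: perform the exchange on that list
!    call psi_cswapidxm(ictxt, icomm, flag, n, beta, y, d_idx, totxch, idxs, idxr, work, info)
!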
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -69,10 +70,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - complex Choose overwrite or sum.
! y(:,:) - complex The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - complex Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -236,7 +237,7 @@ subroutine psi_cswapidxm(iictxt,iicomm,flag,n,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = n*nerv
@ -349,7 +350,7 @@ subroutine psi_cswapidxm(iictxt,iicomm,flag,n,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
p2ptag = psb_complex_swap_tag
call mpi_irecv(rcvbuf(rcv_pt),n*nerv,&
@ -507,7 +508,7 @@ end subroutine psi_cswapidxm
!
!
! Subroutine: psi_cswapdatav
! Does the data exchange among processes. Essentially this is doing
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
@ -523,6 +524,7 @@ end subroutine psi_cswapidxm
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -543,10 +545,10 @@ end subroutine psi_cswapidxm
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:) - X The data area
! beta - complex Choose overwrite or sum.
! y(:) - complex The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - complex Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -721,7 +723,7 @@ subroutine psi_cswapidxv(iictxt,iicomm,flag,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = nerv
@ -835,7 +837,7 @@ subroutine psi_cswapidxv(iictxt,iicomm,flag,beta,y,idx, &
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
p2ptag = psb_complex_swap_tag
call mpi_irecv(rcvbuf(rcv_pt),nerv,&

@ -32,8 +32,8 @@
!
! File: psi_cswaptran.F90
!
! Subroutine: psi_cswaptranm
! Does the data exchange among processes. This is similar to Xswapdata, but
! Subroutine: psi_cswaptran_vect
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -47,12 +47,16 @@
! C complex(psb_spk_)
! Z complex(psb_dpk_)
! Basically the operation is as follows: on each process, we identify
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(SND(Y)));
! then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y
! but only on the elements involved in the UNPACK operation.
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
! then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y
! but only on the elements involved in the SCT operation.
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
! This version works on encapsulated vectors, and uses their methods to do GTH and SCT,
! so that special versions (e.g. GPU vectors) can override them.
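!
! A hedged per-neighbour sketch (pseudocode) of the transposed exchange: the roles of
! the send and receive lists are swapped with respect to swapdata, so halo entries are
! gathered and sent back to their owner, which scatter-adds them onto its own entries;
! names other than y%gth, y%sct and y%combuf are illustrative:
!    call y%gth(nerv, rcv_idx, y%combuf(rcv_pt:))        ! gather from the halo (receive-list) entries
!    call mpi_isend(y%combuf(rcv_pt), nerv, ... )        ! send them back to the owning process
!    call mpi_irecv(y%combuf(snd_pt), nesd, ... )        ! the owner receives the contributions
!    call y%sct(nesd, snd_idx, y%combuf(snd_pt:), beta)  ! y(snd_idx) = buf + beta*y(snd_idx)
!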
!
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -73,10 +77,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! beta - complex Choose overwrite or sum.
! y - type(psb_c_vect_type) The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - complex Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -87,13 +91,6 @@
! psb_comm_mov_ use ovr_mst_idx
!
!
!
! Subroutine: psi_cswaptran_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector.
!
!
subroutine psi_cswaptran_vect(flag,beta,y,desc_a,work,info,data)
use psi_mod, psb_protect_name => psi_cswaptran_vect
@ -171,7 +168,7 @@ end subroutine psi_cswaptran_vect
! Subroutine: psi_ctran_vidx_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector. Relies on the gather/scatter methods
! Takes care of Y an encapsulated vector. Relies on the gather/scatter methods
! of vectors.
!
! The real workhorse: the outer routine will only choose the index list
@ -269,7 +266,7 @@ subroutine psi_ctran_vidx_vect(iictxt,iicomm,flag,beta,y,idx,&
snd_pt = 1+pnti+nerv+psb_n_elem_send_
rcv_pt = 1+pnti+psb_n_elem_recv_
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
call mpi_irecv(y%combuf(snd_pt),nesd,&
@ -436,10 +433,10 @@ end subroutine psi_ctran_vidx_vect
!
!
!
! Subroutine: psi_cswaptran_vect
! Subroutine: psi_cswaptran_multivect
! Data exchange among processes.
!
! Takes care of Y an encaspulated vector.
! Takes care of Y an encapsulated multivector.
!
!
subroutine psi_cswaptran_multivect(flag,beta,y,desc_a,work,info,data)
@ -616,7 +613,7 @@ subroutine psi_ctran_vidx_multivect(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx%v(pnti+psb_proc_id_)
nerv = idx%v(pnti+psb_n_elem_recv_)
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),snd_pt
call mpi_irecv(y%combuf(snd_pt),n*nesd,&

@ -33,7 +33,7 @@
! File: psi_cswaptran.F90
!
! Subroutine: psi_cswaptranm
! Does the data exchange among processes. This is similar to Xswapdata, but
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -53,6 +53,7 @@
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -73,10 +74,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - complex Choose overwrite or sum.
! y(:,:) - complex The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - complex Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -241,7 +242,7 @@ subroutine psi_ctranidxm(iictxt,iicomm,flag,n,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = n*nerv
@ -359,7 +360,7 @@ subroutine psi_ctranidxm(iictxt,iicomm,flag,n,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
p2ptag = psb_complex_swap_tag
call mpi_irecv(sndbuf(snd_pt),n*nesd,&
@ -516,7 +517,7 @@ end subroutine psi_ctranidxm
!
!
! Subroutine: psi_cswaptranv
! Does the data exchange among processes. This is similar to Xswapdata, but
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -536,6 +537,7 @@ end subroutine psi_ctranidxm
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -556,10 +558,10 @@ end subroutine psi_ctranidxm
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:) - X The data area
! beta - complex Choose overwrite or sum.
! y(:) - complex The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - complex Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -734,7 +736,7 @@ subroutine psi_ctranidxv(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = nerv
@ -852,7 +854,7 @@ subroutine psi_ctranidxv(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
p2ptag = psb_complex_swap_tag
call mpi_irecv(sndbuf(snd_pt),nesd,&

@ -29,7 +29,12 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_dovrl_restr
! These subroutines restore the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
!
subroutine psi_dovrl_restr_vect(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_dovrl_restr_vect
use psb_d_base_vect_mod

@ -29,6 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_dovrl_restr
! These subroutines restore the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
subroutine psi_dovrl_restrr1(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_dovrl_restrr1

@ -29,6 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_dovrl_save
! These subroutines save the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
subroutine psi_dovrl_save_vect(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_dovrl_save_vect
use psb_realloc_mod

@ -29,7 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! Subroutine: psi_dovrl_save
! These subroutines save the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
subroutine psi_dovrl_saver1(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_dovrl_saver1

@ -29,7 +29,13 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_dovrl_update
! These subroutines update the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap,
! or for the application of Additive Schwarz preconditioners.
!
!
!
subroutine psi_dovrl_upd_vect(x,desc_a,update,info)
use psi_mod, psi_protect_name => psi_dovrl_upd_vect
use psb_realloc_mod

@ -29,6 +29,12 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_dovrl_update
! These subroutines update the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap,
! or for the application of Additive Schwarz preconditioners.
!
!
subroutine psi_dovrl_updr1(x,desc_a,update,info)
use psi_mod, psi_protect_name => psi_dovrl_updr1

@ -32,8 +32,10 @@
!
! File: psi_dswapdata.F90
!
! Subroutine: psi_dswapdatam
! Does the data exchange among processes. Essentially this is doing
!
!
! Subroutine: psi_dswapdata_vect
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
@ -43,12 +45,15 @@
! C complex(psb_spk_)
! Z complex(psb_dpk_)
! Basically the operation is as follows: on each process, we identify
! sections SND(Y) and RCV(Y); then we do a send on (PACK(SND(Y)));
! then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y
! but only on the elements involved in the UNPACK operation.
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
! then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y
! but only on the elements involved in the SCT operation.
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
! This version works on encapsulated vectors, and uses their methods to do GTH and SCT,
! so that special versions (e.g. GPU vectors) can override them.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -69,10 +74,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - real Choose overwrite or sum.
! y - type(psb_@x@_vect_type) The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - real Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -84,14 +89,6 @@
!
!
!
!
! Subroutine: psi_dswapdata_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector.
!
!
!
subroutine psi_dswapdata_vect(flag,beta,y,desc_a,work,info,data)
use psi_mod, psb_protect_name => psi_dswapdata_vect
@ -263,7 +260,7 @@ subroutine psi_dswap_vidx_vect(iictxt,iicomm,flag,beta,y,idx, &
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
rcv_pt = 1+pnti+psb_n_elem_recv_
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
p2ptag = psb_double_swap_tag
@ -426,7 +423,7 @@ end subroutine psi_dswap_vidx_vect
! Subroutine: psi_dswapdata_multivect
! Data exchange among processes.
!
! Takes care of Y an encaspulated vector.
! Takes care of Y an encapsulated multivector.
!
!
subroutine psi_dswapdata_multivect(flag,beta,y,desc_a,work,info,data)
@ -602,7 +599,7 @@ subroutine psi_dswap_vidx_multivect(iictxt,iicomm,flag,beta,y,idx, &
proc_to_comm = idx%v(pnti+psb_proc_id_)
nerv = idx%v(pnti+psb_n_elem_recv_)
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
p2ptag = psb_double_swap_tag

@ -33,9 +33,9 @@
! File: psi_dswapdata.F90
!
! Subroutine: psi_dswapdatam
! Does the data exchange among processes. Essentially this is doing
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in out
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
! In all these subroutines X may be: I Integer
! S real(psb_spk_)
@ -49,6 +49,7 @@
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -69,10 +70,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - real Choose overwrite or sum.
! y(:,:) - real The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - real Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -236,7 +237,7 @@ subroutine psi_dswapidxm(iictxt,iicomm,flag,n,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = n*nerv
@ -349,7 +350,7 @@ subroutine psi_dswapidxm(iictxt,iicomm,flag,n,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
p2ptag = psb_double_swap_tag
call mpi_irecv(rcvbuf(rcv_pt),n*nerv,&
@ -507,7 +508,7 @@ end subroutine psi_dswapidxm
!
!
! Subroutine: psi_dswapdatav
! Does the data exchange among processes. Essentially this is doing
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in out
! application environment. All the variants have the same structure
@ -523,6 +524,7 @@ end subroutine psi_dswapidxm
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -543,10 +545,10 @@ end subroutine psi_dswapidxm
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:) - X The data area
! beta - real Choose overwrite or sum.
! y(:) - real The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - real Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -721,7 +723,7 @@ subroutine psi_dswapidxv(iictxt,iicomm,flag,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = nerv
@ -835,7 +837,7 @@ subroutine psi_dswapidxv(iictxt,iicomm,flag,beta,y,idx, &
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
p2ptag = psb_double_swap_tag
call mpi_irecv(rcvbuf(rcv_pt),nerv,&

@ -32,8 +32,8 @@
!
! File: psi_dswaptran.F90
!
! Subroutine: psi_dswaptranm
! Does the data exchange among processes. This is similar to Xswapdata, but
! Subroutine: psi_dswaptran_vect
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -47,12 +47,16 @@
! C complex(psb_spk_)
! Z complex(psb_dpk_)
! Basically the operation is as follows: on each process, we identify
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(SND(Y)));
! then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y
! but only on the elements involved in the UNPACK operation.
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
! then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y
! but only on the elements involved in the SCT operation.
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
! This version works on encapsulated vectors, and uses their methods to do GTH and SCT,
! so that special versions (e.g. GPU vectors) can override them.
!
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -73,10 +77,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! beta - real Choose overwrite or sum.
! y - type(psb_d_vect_type) The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - real Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -87,13 +91,6 @@
! psb_comm_mov_ use ovr_mst_idx
!
!
!
! Subroutine: psi_dswaptran_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector.
!
!
subroutine psi_dswaptran_vect(flag,beta,y,desc_a,work,info,data)
use psi_mod, psb_protect_name => psi_dswaptran_vect
@ -171,7 +168,7 @@ end subroutine psi_dswaptran_vect
! Subroutine: psi_dtran_vidx_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector. Relies on the gather/scatter methods
! Takes care of Y an encapsulated vector. Relies on the gather/scatter methods
! of vectors.
!
! The real workhorse: the outer routine will only choose the index list
@ -269,7 +266,7 @@ subroutine psi_dtran_vidx_vect(iictxt,iicomm,flag,beta,y,idx,&
snd_pt = 1+pnti+nerv+psb_n_elem_send_
rcv_pt = 1+pnti+psb_n_elem_recv_
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
call mpi_irecv(y%combuf(snd_pt),nesd,&
@ -436,10 +433,10 @@ end subroutine psi_dtran_vidx_vect
!
!
!
! Subroutine: psi_dswaptran_vect
! Subroutine: psi_dswaptran_multivect
! Data exchange among processes.
!
! Takes care of Y an encaspulated vector.
! Takes care of Y an encapsulated multivector.
!
!
subroutine psi_dswaptran_multivect(flag,beta,y,desc_a,work,info,data)
@ -616,7 +613,7 @@ subroutine psi_dtran_vidx_multivect(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx%v(pnti+psb_proc_id_)
nerv = idx%v(pnti+psb_n_elem_recv_)
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),snd_pt
call mpi_irecv(y%combuf(snd_pt),n*nesd,&

@ -33,7 +33,7 @@
! File: psi_dswaptran.F90
!
! Subroutine: psi_dswaptranm
! Does the data exchange among processes. This is similar to Xswapdata, but
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -53,6 +53,7 @@
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -73,10 +74,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - real Choose overwrite or sum.
! y(:,:) - real The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - real Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -241,7 +242,7 @@ subroutine psi_dtranidxm(iictxt,iicomm,flag,n,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = n*nerv
@ -359,7 +360,7 @@ subroutine psi_dtranidxm(iictxt,iicomm,flag,n,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
p2ptag = psb_double_swap_tag
call mpi_irecv(sndbuf(snd_pt),n*nesd,&
@ -516,7 +517,7 @@ end subroutine psi_dtranidxm
!
!
! Subroutine: psi_dswaptranv
! Does the data exchange among processes. This is similar to Xswapdata, but
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -536,6 +537,7 @@ end subroutine psi_dtranidxm
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -556,10 +558,10 @@ end subroutine psi_dtranidxm
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:) - X The data area
! beta - real Choose overwrite or sum.
! y(:) - real The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - real Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -734,7 +736,7 @@ subroutine psi_dtranidxv(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = nerv
@ -852,7 +854,7 @@ subroutine psi_dtranidxv(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
p2ptag = psb_double_swap_tag
call mpi_irecv(sndbuf(snd_pt),nesd,&

@ -29,6 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_eovrl_restr
! These subroutines restore the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
subroutine psi_eovrl_restrr1(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_eovrl_restrr1

@ -29,7 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! Subroutine: psi_eovrl_save
! These subroutines save the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
subroutine psi_eovrl_saver1(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_eovrl_saver1

@ -29,6 +29,12 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_eovrl_update
! These subroutines update the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap,
! or for the application of Additive Schwarz preconditioners.
!
!
subroutine psi_eovrl_updr1(x,desc_a,update,info)
use psi_mod, psi_protect_name => psi_eovrl_updr1

@ -33,9 +33,9 @@
! File: psi_eswapdata.F90
!
! Subroutine: psi_eswapdatam
! Does the data exchange among processes. Essentially this is doing
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in out
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
! In all these subroutines X may be: I Integer
! S real(psb_spk_)
@ -49,6 +49,7 @@
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -69,10 +70,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - integer Choose overwrite or sum.
! y(:,:) - integer The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - integer Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -236,7 +237,7 @@ subroutine psi_eswapidxm(iictxt,iicomm,flag,n,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = n*nerv
@ -349,7 +350,7 @@ subroutine psi_eswapidxm(iictxt,iicomm,flag,n,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
p2ptag = psb_int8_swap_tag
call mpi_irecv(rcvbuf(rcv_pt),n*nerv,&
@ -507,7 +508,7 @@ end subroutine psi_eswapidxm
!
!
! Subroutine: psi_eswapdatav
! Does the data exchange among processes. Essentially this is doing
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
@ -523,6 +524,7 @@ end subroutine psi_eswapidxm
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -543,10 +545,10 @@ end subroutine psi_eswapidxm
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:) - X The data area
! beta - integer Choose overwrite or sum.
! y(:) - integer The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - integer Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -721,7 +723,7 @@ subroutine psi_eswapidxv(iictxt,iicomm,flag,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = nerv
@ -835,7 +837,7 @@ subroutine psi_eswapidxv(iictxt,iicomm,flag,beta,y,idx, &
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
p2ptag = psb_int8_swap_tag
call mpi_irecv(rcvbuf(rcv_pt),nerv,&

@ -33,7 +33,7 @@
! File: psi_eswaptran.F90
!
! Subroutine: psi_eswaptranm
! Does the data exchange among processes. This is similar to Xswapdata, but
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -53,6 +53,7 @@
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -73,10 +74,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - integer Choose overwrite or sum.
! y(:,:) - integer The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - integer Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -241,7 +242,7 @@ subroutine psi_etranidxm(iictxt,iicomm,flag,n,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = n*nerv
@ -359,7 +360,7 @@ subroutine psi_etranidxm(iictxt,iicomm,flag,n,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
p2ptag = psb_int8_swap_tag
call mpi_irecv(sndbuf(snd_pt),n*nesd,&
@ -516,7 +517,7 @@ end subroutine psi_etranidxm
!
!
! Subroutine: psi_eswaptranv
! Does the data exchange among processes. This is similar to Xswapdata, but
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -536,6 +537,7 @@ end subroutine psi_etranidxm
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -556,10 +558,10 @@ end subroutine psi_etranidxm
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:) - X The data area
! beta - integer Choose overwrite or sum.
! y(:) - integer The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - integer Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -734,7 +736,7 @@ subroutine psi_etranidxv(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = nerv
@ -852,7 +854,7 @@ subroutine psi_etranidxv(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
p2ptag = psb_int8_swap_tag
call mpi_irecv(sndbuf(snd_pt),nesd,&

@ -29,7 +29,12 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_iovrl_restr
! These subroutines restore the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
!
subroutine psi_iovrl_restr_vect(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_iovrl_restr_vect
use psb_i_base_vect_mod

@ -29,6 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_iovrl_save
! These subroutines save the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
subroutine psi_iovrl_save_vect(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_iovrl_save_vect
use psb_realloc_mod

@ -29,7 +29,13 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_iovrl_update
! These subroutines update the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap,
! or for the application of Additive Schwarz preconditioners.
!
!
!
subroutine psi_iovrl_upd_vect(x,desc_a,update,info)
use psi_mod, psi_protect_name => psi_iovrl_upd_vect
use psb_realloc_mod

@ -32,8 +32,10 @@
!
! File: psi_iswapdata.F90
!
! Subroutine: psi_iswapdatam
! Does the data exchange among processes. Essentially this is doing
!
!
! Subroutine: psi_iswapdata_vect
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
@ -43,12 +45,15 @@
! C complex(psb_spk_)
! Z complex(psb_dpk_)
! Basically the operation is as follows: on each process, we identify
! sections SND(Y) and RCV(Y); then we do a send on (PACK(SND(Y)));
! then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y
! but only on the elements involved in the UNPACK operation.
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
! then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y
! but only on the elements involved in the SCT operation.
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
! This version works on encapsulated vectors, and uses their methods to do GTH and SCT,
! so that special versions (e.g. GPU vectors) can override them.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -69,10 +74,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - integer Choose overwrite or sum.
! y - type(psb_@x@_vect_type) The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - integer Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -84,14 +89,6 @@
!
!
!
!
! Subroutine: psi_iswapdata_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector.
!
!
!
subroutine psi_iswapdata_vect(flag,beta,y,desc_a,work,info,data)
use psi_mod, psb_protect_name => psi_iswapdata_vect
@ -263,7 +260,7 @@ subroutine psi_iswap_vidx_vect(iictxt,iicomm,flag,beta,y,idx, &
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
rcv_pt = 1+pnti+psb_n_elem_recv_
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
p2ptag = psb_int_swap_tag
@ -426,7 +423,7 @@ end subroutine psi_iswap_vidx_vect
! Subroutine: psi_iswapdata_multivect
! Data exchange among processes.
!
! Takes care of Y an encaspulated vector.
! Takes care of Y an encapsulated multivector.
!
!
subroutine psi_iswapdata_multivect(flag,beta,y,desc_a,work,info,data)
@ -602,7 +599,7 @@ subroutine psi_iswap_vidx_multivect(iictxt,iicomm,flag,beta,y,idx, &
proc_to_comm = idx%v(pnti+psb_proc_id_)
nerv = idx%v(pnti+psb_n_elem_recv_)
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
p2ptag = psb_int_swap_tag

@ -32,8 +32,8 @@
!
! File: psi_iswaptran.F90
!
! Subroutine: psi_iswaptranm
! Does the data exchange among processes. This is similar to Xswapdata, but
! Subroutine: psi_iswaptran_vect
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -47,12 +47,16 @@
! C complex(psb_spk_)
! Z complex(psb_dpk_)
! Basically the operation is as follows: on each process, we identify
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(SND(Y)));
! then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y
! but only on the elements involved in the UNPACK operation.
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
! then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y
! but only on the elements involved in the SCT operation.
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
! This version works on encapsulated vectors, and uses their methods to do GTH and SCT,
! so that special versions (e.g. GPU vectors) can override them.
!
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -73,10 +77,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! beta - integer Choose overwrite or sum.
! y - type(psb_i_vect_type) The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - integer Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -87,13 +91,6 @@
! psb_comm_mov_ use ovr_mst_idx
!
!
!
! Subroutine: psi_iswaptran_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector.
!
!
subroutine psi_iswaptran_vect(flag,beta,y,desc_a,work,info,data)
use psi_mod, psb_protect_name => psi_iswaptran_vect
@ -171,7 +168,7 @@ end subroutine psi_iswaptran_vect
! Subroutine: psi_itran_vidx_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector. Relies on the gather/scatter methods
! Takes care of Y an encapsulated vector. Relies on the gather/scatter methods
! of vectors.
!
! The real workhorse: the outer routine will only choose the index list
@ -269,7 +266,7 @@ subroutine psi_itran_vidx_vect(iictxt,iicomm,flag,beta,y,idx,&
snd_pt = 1+pnti+nerv+psb_n_elem_send_
rcv_pt = 1+pnti+psb_n_elem_recv_
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
call mpi_irecv(y%combuf(snd_pt),nesd,&
@ -436,10 +433,10 @@ end subroutine psi_itran_vidx_vect
!
!
!
! Subroutine: psi_iswaptran_vect
! Subroutine: psi_iswaptran_multivect
! Data exchange among processes.
!
! Takes care of Y an encaspulated vector.
! Takes care of Y an encapsulated multivector.
!
!
subroutine psi_iswaptran_multivect(flag,beta,y,desc_a,work,info,data)
@ -616,7 +613,7 @@ subroutine psi_itran_vidx_multivect(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx%v(pnti+psb_proc_id_)
nerv = idx%v(pnti+psb_n_elem_recv_)
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),snd_pt
call mpi_irecv(y%combuf(snd_pt),n*nesd,&

@ -29,7 +29,12 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_lovrl_restr
! These subroutines restore the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
!
subroutine psi_lovrl_restr_vect(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_lovrl_restr_vect
use psb_l_base_vect_mod

@ -29,6 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_lovrl_save
! These subroutines save the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
subroutine psi_lovrl_save_vect(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_lovrl_save_vect
use psb_realloc_mod

@ -29,7 +29,13 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_lovrl_update
! These subroutines update the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap,
! or for the application of Additive Schwarz preconditioners.
!
!
!
subroutine psi_lovrl_upd_vect(x,desc_a,update,info)
use psi_mod, psi_protect_name => psi_lovrl_upd_vect
use psb_realloc_mod

@ -32,8 +32,10 @@
!
! File: psi_lswapdata.F90
!
! Subroutine: psi_lswapdatam
! Does the data exchange among processes. Essentially this is doing
!
!
! Subroutine: psi_lswapdata_vect
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
@ -43,12 +45,15 @@
! C complex(psb_spk_)
! Z complex(psb_dpk_)
! Basically the operation is as follows: on each process, we identify
! sections SND(Y) and RCV(Y); then we do a send on (PACK(SND(Y)));
! then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y
! but only on the elements involved in the UNPACK operation.
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
! then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y
! but only on the elements involved in the SCT operation.
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
! This version works on encapsulated vectors, and uses their methods to do GTH and SCT,
! so that special versions (e.g. GPU vectors) can override them.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -69,10 +74,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - integer Choose overwrite or sum.
! y - type(psb_l_vect_type) The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - integer Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -84,14 +89,6 @@
!
!
!
!
! Subroutine: psi_lswapdata_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector.
!
!
!
subroutine psi_lswapdata_vect(flag,beta,y,desc_a,work,info,data)
use psi_mod, psb_protect_name => psi_lswapdata_vect
@ -263,7 +260,7 @@ subroutine psi_lswap_vidx_vect(iictxt,iicomm,flag,beta,y,idx, &
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
rcv_pt = 1+pnti+psb_n_elem_recv_
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
p2ptag = psb_long_swap_tag
@ -426,7 +423,7 @@ end subroutine psi_lswap_vidx_vect
! Subroutine: psi_lswapdata_multivect
! Data exchange among processes.
!
! Takes care of Y an encaspulated vector.
! Takes care of Y an encapsulated multivector.
!
!
subroutine psi_lswapdata_multivect(flag,beta,y,desc_a,work,info,data)
@ -602,7 +599,7 @@ subroutine psi_lswap_vidx_multivect(iictxt,iicomm,flag,beta,y,idx, &
proc_to_comm = idx%v(pnti+psb_proc_id_)
nerv = idx%v(pnti+psb_n_elem_recv_)
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
p2ptag = psb_long_swap_tag

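At the user level the BETA distinction spelled out above surfaces through the generic halo and overlap routines. A hypothetical fragment (assuming x is a vector already assembled on desc_a, and that the generic names dispatch to the type-specific routines documented in this patch):

  call psb_halo(x, desc_a, info)                    ! halo exchange: receive into halo indices, BETA=0
  call psb_ovrl(x, desc_a, info, update=psb_sum_)   ! overlap exchange: sum into owned indices, BETA=1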
@ -32,8 +32,8 @@
!
! File: psi_lswaptran.F90
!
! Subroutine: psi_lswaptranm
! Does the data exchange among processes. This is similar to Xswapdata, but
! Subroutine: psi_lswaptran_vect
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -47,12 +47,16 @@
! C complex(psb_spk_)
! Z complex(psb_dpk_)
! Basically the operation is as follows: on each process, we identify
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(SND(Y)));
! then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y
! but only on the elements involved in the UNPACK operation.
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
! then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y
! but only on the elements involved in the SCT operation.
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
! This version works on encapsulated vectors, and uses their methods to do GTH and SCT,
! so that special versions (e.g. GPU vectors) can override them.
!
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -73,10 +77,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! beta - integer Choose overwrite or sum.
! y - type(psb_l_vect_type) The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - integer Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -87,13 +91,6 @@
! psb_comm_mov_ use ovr_mst_idx
!
!
!
! Subroutine: psi_lswaptran_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector.
!
!
subroutine psi_lswaptran_vect(flag,beta,y,desc_a,work,info,data)
use psi_mod, psb_protect_name => psi_lswaptran_vect
@ -171,7 +168,7 @@ end subroutine psi_lswaptran_vect
! Subroutine: psi_ltran_vidx_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector. Relies on the gather/scatter methods
! Takes care of Y an encapsulated vector. Relies on the gather/scatter methods
! of vectors.
!
! The real workhorse: the outer routine will only choose the index list
@ -269,7 +266,7 @@ subroutine psi_ltran_vidx_vect(iictxt,iicomm,flag,beta,y,idx,&
snd_pt = 1+pnti+nerv+psb_n_elem_send_
rcv_pt = 1+pnti+psb_n_elem_recv_
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
call mpi_irecv(y%combuf(snd_pt),nesd,&
@ -436,10 +433,10 @@ end subroutine psi_ltran_vidx_vect
!
!
!
! Subroutine: psi_lswaptran_vect
! Subroutine: psi_lswaptran_multivect
! Data exchange among processes.
!
! Takes care of Y an encaspulated vector.
! Takes care of Y an encapsulated multivector.
!
!
subroutine psi_lswaptran_multivect(flag,beta,y,desc_a,work,info,data)
@ -616,7 +613,7 @@ subroutine psi_ltran_vidx_multivect(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx%v(pnti+psb_proc_id_)
nerv = idx%v(pnti+psb_n_elem_recv_)
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),snd_pt
call mpi_irecv(y%combuf(snd_pt),n*nesd,&

@ -29,6 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_movrl_restr
! These subroutines restore the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
subroutine psi_movrl_restrr1(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_movrl_restrr1

@ -29,7 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! Subroutine: psi_movrl_save
! These subroutines save the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
subroutine psi_movrl_saver1(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_movrl_saver1

@ -29,6 +29,12 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_movrl_update
! These subroutines update the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap,
! or for the application of Additive Schwarz preconditioners.
!
!
subroutine psi_movrl_updr1(x,desc_a,update,info)
use psi_mod, psi_protect_name => psi_movrl_updr1

@ -33,9 +33,9 @@
! File: psi_mswapdata.F90
!
! Subroutine: psi_mswapdatam
! Does the data exchange among processes. Essentially this is doing
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in out
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
! In all these subroutines X may be: I Integer
! S real(psb_spk_)
@ -49,6 +49,7 @@
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -69,10 +70,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - integer Choose overwrite or sum.
! y(:,:) - integer The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - integer Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -236,7 +237,7 @@ subroutine psi_mswapidxm(iictxt,iicomm,flag,n,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = n*nerv
@ -349,7 +350,7 @@ subroutine psi_mswapidxm(iictxt,iicomm,flag,n,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
p2ptag = psb_int4_swap_tag
call mpi_irecv(rcvbuf(rcv_pt),n*nerv,&
@ -507,7 +508,7 @@ end subroutine psi_mswapidxm
!
!
! Subroutine: psi_mswapdatav
! Does the data exchange among processes. Essentially this is doing
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
@ -523,6 +524,7 @@ end subroutine psi_mswapidxm
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -543,10 +545,10 @@ end subroutine psi_mswapidxm
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:) - X The data area
! beta - integer Choose overwrite or sum.
! y(:) - integer The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - integer Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -721,7 +723,7 @@ subroutine psi_mswapidxv(iictxt,iicomm,flag,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = nerv
@ -835,7 +837,7 @@ subroutine psi_mswapidxv(iictxt,iicomm,flag,beta,y,idx, &
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
p2ptag = psb_int4_swap_tag
call mpi_irecv(rcvbuf(rcv_pt),nerv,&

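The "variable all-to-all with mostly empty exchanges" described above can be made concrete with a small standalone sketch (plain Fortran plus MPI, names invented): each process has a nonzero count for a single neighbour only, which is why the send/receive variants, which prune the zero-count partners, are usually preferable to a blanket MPI_ALLTOALLV.

program alltoallv_sketch
  use mpi
  implicit none
  integer :: me, np, ierr, nxt, prv
  integer, allocatable :: scnt(:), rcnt(:), sdsp(:), rdsp(:)
  integer :: sndbuf(1), rcvbuf(1)

  call mpi_init(ierr)
  call mpi_comm_rank(mpi_comm_world, me, ierr)
  call mpi_comm_size(mpi_comm_world, np, ierr)
  allocate(scnt(0:np-1), rcnt(0:np-1), sdsp(0:np-1), rdsp(0:np-1))

  ! Each process exchanges one integer with a single neighbour, so all but
  ! one entry of the count arrays is zero: the send/recv variants would
  ! instead post an mpi_irecv/mpi_send pair only for the nonzero partners.
  nxt = mod(me+1, np)
  prv = mod(me-1+np, np)
  scnt = 0 ; rcnt = 0 ; sdsp = 0 ; rdsp = 0
  scnt(nxt) = 1
  rcnt(prv) = 1
  sndbuf(1) = me

  call mpi_alltoallv(sndbuf, scnt, sdsp, mpi_integer, &
       &             rcvbuf, rcnt, rdsp, mpi_integer, &
       &             mpi_comm_world, ierr)
  write(*,*) 'rank', me, 'received', rcvbuf(1)
  call mpi_finalize(ierr)
end program alltoallv_sketch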
@ -33,7 +33,7 @@
! File: psi_mswaptran.F90
!
! Subroutine: psi_mswaptranm
! Does the data exchange among processes. This is similar to Xswapdata, but
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -53,6 +53,7 @@
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -73,10 +74,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - integer Choose overwrite or sum.
! y(:,:) - integer The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - integer Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -241,7 +242,7 @@ subroutine psi_mtranidxm(iictxt,iicomm,flag,n,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = n*nerv
@ -359,7 +360,7 @@ subroutine psi_mtranidxm(iictxt,iicomm,flag,n,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
p2ptag = psb_int4_swap_tag
call mpi_irecv(sndbuf(snd_pt),n*nesd,&
@ -516,7 +517,7 @@ end subroutine psi_mtranidxm
!
!
! Subroutine: psi_mswaptranv
! Does the data exchange among processes. This is similar to Xswapdata, but
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -536,6 +537,7 @@ end subroutine psi_mtranidxm
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -556,10 +558,10 @@ end subroutine psi_mtranidxm
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:) - X The data area
! beta - integer Choose overwrite or sum.
! y(:) - integer The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - integer Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -734,7 +736,7 @@ subroutine psi_mtranidxv(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = nerv
@ -852,7 +854,7 @@ subroutine psi_mtranidxv(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
p2ptag = psb_int4_swap_tag
call mpi_irecv(sndbuf(snd_pt),nesd,&

@ -29,7 +29,12 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_sovrl_restr
! These subroutines restore the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
!
subroutine psi_sovrl_restr_vect(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_sovrl_restr_vect
use psb_s_base_vect_mod

@ -29,6 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_sovrl_restr
! These subroutines restore the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
subroutine psi_sovrl_restrr1(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_sovrl_restrr1

@ -29,6 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_sovrl_save
! These subroutines save the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
subroutine psi_sovrl_save_vect(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_sovrl_save_vect
use psb_realloc_mod

@ -29,7 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! Subroutine: psi_sovrl_save
! These subroutines save the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
subroutine psi_sovrl_saver1(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_sovrl_saver1

@ -29,7 +29,13 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_sovrl_update
! These subroutines update the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap,
! or for the application of Additive Schwarz preconditioners.
!
!
!
subroutine psi_sovrl_upd_vect(x,desc_a,update,info)
use psi_mod, psi_protect_name => psi_sovrl_upd_vect
use psb_realloc_mod

@ -29,6 +29,12 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_sovrl_update
! These subroutines update the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap,
! or for the application of Additive Schwarz preconditioners.
!
!
subroutine psi_sovrl_updr1(x,desc_a,update,info)
use psi_mod, psi_protect_name => psi_sovrl_updr1

@ -32,8 +32,10 @@
!
! File: psi_sswapdata.F90
!
! Subroutine: psi_sswapdatam
! Does the data exchange among processes. Essentially this is doing
!
!
! Subroutine: psi_sswapdata_vect
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
@ -43,12 +45,15 @@
! C complex(psb_spk_)
! Z complex(psb_dpk_)
! Basically the operation is as follows: on each process, we identify
! sections SND(Y) and RCV(Y); then we do a send on (PACK(SND(Y)));
! then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y
! but only on the elements involved in the UNPACK operation.
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
! then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y
! but only on the elements involved in the SCT operation.
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
! This version works on encapsulated vectors, and uses their methods to do GTH and SCT,
! so that special versions (e.g. GPU vectors) can override them.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -69,10 +74,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - real Choose overwrite or sum.
! y - type(psb_s_vect_type) The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - real Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -84,14 +89,6 @@
!
!
!
!
! Subroutine: psi_sswapdata_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector.
!
!
!
subroutine psi_sswapdata_vect(flag,beta,y,desc_a,work,info,data)
use psi_mod, psb_protect_name => psi_sswapdata_vect
@ -263,7 +260,7 @@ subroutine psi_sswap_vidx_vect(iictxt,iicomm,flag,beta,y,idx, &
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
rcv_pt = 1+pnti+psb_n_elem_recv_
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
p2ptag = psb_real_swap_tag
@ -426,7 +423,7 @@ end subroutine psi_sswap_vidx_vect
! Subroutine: psi_sswapdata_multivect
! Data exchange among processes.
!
! Takes care of Y an encaspulated vector.
! Takes care of Y an encapsulated multivector.
!
!
subroutine psi_sswapdata_multivect(flag,beta,y,desc_a,work,info,data)
@ -602,7 +599,7 @@ subroutine psi_sswap_vidx_multivect(iictxt,iicomm,flag,beta,y,idx, &
proc_to_comm = idx%v(pnti+psb_proc_id_)
nerv = idx%v(pnti+psb_n_elem_recv_)
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
p2ptag = psb_real_swap_tag

@ -33,9 +33,9 @@
! File: psi_sswapdata.F90
!
! Subroutine: psi_sswapdatam
! Does the data exchange among processes. Essentially this is doing
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in out
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
! In all these subroutines X may be: I Integer
! S real(psb_spk_)
@ -49,6 +49,7 @@
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -69,10 +70,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - real Choose overwrite or sum.
! y(:,:) - real The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - real Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -236,7 +237,7 @@ subroutine psi_sswapidxm(iictxt,iicomm,flag,n,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = n*nerv
@ -349,7 +350,7 @@ subroutine psi_sswapidxm(iictxt,iicomm,flag,n,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
p2ptag = psb_real_swap_tag
call mpi_irecv(rcvbuf(rcv_pt),n*nerv,&
@ -507,7 +508,7 @@ end subroutine psi_sswapidxm
!
!
! Subroutine: psi_sswapdatav
! Does the data exchange among processes. Essentially this is doing
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
@ -523,6 +524,7 @@ end subroutine psi_sswapidxm
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -543,10 +545,10 @@ end subroutine psi_sswapidxm
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:) - X The data area
! beta - real Choose overwrite or sum.
! y(:) - real The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - real Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -721,7 +723,7 @@ subroutine psi_sswapidxv(iictxt,iicomm,flag,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = nerv
@ -835,7 +837,7 @@ subroutine psi_sswapidxv(iictxt,iicomm,flag,beta,y,idx, &
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
p2ptag = psb_real_swap_tag
call mpi_irecv(rcvbuf(rcv_pt),nerv,&

@ -32,8 +32,8 @@
!
! File: psi_sswaptran.F90
!
! Subroutine: psi_sswaptranm
! Does the data exchange among processes. This is similar to Xswapdata, but
! Subroutine: psi_sswaptran_vect
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -47,12 +47,16 @@
! C complex(psb_spk_)
! Z complex(psb_dpk_)
! Basically the operation is as follows: on each process, we identify
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(SND(Y)));
! then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y
! but only on the elements involved in the UNPACK operation.
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
! then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y
! but only on the elements involved in the SCT operation.
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
! This version works on encapsulated vectors, and uses their methods to do GTH and SCT,
! so that special versions (e.g. GPU vectors) can override them.
!
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -73,10 +77,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! beta - real Choose overwrite or sum.
! y - type(psb_s_vect_type) The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - real Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -87,13 +91,6 @@
! psb_comm_mov_ use ovr_mst_idx
!
!
!
! Subroutine: psi_sswaptran_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector.
!
!
subroutine psi_sswaptran_vect(flag,beta,y,desc_a,work,info,data)
use psi_mod, psb_protect_name => psi_sswaptran_vect
@ -171,7 +168,7 @@ end subroutine psi_sswaptran_vect
! Subroutine: psi_stran_vidx_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector. Relies on the gather/scatter methods
! Takes care of Y an encapsulated vector. Relies on the gather/scatter methods
! of vectors.
!
! The real workhorse: the outer routine will only choose the index list
@ -269,7 +266,7 @@ subroutine psi_stran_vidx_vect(iictxt,iicomm,flag,beta,y,idx,&
snd_pt = 1+pnti+nerv+psb_n_elem_send_
rcv_pt = 1+pnti+psb_n_elem_recv_
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
call mpi_irecv(y%combuf(snd_pt),nesd,&
@ -436,10 +433,10 @@ end subroutine psi_stran_vidx_vect
!
!
!
! Subroutine: psi_sswaptran_vect
! Subroutine: psi_sswaptran_multivect
! Data exchange among processes.
!
! Takes care of Y an encaspulated vector.
! Takes care of Y an encapsulated multivector.
!
!
subroutine psi_sswaptran_multivect(flag,beta,y,desc_a,work,info,data)
@ -616,7 +613,7 @@ subroutine psi_stran_vidx_multivect(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx%v(pnti+psb_proc_id_)
nerv = idx%v(pnti+psb_n_elem_recv_)
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),snd_pt
call mpi_irecv(y%combuf(snd_pt),n*nesd,&

@ -33,7 +33,7 @@
! File: psi_sswaptran.F90
!
! Subroutine: psi_sswaptranm
! Does the data exchange among processes. This is similar to Xswapdata, but
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -53,6 +53,7 @@
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -73,10 +74,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - real Choose overwrite or sum.
! y(:,:) - real The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - real Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -241,7 +242,7 @@ subroutine psi_stranidxm(iictxt,iicomm,flag,n,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = n*nerv
@ -359,7 +360,7 @@ subroutine psi_stranidxm(iictxt,iicomm,flag,n,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
p2ptag = psb_real_swap_tag
call mpi_irecv(sndbuf(snd_pt),n*nesd,&
@ -516,7 +517,7 @@ end subroutine psi_stranidxm
!
!
! Subroutine: psi_sswaptranv
! Does the data exchange among processes. This is similar to Xswapdata, but
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -536,6 +537,7 @@ end subroutine psi_stranidxm
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -556,10 +558,10 @@ end subroutine psi_stranidxm
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:) - X The data area
! beta - real Choose overwrite or sum.
! y(:) - real The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - real Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -734,7 +736,7 @@ subroutine psi_stranidxv(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = nerv
@ -852,7 +854,7 @@ subroutine psi_stranidxv(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
p2ptag = psb_real_swap_tag
call mpi_irecv(sndbuf(snd_pt),nesd,&

@ -29,7 +29,12 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_zovrl_restr
! These subroutines restore the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
!
subroutine psi_zovrl_restr_vect(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_zovrl_restr_vect
use psb_z_base_vect_mod

@ -29,6 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_zovrl_restr
! These subroutines restore the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
subroutine psi_zovrl_restrr1(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_zovrl_restrr1

@ -29,6 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_zovrl_save
! These subroutines save the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
!
subroutine psi_zovrl_save_vect(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_zovrl_save_vect
use psb_realloc_mod

@ -29,7 +29,11 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! Subroutine: psi_zovrl_save
! These subroutines save the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap.
!
subroutine psi_zovrl_saver1(x,xs,desc_a,info)
use psi_mod, psi_protect_name => psi_zovrl_saver1

@ -29,7 +29,13 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_zovrl_update
! These subroutines update the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap,
! or for the application of Additive Schwarz preconditioners.
!
!
!
subroutine psi_zovrl_upd_vect(x,desc_a,update,info)
use psi_mod, psi_protect_name => psi_zovrl_upd_vect
use psb_realloc_mod

@ -29,6 +29,12 @@
! POSSIBILITY OF SUCH DAMAGE.
!
!
! Subroutine: psi_zovrl_update
! These subroutines update the overlap region of a vector; they are used
! for the transpose matrix-vector product when there is a nonempty overlap,
! or for the application of Additive Schwarz preconditioners.
!
!
subroutine psi_zovrl_updr1(x,desc_a,update,info)
use psi_mod, psi_protect_name => psi_zovrl_updr1

@ -32,8 +32,10 @@
!
! File: psi_zswapdata.F90
!
! Subroutine: psi_zswapdatam
! Does the data exchange among processes. Essentially this is doing
!
!
! Subroutine: psi_zswapdata_vect
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
@ -43,12 +45,15 @@
! C complex(psb_spk_)
! Z complex(psb_dpk_)
! Basically the operation is as follows: on each process, we identify
! sections SND(Y) and RCV(Y); then we do a send on (PACK(SND(Y)));
! then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y
! but only on the elements involved in the UNPACK operation.
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
! then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y
! but only on the elements involved in the SCT operation.
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
! This version works on encapsulated vectors, and uses their methods to do GTH and SCT,
! so that special versions (e.g. GPU vectors) can override them.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -69,10 +74,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - complex Choose overwrite or sum.
! y - type(psb_z_vect_type) The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - complex Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -84,14 +89,6 @@
!
!
!
!
! Subroutine: psi_zswapdata_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector.
!
!
!
subroutine psi_zswapdata_vect(flag,beta,y,desc_a,work,info,data)
use psi_mod, psb_protect_name => psi_zswapdata_vect
@ -263,7 +260,7 @@ subroutine psi_zswap_vidx_vect(iictxt,iicomm,flag,beta,y,idx, &
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
rcv_pt = 1+pnti+psb_n_elem_recv_
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
p2ptag = psb_dcomplex_swap_tag
@ -426,7 +423,7 @@ end subroutine psi_zswap_vidx_vect
! Subroutine: psi_zswapdata_multivect
! Data exchange among processes.
!
! Takes care of Y an encaspulated vector.
! Takes care of Y an encapsulated multivector.
!
!
subroutine psi_zswapdata_multivect(flag,beta,y,desc_a,work,info,data)
@ -602,7 +599,7 @@ subroutine psi_zswap_vidx_multivect(iictxt,iicomm,flag,beta,y,idx, &
proc_to_comm = idx%v(pnti+psb_proc_id_)
nerv = idx%v(pnti+psb_n_elem_recv_)
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
p2ptag = psb_dcomplex_swap_tag

@ -33,9 +33,9 @@
! File: psi_zswapdata.F90
!
! Subroutine: psi_zswapdatam
! Does the data exchange among processes. Essentially this is doing
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in out
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
! In all these subroutines X may be: I Integer
! S real(psb_spk_)
@ -49,6 +49,7 @@
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -69,10 +70,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - complex Choose overwrite or sum.
! y(:,:) - complex The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - complex Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -236,7 +237,7 @@ subroutine psi_zswapidxm(iictxt,iicomm,flag,n,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = n*nerv
@ -349,7 +350,7 @@ subroutine psi_zswapidxm(iictxt,iicomm,flag,n,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
p2ptag = psb_dcomplex_swap_tag
call mpi_irecv(rcvbuf(rcv_pt),n*nerv,&
@ -507,7 +508,7 @@ end subroutine psi_zswapidxm
!
!
! Subroutine: psi_zswapdatav
! Does the data exchange among processes. Essentially this is doing
! Implements the data exchange among processes. Essentially this is doing
! a variable all-to-all data exchange (ALLTOALLV in MPI parlance), but
! it is capable of pruning empty exchanges, which are very likely in our
! application environment. All the variants have the same structure
@ -523,6 +524,7 @@ end subroutine psi_zswapidxm
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -543,10 +545,10 @@ end subroutine psi_zswapidxm
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:) - X The data area
! beta - complex Choose overwrite or sum.
! y(:) - complex The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - complex Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -721,7 +723,7 @@ subroutine psi_zswapidxv(iictxt,iicomm,flag,beta,y,idx, &
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = nerv
@ -835,7 +837,7 @@ subroutine psi_zswapidxv(iictxt,iicomm,flag,beta,y,idx, &
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nerv>0).and.(proc_to_comm /= me)) then
p2ptag = psb_dcomplex_swap_tag
call mpi_irecv(rcvbuf(rcv_pt),nerv,&

@ -32,8 +32,8 @@
!
! File: psi_zswaptran.F90
!
! Subroutine: psi_zswaptranm
! Does the data exchange among processes. This is similar to Xswapdata, but
! Subroutine: psi_zswaptran_vect
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -47,12 +47,16 @@
! C complex(psb_spk_)
! Z complex(psb_dpk_)
! Basically the operation is as follows: on each process, we identify
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(SND(Y)));
! then we receive, and we do an update with Y = UNPACK(RCV(Y)) + BETA * Y
! but only on the elements involved in the UNPACK operation.
! sections SND(Y) and RCV(Y); then we do a SEND(PACK(GTH(SND(Y))));
! then we receive, and we do an update with Y = SCT(RCV(Y)) + BETA * Y
! but only on the elements involved in the SCT operation.
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
! This version works on encapsulated vectors, and uses their methods to do GTH and SCT,
! so that special versions (e.g. GPU vectors) can override them.
!
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -73,10 +77,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! beta - complex Choose overwrite or sum.
! y - type(psb_z_vect_type) The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - complex Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -87,13 +91,6 @@
! psb_comm_mov_ use ovr_mst_idx
!
!
!
! Subroutine: psi_zswaptran_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector.
!
!
subroutine psi_zswaptran_vect(flag,beta,y,desc_a,work,info,data)
use psi_mod, psb_protect_name => psi_zswaptran_vect
@ -171,7 +168,7 @@ end subroutine psi_zswaptran_vect
! Subroutine: psi_ztran_vidx_vect
! Data exchange among processes.
!
! Takes care of Y an exanspulated vector. Relies on the gather/scatter methods
! Takes care of Y an encapsulated vector. Relies on the gather/scatter methods
! of vectors.
!
! The real workhorse: the outer routine will only choose the index list
@ -269,7 +266,7 @@ subroutine psi_ztran_vidx_vect(iictxt,iicomm,flag,beta,y,idx,&
snd_pt = 1+pnti+nerv+psb_n_elem_send_
rcv_pt = 1+pnti+psb_n_elem_recv_
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),rcv_pt
call mpi_irecv(y%combuf(snd_pt),nesd,&
@ -436,10 +433,10 @@ end subroutine psi_ztran_vidx_vect
!
!
!
! Subroutine: psi_zswaptran_vect
! Subroutine: psi_zswaptran_multivect
! Data exchange among processes.
!
! Takes care of Y an encaspulated vector.
! Takes care of Y an encapsulated multivector.
!
!
subroutine psi_zswaptran_multivect(flag,beta,y,desc_a,work,info,data)
@ -616,7 +613,7 @@ subroutine psi_ztran_vidx_multivect(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx%v(pnti+psb_proc_id_)
nerv = idx%v(pnti+psb_n_elem_recv_)
nesd = idx%v(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
if (debug) write(*,*) me,'Posting receive from',prcid(i),snd_pt
call mpi_irecv(y%combuf(snd_pt),n*nesd,&

@ -33,7 +33,7 @@
! File: psi_zswaptran.F90
!
! Subroutine: psi_zswaptranm
! Does the data exchange among processes. This is similar to Xswapdata, but
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -53,6 +53,7 @@
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -73,10 +74,10 @@
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:,:) - X The data area
! beta - complex Choose overwrite or sum.
! y(:,:) - complex The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - complex Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -241,7 +242,7 @@ subroutine psi_ztranidxm(iictxt,iicomm,flag,n,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = n*nerv
@ -359,7 +360,7 @@ subroutine psi_ztranidxm(iictxt,iicomm,flag,n,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
p2ptag = psb_dcomplex_swap_tag
call mpi_irecv(sndbuf(snd_pt),n*nesd,&
@ -516,7 +517,7 @@ end subroutine psi_ztranidxm
!
!
! Subroutine: psi_zswaptranv
! Does the data exchange among processes. This is similar to Xswapdata, but
! Implements the data exchange among processes. This is similar to Xswapdata, but
! the list is read "in reverse", i.e. indices that are normally SENT are used
! for the RECEIVE part and vice-versa. This is the basic data exchange operation
! for doing the product of a sparse matrix by a vector.
@ -536,6 +537,7 @@ end subroutine psi_ztranidxm
! Thus: for halo data exchange, the receive section is confined in the
! halo indices, and BETA=0, whereas for overlap exchange the receive section
! is scattered in the owned indices, and BETA=1.
! The first routine picks the desired exchange index list and passes it to the second.
!
! Arguments:
! flag - integer Choose the algorithm for data exchange:
@ -556,10 +558,10 @@ end subroutine psi_ztranidxm
!
!
! n - integer Number of columns in Y
! beta - X Choose overwrite or sum.
! y(:) - X The data area
! beta - complex Choose overwrite or sum.
! y(:) - complex The data area
! desc_a - type(psb_desc_type). The communication descriptor.
! work(:) - X Buffer space. If not sufficient, will do
! work(:) - complex Buffer space. If not sufficient, will do
! our own internal allocation.
! info - integer. return code.
! data - integer which list is to be used to exchange data
@ -734,7 +736,7 @@ subroutine psi_ztranidxv(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(proc_to_comm),ictxt,proc_to_comm)
prcid(proc_to_comm) = psb_get_mpi_rank(ictxt,proc_to_comm)
brvidx(proc_to_comm) = rcv_pt
rvsz(proc_to_comm) = nerv
@ -852,7 +854,7 @@ subroutine psi_ztranidxv(iictxt,iicomm,flag,beta,y,idx,&
proc_to_comm = idx(pnti+psb_proc_id_)
nerv = idx(pnti+psb_n_elem_recv_)
nesd = idx(pnti+nerv+psb_n_elem_send_)
call psb_get_rank(prcid(i),ictxt,proc_to_comm)
prcid(i) = psb_get_mpi_rank(ictxt,proc_to_comm)
if ((nesd>0).and.(proc_to_comm /= me)) then
p2ptag = psb_dcomplex_swap_tag
call mpi_irecv(sndbuf(snd_pt),nesd,&

@ -31,14 +31,14 @@
!
! File: psb_cgather.f90
!
! Subroutine: psb_cgatherm
! This subroutine gathers pieces of a distributed dense matrix into a local one.
! Subroutine: psb_cgather_vect
! This subroutine gathers pieces of a distributed vector into a local one.
!
! Arguments:
! globx - complex,dimension(:,:). The local matrix into which gather
! globx - complex,dimension(:). The local array into which to gather
! the distributed pieces.
! locx - complex,dimension(:,:). The local piece of the distributed
! matrix to be gathered.
! locx - type(psb_c_vect_type) The local piece of the distributed
! vector to be gathered.
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Error code.
! iroot - integer. The process that has to own the
@ -159,6 +159,8 @@ subroutine psb_cgather_vect(globx, locx, desc_a, info, iroot)
end subroutine psb_cgather_vect
! Subroutine: psb_cgather_multivect
! This subroutine gathers pieces of a distributed multivector into a local one.
subroutine psb_cgather_multivect(globx, locx, desc_a, info, iroot)
use psb_base_mod, psb_protect_name => psb_cgather_multivect

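A hypothetical calling fragment for the gather documented above (x_glob, x_loc and the dispatch of the generic psb_gather to psb_cgather_vect are assumptions, not taken from this patch):

  ! x_loc is a type(psb_c_vect_type) assembled on desc_a; x_glob is a
  ! complex(psb_spk_) array large enough to hold the global vector.
  ! The last (optional) argument selects the gathering process; the
  ! literal 0 assumes the default 4-byte psb_ipk_.
  call psb_gather(x_glob, x_loc, desc_a, info, 0)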
@ -31,7 +31,7 @@
!
! File: psb_cgather.f90
!
! Subroutine: psb_cgatherm
! Subroutine: psb_cgather
! This subroutine gathers pieces of a distributed dense matrix into a local one.
!
! Arguments:

@ -31,17 +31,17 @@
!
! File: psb_chalo.f90
!
! Subroutine: psb_chalom
! Subroutine: psb_chalo_vect
! This subroutine performs the exchange of the halo elements in a
! distributed dense matrix between all the processes.
! distributed vector between all the processes.
!
! Arguments:
! x - complex,dimension(:,:). The local part of the dense matrix.
! x - type(psb_c_vect_type) The local part of the vector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! jx - integer(optional). The starting column of the global matrix.
! ik - integer(optional). The number of columns to gather.
! work - complex(optional). Work area.
! work - complex(optional). Work area.
! tran - character(optional). Transpose exchange.
! mode - integer(optional). Communication mode (see Swapdata)
! data - integer Which index list in desc_a should be used
@ -52,7 +52,6 @@
! psb_comm_mov_ use ovr_mst_idx
!
!
subroutine psb_chalo_vect(x,desc_a,info,work,tran,mode,data)
use psb_base_mod, psb_protect_name => psb_chalo_vect
use psi_mod
@ -185,7 +184,28 @@ subroutine psb_chalo_vect(x,desc_a,info,work,tran,mode,data)
return
end subroutine psb_chalo_vect
!
! Subroutine: psb_chalo_multivect
! This subroutine performs the exchange of the halo elements in a
! distributed multivector between all the processes.
!
! Arguments:
! x - type(psb_c_multivect_type) The local part of the multivector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! jx - integer(optional). The starting column of the global matrix.
! ik - integer(optional). The number of columns to gather.
! work - complex(optional). Work area.
! tran - character(optional). Transpose exchange.
! mode - integer(optional). Communication mode (see Swapdata)
! data - integer Which index list in desc_a should be used
! to retrieve rows, default psb_comm_halo_
! psb_comm_halo_ use halo_index
! psb_comm_ext_ use ext_index
! psb_comm_ovrl_ use ovrl_index
! psb_comm_mov_ use ovr_mst_idx
!
!
subroutine psb_chalo_multivect(x,desc_a,info,work,tran,mode,data)
use psb_base_mod, psb_protect_name => psb_chalo_multivect
use psi_mod

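The optional arguments listed above combine at the generic level. A hypothetical fragment (x assembled on desc_a; the 'N'/'T' convention for tran and the dispatch of psb_halo to psb_chalo_vect are assumptions):

  call psb_halo(x, desc_a, info)                      ! forward halo exchange
  call psb_halo(x, desc_a, info, tran='T')            ! transpose exchange (swaptran path)
  call psb_halo(x, desc_a, info, data=psb_comm_ext_)  ! retrieve rows through the ext_index list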
@ -32,12 +32,12 @@
!
! File: psb_covrl.f90
!
! Subroutine: psb_covrlm
! Subroutine: psb_covrl_vect
! This subroutine performs the exchange of the overlap elements in a
! distributed dense matrix between all the processes.
! distributed dense vector between all the processes.
!
! Arguments:
! x(:,:) - complex The local part of the dense matrix.
! x - type(psb_c_vect_type) The local part of the vector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code.
! jx - integer(optional). The starting column of the global matrix
@ -180,7 +180,38 @@ subroutine psb_covrl_vect(x,desc_a,info,work,update,mode)
return
end subroutine psb_covrl_vect
!
! Subroutine: psb_covrl_multivect
! This subroutine performs the exchange of the overlap elements in a
! distributed multivector between all the processes.
!
! Arguments:
! x - type(psb_c_multivect_type) The local part of the multivector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code.
! jx - integer(optional). The starting column of the global matrix
! ik - integer(optional). The number of columns to gather.
! work - complex(optional). A work area.
! update - integer(optional). Type of update:
! psb_none_ do nothing
! psb_sum_ sum of overlaps
! psb_avg_ average of overlaps
! mode - integer(optional). Choose the algorithm for data exchange:
! this is chosen through bit fields.
! - swap_mpi = iand(flag,psb_swap_mpi_) /= 0
! - swap_sync = iand(flag,psb_swap_sync_) /= 0
! - swap_send = iand(flag,psb_swap_send_) /= 0
! - swap_recv = iand(flag,psb_swap_recv_) /= 0
! - if (swap_mpi): use underlying MPI_ALLTOALLV.
! - if (swap_sync): use PSB_SND and PSB_RCV in
! synchronized pairs
! - if (swap_send .and. swap_recv): use mpi_irecv
! and mpi_send
! - if (swap_send): use psb_snd (but need another
! call with swap_recv to complete)
! - if (swap_recv): use psb_rcv (completing a
! previous call with swap_send)
!
subroutine psb_covrl_multivect(x,desc_a,info,work,update,mode)
use psb_base_mod, psb_protect_name => psb_covrl_multivect
use psi_mod

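A sketch of the overlap exchange with the update options listed above, assuming the generic psb_ovrl interface from psb_base_mod; psb_avg_ and psb_sum_ are the update constants named in the header:

      subroutine example_ovrl(x, desc_a)
        use psb_base_mod
        implicit none
        type(psb_c_vect_type), intent(inout) :: x
        type(psb_desc_type),   intent(in)    :: desc_a
        integer(psb_ipk_)                    :: info

        ! Average the entries replicated in the overlap so all owners agree.
        call psb_ovrl(x, desc_a, info, update=psb_avg_)
        ! Or accumulate the contributions from all owning processes.
        if (info == psb_success_) call psb_ovrl(x, desc_a, info, update=psb_sum_)
      end subroutine example_ovrl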
@ -31,13 +31,13 @@
!
! File: psb_cscatter.f90
!
! Subroutine: psb_cscatterm
! This subroutine scatters a global matrix locally owned by one process
! into pieces that are local to alle the processes.
! Subroutine: psb_cscatter_vect
! This subroutine scatters a global vector locally owned by one process
! into pieces that are local to all the processes.
!
! Arguments:
! globx - complex,dimension(:,:). The global matrix to scatter.
! locx - complex,dimension(:,:). The local piece of the distributed matrix.
! globx - complex,dimension(:) The global vector to scatter.
! locx - type(psb_c_vect_type) The local piece of the distributed vector.
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Error code.
! iroot - integer(optional). The process that owns the global matrix.

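A usage sketch of the vector scatter documented above, assuming the generic psb_scatter interface from psb_base_mod; the default-root behaviour is an assumption carried over from the psb_cscatterv code further below, where the root defaults to psb_root_:

      subroutine example_scatter(glob_x, x, desc_a)
        use psb_base_mod
        implicit none
        complex(psb_spk_), intent(in)        :: glob_x(:) ! global vector, significant on root
        type(psb_c_vect_type), intent(inout) :: x         ! receives the local piece
        type(psb_desc_type),   intent(in)    :: desc_a
        integer(psb_ipk_)                    :: info

        ! Split glob_x according to the row distribution recorded in desc_a;
        ! with no root argument the pieces are taken from psb_root_ (process 0).
        call psb_scatter(glob_x, x, desc_a, info)
      end subroutine example_scatter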
@ -33,7 +33,7 @@
!
! Subroutine: psb_cscatterm
! This subroutine scatters a global matrix locally owned by one process
! into pieces that are local to alle the processes.
! into pieces that are local to all the processes.
!
! Arguments:
! globx - complex,dimension(:,:). The global matrix to scatter.
@ -107,8 +107,8 @@ subroutine psb_cscatterm(globx, locx, desc_a, info, root)
m = desc_a%get_global_rows()
n = desc_a%get_global_cols()
call psb_get_mpicomm(ictxt,icomm)
call psb_get_rank(myrank,ictxt,me)
icomm = psb_get_mpi_comm(ictxt)
myrank = psb_get_mpi_rank(ictxt,me)
if (iroot==-1) then
lda_globx = size(globx, 1)
@ -159,7 +159,7 @@ subroutine psb_cscatterm(globx, locx, desc_a, info, root)
end do
else
call psb_get_rank(rootrank,ictxt,iroot)
rootrank = psb_get_mpi_rank(ictxt,iroot)
!
! This is potentially unsafe when IPK=8
! But then, IPK=8 is highly experimental anyway.
@ -278,7 +278,7 @@ end subroutine psb_cscatterm
! Subroutine: psb_cscatterv
! This subroutine scatters a global vector locally owned by one process
! into pieces that are local to alle the processes.
! into pieces that are local to all the processes.
!
! Arguments:
! globx - complex,dimension(:). The global vector to scatter.
@ -347,8 +347,8 @@ subroutine psb_cscatterv(globx, locx, desc_a, info, root)
iroot = psb_root_
end if
call psb_get_mpicomm(ictxt,icomm)
call psb_get_rank(myrank,ictxt,iam)
icomm = psb_get_mpi_comm(ictxt)
myrank = psb_get_mpi_rank(ictxt,iam)
iglobx = 1
jglobx = 1
@ -394,7 +394,7 @@ subroutine psb_cscatterv(globx, locx, desc_a, info, root)
locx(i)=globx(ltg(i))
end do
else
call psb_get_rank(rootrank,ictxt,iroot)
rootrank = psb_get_mpi_rank(ictxt,iroot)
!
! This is potentially unsafe when IPK=8
! But then, IPK=8 is highly experimental anyway.

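The hunks above replace the old subroutine-style queries psb_get_mpicomm/psb_get_rank with function forms. A sketch of the new style, assuming psb_info is available as usual to obtain the local process index:

      subroutine example_mpi_query(ictxt)
        use psb_base_mod
        implicit none
        integer(psb_ipk_), intent(in) :: ictxt     ! PSBLAS context
        integer(psb_ipk_) :: me, np
        integer           :: icomm, myrank

        call psb_info(ictxt, me, np)          ! my index and number of processes
        icomm  = psb_get_mpi_comm(ictxt)      ! underlying MPI communicator
        myrank = psb_get_mpi_rank(ictxt, me)  ! MPI rank of process index me
      end subroutine example_mpi_query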
@ -30,6 +30,17 @@
!
!
! File: psb_cspgather.f90
!
! Gathers a sparse matrix onto a single process.
! Two variants:
! 1. Gathers to PSB_c_SPARSE_MAT (i.e. to matrix with IPK_ indices)
! 2. Gathers to PSB_lc_SPARSE_MAT (i.e. to matrix with LPK_ indices)
!
! Note: this routine uses MPI_ALLGATHERV. At this time, the size of the
! resulting matrix must fit within the range of a 4-byte integer, because
! MPI displacements are restricted to 4-byte integers.
!
!
subroutine psb_csp_allgather(globa, loca, desc_a, info, root, dupl,keepnum,keeploc)
#if defined(HAVE_ISO_FORTRAN_ENV)
use iso_fortran_env

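A sketch of gathering a distributed sparse matrix onto every process, assuming psb_csp_allgather is reachable through the generic psb_gather interface; the container type psb_cspmat_type and the variable names are assumptions for illustration:

      subroutine example_spgather(a, desc_a, glob_a)
        use psb_base_mod
        implicit none
        type(psb_cspmat_type), intent(in)  :: a       ! local part of the sparse matrix
        type(psb_desc_type),   intent(in)  :: desc_a
        type(psb_cspmat_type), intent(out) :: glob_a  ! replicated copy of the whole matrix
        integer(psb_ipk_)                  :: info

        ! Every process ends up with the full matrix; the global size must
        ! stay within the range of a 4-byte integer (MPI displacements).
        call psb_gather(glob_a, a, desc_a, info)
      end subroutine example_spgather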
@ -31,14 +31,14 @@
!
! File: psb_dgather.f90
!
! Subroutine: psb_dgatherm
! This subroutine gathers pieces of a distributed dense matrix into a local one.
! Subroutine: psb_dgather_vect
! This subroutine gathers pieces of a distributed vector into a local one.
!
! Arguments:
! globx - real,dimension(:,:). The local matrix into which gather
! globx - real,dimension(:). The local array into which to gather
! the distributed pieces.
! locx - real,dimension(:,:). The local piece of the distributed
! matrix to be gathered.
! locx - type(psb_d_vect_type). The local piece of the distributed
! vector to be gathered.
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Error code.
! iroot - integer. The process that has to own the
@ -159,6 +159,8 @@ subroutine psb_dgather_vect(globx, locx, desc_a, info, iroot)
end subroutine psb_dgather_vect
! Subroutine: psb_dgather_multivect
! This subroutine gathers pieces of a distributed multivector into a local one.
subroutine psb_dgather_multivect(globx, locx, desc_a, info, iroot)
use psb_base_mod, psb_protect_name => psb_dgather_multivect

@ -31,7 +31,7 @@
!
! File: psb_dgather.f90
!
! Subroutine: psb_dgatherm
! Subroutine: psb_dgather
! This subroutine gathers pieces of a distributed dense matrix into a local one.
!
! Arguments:

@ -31,17 +31,17 @@
!
! File: psb_dhalo.f90
!
! Subroutine: psb_dhalom
! Subroutine: psb_dhalo_vect
! This subroutine performs the exchange of the halo elements in a
! distributed dense matrix between all the processes.
! distributed vector between all the processes.
!
! Arguments:
! x - real,dimension(:,:). The local part of the dense matrix.
! x - type(psb_d_vect_type) The local part of the vector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! jx - integer(optional). The starting column of the global matrix.
! ik - integer(optional). The number of columns to gather.
! work - real(optional). Work area.
! work - real(optional). Work area.
! tran - character(optional). Transpose exchange.
! mode - integer(optional). Communication mode (see Swapdata)
! data - integer Which index list in desc_a should be used
@ -52,7 +52,6 @@
! psb_comm_mov_ use ovr_mst_idx
!
!
subroutine psb_dhalo_vect(x,desc_a,info,work,tran,mode,data)
use psb_base_mod, psb_protect_name => psb_dhalo_vect
use psi_mod
@ -185,7 +184,28 @@ subroutine psb_dhalo_vect(x,desc_a,info,work,tran,mode,data)
return
end subroutine psb_dhalo_vect
!
! Subroutine: psb_dhalo_multivect
! This subroutine performs the exchange of the halo elements in a
! distributed multivector between all the processes.
!
! Arguments:
! x - type(psb_d_multivect_type) The local part of the multivector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! jx - integer(optional). The starting column of the global matrix.
! ik - integer(optional). The number of columns to gather.
! work - real(optional). Work area.
! tran - character(optional). Transpose exchange.
! mode - integer(optional). Communication mode (see Swapdata)
! data - integer Which index list in desc_a should be used
! to retrieve rows, default psb_comm_halo_
! psb_comm_halo_ use halo_index
! psb_comm_ext_ use ext_index
! psb_comm_ovrl_ use ovrl_index
! psb_comm_mov_ use ovr_mst_idx
!
!
subroutine psb_dhalo_multivect(x,desc_a,info,work,tran,mode,data)
use psb_base_mod, psb_protect_name => psb_dhalo_multivect
use psi_mod

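The data argument documented in these halo headers selects which index list of the descriptor drives the exchange. A sketch, assuming the generic psb_halo interface and the psb_comm_ext_ constant named above:

      subroutine example_halo_ext(x, desc_a)
        use psb_base_mod
        implicit none
        type(psb_d_vect_type), intent(inout) :: x
        type(psb_desc_type),   intent(in)    :: desc_a
        integer(psb_ipk_)                    :: info

        ! Exchange over ext_index instead of the default halo_index.
        call psb_halo(x, desc_a, info, data=psb_comm_ext_)
      end subroutine example_halo_ext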
@ -32,12 +32,12 @@
!
! File: psb_dovrl.f90
!
! Subroutine: psb_dovrlm
! Subroutine: psb_dovrl_vect
! This subroutine performs the exchange of the overlap elements in a
! distributed dense matrix between all the processes.
! distributed dense vector between all the processes.
!
! Arguments:
! x(:,:) - real The local part of the dense matrix.
! x - type(psb_d_vect_type) The local part of the vector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code.
! jx - integer(optional). The starting column of the global matrix
@ -180,7 +180,38 @@ subroutine psb_dovrl_vect(x,desc_a,info,work,update,mode)
return
end subroutine psb_dovrl_vect
!
! Subroutine: psb_dovrl_multivect
! This subroutine performs the exchange of the overlap elements in a
! distributed multivector between all the processes.
!
! Arguments:
! x - type(psb_d_multivect_type) The local part of the multivector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code.
! jx - integer(optional). The starting column of the global matrix
! ik - integer(optional). The number of columns to gather.
! work - real(optional). A work area.
! update - integer(optional). Type of update:
! psb_none_ do nothing
! psb_sum_ sum of overlaps
! psb_avg_ average of overlaps
! mode - integer(optional). Choose the algorithm for data exchange:
! this is chosen through bit fields.
! - swap_mpi = iand(flag,psb_swap_mpi_) /= 0
! - swap_sync = iand(flag,psb_swap_sync_) /= 0
! - swap_send = iand(flag,psb_swap_send_) /= 0
! - swap_recv = iand(flag,psb_swap_recv_) /= 0
! - if (swap_mpi): use underlying MPI_ALLTOALLV.
! - if (swap_sync): use PSB_SND and PSB_RCV in
! synchronized pairs
! - if (swap_send .and. swap_recv): use mpi_irecv
! and mpi_send
! - if (swap_send): use psb_snd (but need another
! call with swap_recv to complete)
! - if (swap_recv): use psb_rcv (completing a
! previous call with swap_send)
!
subroutine psb_dovrl_multivect(x,desc_a,info,work,update,mode)
use psb_base_mod, psb_protect_name => psb_dovrl_multivect
use psi_mod

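The mode argument described above is a bit field, so the psb_swap_* constants are combined with ior. A sketch, assuming the generic psb_ovrl interface; setting both the send and the receive bits selects the mpi_irecv/mpi_send algorithm:

      subroutine example_ovrl_mode(x, desc_a)
        use psb_base_mod
        implicit none
        type(psb_d_vect_type), intent(inout) :: x
        type(psb_desc_type),   intent(in)    :: desc_a
        integer(psb_ipk_)                    :: info, mode

        ! Set both the send and the receive bits of the exchange mode.
        mode = ior(psb_swap_send_, psb_swap_recv_)
        call psb_ovrl(x, desc_a, info, update=psb_avg_, mode=mode)
      end subroutine example_ovrl_mode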
@ -31,13 +31,13 @@
!
! File: psb_dscatter.f90
!
! Subroutine: psb_dscatterm
! This subroutine scatters a global matrix locally owned by one process
! into pieces that are local to alle the processes.
! Subroutine: psb_dscatter_vect
! This subroutine scatters a global vector locally owned by one process
! into pieces that are local to all the processes.
!
! Arguments:
! globx - real,dimension(:,:). The global matrix to scatter.
! locx - real,dimension(:,:). The local piece of the distributed matrix.
! globx - real,dimension(:) The global vector to scatter.
! locx - type(psb_d_vect_type) The local piece of the distributed vector.
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Error code.
! iroot - integer(optional). The process that owns the global matrix.

@ -33,7 +33,7 @@
!
! Subroutine: psb_dscatterm
! This subroutine scatters a global matrix locally owned by one process
! into pieces that are local to alle the processes.
! into pieces that are local to all the processes.
!
! Arguments:
! globx - real,dimension(:,:). The global matrix to scatter.
@ -107,8 +107,8 @@ subroutine psb_dscatterm(globx, locx, desc_a, info, root)
m = desc_a%get_global_rows()
n = desc_a%get_global_cols()
call psb_get_mpicomm(ictxt,icomm)
call psb_get_rank(myrank,ictxt,me)
icomm = psb_get_mpi_comm(ictxt)
myrank = psb_get_mpi_rank(ictxt,me)
if (iroot==-1) then
lda_globx = size(globx, 1)
@ -159,7 +159,7 @@ subroutine psb_dscatterm(globx, locx, desc_a, info, root)
end do
else
call psb_get_rank(rootrank,ictxt,iroot)
rootrank = psb_get_mpi_rank(ictxt,iroot)
!
! This is potentially unsafe when IPK=8
! But then, IPK=8 is highly experimental anyway.
@ -278,7 +278,7 @@ end subroutine psb_dscatterm
! Subroutine: psb_dscatterv
! This subroutine scatters a global vector locally owned by one process
! into pieces that are local to alle the processes.
! into pieces that are local to all the processes.
!
! Arguments:
! globx - real,dimension(:). The global vector to scatter.
@ -347,8 +347,8 @@ subroutine psb_dscatterv(globx, locx, desc_a, info, root)
iroot = psb_root_
end if
call psb_get_mpicomm(ictxt,icomm)
call psb_get_rank(myrank,ictxt,iam)
icomm = psb_get_mpi_comm(ictxt)
myrank = psb_get_mpi_rank(ictxt,iam)
iglobx = 1
jglobx = 1
@ -394,7 +394,7 @@ subroutine psb_dscatterv(globx, locx, desc_a, info, root)
locx(i)=globx(ltg(i))
end do
else
call psb_get_rank(rootrank,ictxt,iroot)
rootrank = psb_get_mpi_rank(ictxt,iroot)
!
! This is potentially unsafe when IPK=8
! But then, IPK=8 is highly experimental anyway.

@ -30,6 +30,17 @@
!
!
! File: psb_dspgather.f90
!
! Gathers a sparse matrix onto a single process.
! Two variants:
! 1. Gathers to PSB_d_SPARSE_MAT (i.e. to matrix with IPK_ indices)
! 2. Gathers to PSB_ld_SPARSE_MAT (i.e. to matrix with LPK_ indices)
!
! Note: this routine uses MPI_ALLGATHERV. At this time, the size of the
! resulting matrix must fit within the range of a 4-byte integer, because
! MPI displacements are restricted to 4-byte integers.
!
!
subroutine psb_dsp_allgather(globa, loca, desc_a, info, root, dupl,keepnum,keeploc)
#if defined(HAVE_ISO_FORTRAN_ENV)
use iso_fortran_env

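The second variant listed above gathers into the LPK_-indexed matrix type; a sketch, assuming the psb_ldspmat_type container exists and is covered by the generic psb_gather interface:

      subroutine example_lspgather(a, desc_a, glob_a)
        use psb_base_mod
        implicit none
        type(psb_dspmat_type),  intent(in)  :: a       ! local part, IPK_ indices
        type(psb_desc_type),    intent(in)  :: desc_a
        type(psb_ldspmat_type), intent(out) :: glob_a  ! gathered copy, LPK_ indices
        integer(psb_ipk_)                   :: info

        ! Useful when the assembled global matrix needs 8-byte indices even
        ! though each local piece fits comfortably in IPK_.
        call psb_gather(glob_a, a, desc_a, info)
      end subroutine example_lspgather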
@ -31,7 +31,7 @@
!
! File: psb_egather.f90
!
! Subroutine: psb_egatherm
! Subroutine: psb_egather
! This subroutine gathers pieces of a distributed dense matrix into a local one.
!
! Arguments:

@ -33,7 +33,7 @@
!
! Subroutine: psb_escatterm
! This subroutine scatters a global matrix locally owned by one process
! into pieces that are local to alle the processes.
! into pieces that are local to all the processes.
!
! Arguments:
! globx - integer,dimension(:,:). The global matrix to scatter.
@ -107,8 +107,8 @@ subroutine psb_escatterm(globx, locx, desc_a, info, root)
m = desc_a%get_global_rows()
n = desc_a%get_global_cols()
call psb_get_mpicomm(ictxt,icomm)
call psb_get_rank(myrank,ictxt,me)
icomm = psb_get_mpi_comm(ictxt)
myrank = psb_get_mpi_rank(ictxt,me)
if (iroot==-1) then
lda_globx = size(globx, 1)
@ -159,7 +159,7 @@ subroutine psb_escatterm(globx, locx, desc_a, info, root)
end do
else
call psb_get_rank(rootrank,ictxt,iroot)
rootrank = psb_get_mpi_rank(ictxt,iroot)
!
! This is potentially unsafe when IPK=8
! But then, IPK=8 is highly experimental anyway.
@ -278,7 +278,7 @@ end subroutine psb_escatterm
! Subroutine: psb_escatterv
! This subroutine scatters a global vector locally owned by one process
! into pieces that are local to alle the processes.
! into pieces that are local to all the processes.
!
! Arguments:
! globx - integer,dimension(:). The global vector to scatter.
@ -347,8 +347,8 @@ subroutine psb_escatterv(globx, locx, desc_a, info, root)
iroot = psb_root_
end if
call psb_get_mpicomm(ictxt,icomm)
call psb_get_rank(myrank,ictxt,iam)
icomm = psb_get_mpi_comm(ictxt)
myrank = psb_get_mpi_rank(ictxt,iam)
iglobx = 1
jglobx = 1
@ -394,7 +394,7 @@ subroutine psb_escatterv(globx, locx, desc_a, info, root)
locx(i)=globx(ltg(i))
end do
else
call psb_get_rank(rootrank,ictxt,iroot)
rootrank = psb_get_mpi_rank(ictxt,iroot)
!
! This is potentially unsafe when IPK=8
! But then, IPK=8 is highly experimental anyway.

@ -31,14 +31,14 @@
!
! File: psb_igather.f90
!
! Subroutine: psb_igatherm
! This subroutine gathers pieces of a distributed dense matrix into a local one.
! Subroutine: psb_igather_vect
! This subroutine gathers pieces of a distributed vector into a local one.
!
! Arguments:
! globx - integer,dimension(:,:). The local matrix into which gather
! globx - integer,dimension(:). The local array into which to gather
! the distributed pieces.
! locx - integer,dimension(:,:). The local piece of the distributed
! matrix to be gathered.
! locx - type(psb_i_vect_type). The local piece of the distributed
! vector to be gathered.
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Error code.
! iroot - integer. The process that has to own the
@ -159,6 +159,8 @@ subroutine psb_igather_vect(globx, locx, desc_a, info, iroot)
end subroutine psb_igather_vect
! Subroutine: psb_igather_multivect
! This subroutine gathers pieces of a distributed multivector into a local one.
subroutine psb_igather_multivect(globx, locx, desc_a, info, iroot)
use psb_base_mod, psb_protect_name => psb_igather_multivect

@ -31,17 +31,17 @@
!
! File: psb_ihalo.f90
!
! Subroutine: psb_ihalom
! Subroutine: psb_ihalo_vect
! This subroutine performs the exchange of the halo elements in a
! distributed dense matrix between all the processes.
! distributed vector between all the processes.
!
! Arguments:
! x - integer,dimension(:,:). The local part of the dense matrix.
! x - type(psb_i_vect_type) The local part of the vector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! jx - integer(optional). The starting column of the global matrix.
! ik - integer(optional). The number of columns to gather.
! work - integer(optional). Work area.
! work - integer(optional). Work area.
! tran - character(optional). Transpose exchange.
! mode - integer(optional). Communication mode (see Swapdata)
! data - integer Which index list in desc_a should be used
@ -52,7 +52,6 @@
! psb_comm_mov_ use ovr_mst_idx
!
!
subroutine psb_ihalo_vect(x,desc_a,info,work,tran,mode,data)
use psb_base_mod, psb_protect_name => psb_ihalo_vect
use psi_mod
@ -185,7 +184,28 @@ subroutine psb_ihalo_vect(x,desc_a,info,work,tran,mode,data)
return
end subroutine psb_ihalo_vect
!
! Subroutine: psb_ihalo_multivect
! This subroutine performs the exchange of the halo elements in a
! distributed multivector between all the processes.
!
! Arguments:
! x - type(psb_i_multivect_type) The local part of the multivector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! jx - integer(optional). The starting column of the global matrix.
! ik - integer(optional). The number of columns to gather.
! work - integer(optional). Work area.
! tran - character(optional). Transpose exchange.
! mode - integer(optional). Communication mode (see Swapdata)
! data - integer Which index list in desc_a should be used
! to retrieve rows, default psb_comm_halo_
! psb_comm_halo_ use halo_index
! psb_comm_ext_ use ext_index
! psb_comm_ovrl_ use ovrl_index
! psb_comm_mov_ use ovr_mst_idx
!
!
subroutine psb_ihalo_multivect(x,desc_a,info,work,tran,mode,data)
use psb_base_mod, psb_protect_name => psb_ihalo_multivect
use psi_mod

@ -32,12 +32,12 @@
!
! File: psb_iovrl.f90
!
! Subroutine: psb_iovrlm
! Subroutine: psb_iovrl_vect
! This subroutine performs the exchange of the overlap elements in a
! distributed dense matrix between all the processes.
! distributed dense vector between all the processes.
!
! Arguments:
! x(:,:) - integer The local part of the dense matrix.
! x - type(psb_i_vect_type) The local part of the vector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code.
! jx - integer(optional). The starting column of the global matrix
@ -180,7 +180,38 @@ subroutine psb_iovrl_vect(x,desc_a,info,work,update,mode)
return
end subroutine psb_iovrl_vect
!
! Subroutine: psb_iovrl_multivect
! This subroutine performs the exchange of the overlap elements in a
! distributed multivector between all the processes.
!
! Arguments:
! x - type(psb_i_multivect_type) The local part of the multivector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code.
! jx - integer(optional). The starting column of the global matrix
! ik - integer(optional). The number of columns to gather.
! work - integer(optional). A work area.
! update - integer(optional). Type of update:
! psb_none_ do nothing
! psb_sum_ sum of overlaps
! psb_avg_ average of overlaps
! mode - integer(optional). Choose the algorithm for data exchange:
! this is chosen through bit fields.
! - swap_mpi = iand(flag,psb_swap_mpi_) /= 0
! - swap_sync = iand(flag,psb_swap_sync_) /= 0
! - swap_send = iand(flag,psb_swap_send_) /= 0
! - swap_recv = iand(flag,psb_swap_recv_) /= 0
! - if (swap_mpi): use underlying MPI_ALLTOALLV.
! - if (swap_sync): use PSB_SND and PSB_RCV in
! synchronized pairs
! - if (swap_send .and. swap_recv): use mpi_irecv
! and mpi_send
! - if (swap_send): use psb_snd (but need another
! call with swap_recv to complete)
! - if (swap_recv): use psb_rcv (completing a
! previous call with swap_send)
!
subroutine psb_iovrl_multivect(x,desc_a,info,work,update,mode)
use psb_base_mod, psb_protect_name => psb_iovrl_multivect
use psi_mod

@ -31,13 +31,13 @@
!
! File: psb_iscatter.f90
!
! Subroutine: psb_iscatterm
! This subroutine scatters a global matrix locally owned by one process
! into pieces that are local to alle the processes.
! Subroutine: psb_iscatter_vect
! This subroutine scatters a global vector locally owned by one process
! into pieces that are local to all the processes.
!
! Arguments:
! globx - integer,dimension(:,:). The global matrix to scatter.
! locx - integer,dimension(:,:). The local piece of the distributed matrix.
! globx - integer,dimension(:) The global vector to scatter.
! locx - type(psb_i_vect_type) The local piece of the distributed vector.
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Error code.
! iroot - integer(optional). The process that owns the global matrix.

@ -30,6 +30,17 @@
!
!
! File: psb_ispgather.f90
!
! Gathers a sparse matrix onto a single process.
! Two variants:
! 1. Gathers to PSB_i_SPARSE_MAT (i.e. to matrix with IPK_ indices)
! 2. Gathers to PSB_@LX@_SPARSE_MAT (i.e. to matrix with LPK_ indices)
!
! Note: this routine uses MPI_ALLGATHERV. At this time, the size of the
! resulting matrix must fit within the range of a 4-byte integer, because
! MPI displacements are restricted to 4-byte integers.
!
!
subroutine psb_isp_allgather(globa, loca, desc_a, info, root, dupl,keepnum,keeploc)
#if defined(HAVE_ISO_FORTRAN_ENV)
use iso_fortran_env

@ -31,14 +31,14 @@
!
! File: psb_lgather.f90
!
! Subroutine: psb_lgatherm
! This subroutine gathers pieces of a distributed dense matrix into a local one.
! Subroutine: psb_lgather_vect
! This subroutine gathers pieces of a distributed vector into a local one.
!
! Arguments:
! globx - integer,dimension(:,:). The local matrix into which gather
! globx - integer,dimension(:). The local array into which to gather
! the distributed pieces.
! locx - integer,dimension(:,:). The local piece of the distributed
! matrix to be gathered.
! locx - type(psb_l_vect_type). The local piece of the distributed
! vector to be gathered.
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Error code.
! iroot - integer. The process that has to own the
@ -159,6 +159,8 @@ subroutine psb_lgather_vect(globx, locx, desc_a, info, iroot)
end subroutine psb_lgather_vect
! Subroutine: psb_lgather_multivect
! This subroutine gathers pieces of a distributed multivector into a local one.
subroutine psb_lgather_multivect(globx, locx, desc_a, info, iroot)
use psb_base_mod, psb_protect_name => psb_lgather_multivect

@ -31,17 +31,17 @@
!
! File: psb_lhalo.f90
!
! Subroutine: psb_lhalom
! Subroutine: psb_lhalo_vect
! This subroutine performs the exchange of the halo elements in a
! distributed dense matrix between all the processes.
! distributed vector between all the processes.
!
! Arguments:
! x - integer,dimension(:,:). The local part of the dense matrix.
! x - type(psb_l_vect_type) The local part of the vector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! jx - integer(optional). The starting column of the global matrix.
! ik - integer(optional). The number of columns to gather.
! work - integer(optional). Work area.
! work - integer(optional). Work area.
! tran - character(optional). Transpose exchange.
! mode - integer(optional). Communication mode (see Swapdata)
! data - integer Which index list in desc_a should be used
@ -52,7 +52,6 @@
! psb_comm_mov_ use ovr_mst_idx
!
!
subroutine psb_lhalo_vect(x,desc_a,info,work,tran,mode,data)
use psb_base_mod, psb_protect_name => psb_lhalo_vect
use psi_mod
@ -185,7 +184,28 @@ subroutine psb_lhalo_vect(x,desc_a,info,work,tran,mode,data)
return
end subroutine psb_lhalo_vect
!
! Subroutine: psb_lhalo_multivect
! This subroutine performs the exchange of the halo elements in a
! distributed multivector between all the processes.
!
! Arguments:
! x - type(psb_l_multivect_type) The local part of the multivector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! jx - integer(optional). The starting column of the global matrix.
! ik - integer(optional). The number of columns to gather.
! work - integer(optional). Work area.
! tran - character(optional). Transpose exchange.
! mode - integer(optional). Communication mode (see Swapdata)
! data - integer Which index list in desc_a should be used
! to retrieve rows, default psb_comm_halo_
! psb_comm_halo_ use halo_index
! psb_comm_ext_ use ext_index
! psb_comm_ovrl_ use ovrl_index
! psb_comm_mov_ use ovr_mst_idx
!
!
subroutine psb_lhalo_multivect(x,desc_a,info,work,tran,mode,data)
use psb_base_mod, psb_protect_name => psb_lhalo_multivect
use psi_mod

@ -32,12 +32,12 @@
!
! File: psb_lovrl.f90
!
! Subroutine: psb_lovrlm
! Subroutine: psb_lovrl_vect
! This subroutine performs the exchange of the overlap elements in a
! distributed dense matrix between all the processes.
! distributed dense vector between all the processes.
!
! Arguments:
! x(:,:) - integer The local part of the dense matrix.
! x - type(psb_l_vect_type) The local part of the vector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code.
! jx - integer(optional). The starting column of the global matrix
@ -180,7 +180,38 @@ subroutine psb_lovrl_vect(x,desc_a,info,work,update,mode)
return
end subroutine psb_lovrl_vect
!
! Subroutine: psb_lovrl_multivect
! This subroutine performs the exchange of the overlap elements in a
! distributed multivector between all the processes.
!
! Arguments:
! x - type(psb_l_multivect_type) The local part of the multivector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code.
! jx - integer(optional). The starting column of the global matrix
! ik - integer(optional). The number of columns to gather.
! work - integer(optional). A work area.
! update - integer(optional). Type of update:
! psb_none_ do nothing
! psb_sum_ sum of overlaps
! psb_avg_ average of overlaps
! mode - integer(optional). Choose the algorithm for data exchange:
! this is chosen through bit fields.
! - swap_mpi = iand(flag,psb_swap_mpi_) /= 0
! - swap_sync = iand(flag,psb_swap_sync_) /= 0
! - swap_send = iand(flag,psb_swap_send_) /= 0
! - swap_recv = iand(flag,psb_swap_recv_) /= 0
! - if (swap_mpi): use underlying MPI_ALLTOALLV.
! - if (swap_sync): use PSB_SND and PSB_RCV in
! synchronized pairs
! - if (swap_send .and. swap_recv): use mpi_irecv
! and mpi_send
! - if (swap_send): use psb_snd (but need another
! call with swap_recv to complete)
! - if (swap_recv): use psb_rcv (completing a
! previous call with swap_send)
!
subroutine psb_lovrl_multivect(x,desc_a,info,work,update,mode)
use psb_base_mod, psb_protect_name => psb_lovrl_multivect
use psi_mod

@ -31,13 +31,13 @@
!
! File: psb_lscatter.f90
!
! Subroutine: psb_lscatterm
! This subroutine scatters a global matrix locally owned by one process
! into pieces that are local to alle the processes.
! Subroutine: psb_lscatter_vect
! This subroutine scatters a global vector locally owned by one process
! into pieces that are local to all the processes.
!
! Arguments:
! globx - integer,dimension(:,:). The global matrix to scatter.
! locx - integer,dimension(:,:). The local piece of the distributed matrix.
! globx - integer,dimension(:) The global vector to scatter.
! locx - type(psb_l_vect_type) The local piece of the distributed vector.
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Error code.
! iroot - integer(optional). The process that owns the global matrix.

@ -30,6 +30,17 @@
!
!
! File: psb_lspgather.f90
!
! Gathers a sparse matrix onto a single process.
! Two variants:
! 1. Gathers to PSB_l_SPARSE_MAT (i.e. to matrix with IPK_ indices)
! 2. Gathers to PSB_@LX@_SPARSE_MAT (i.e. to matrix with LPK_ indices)
!
! Note: this routine uses MPI_ALLGATHERV. At this time, the size of the
! resulting matrix must fit within the range of a 4-byte integer, because
! MPI displacements are restricted to 4-byte integers.
!
!
subroutine psb_lsp_allgather(globa, loca, desc_a, info, root, dupl,keepnum,keeploc)
#if defined(HAVE_ISO_FORTRAN_ENV)
use iso_fortran_env

@ -31,7 +31,7 @@
!
! File: psb_mgather.f90
!
! Subroutine: psb_mgatherm
! Subroutine: psb_mgather
! This subroutine gathers pieces of a distributed dense matrix into a local one.
!
! Arguments:

@ -33,7 +33,7 @@
!
! Subroutine: psb_mscatterm
! This subroutine scatters a global matrix locally owned by one process
! into pieces that are local to alle the processes.
! into pieces that are local to all the processes.
!
! Arguments:
! globx - integer,dimension(:,:). The global matrix to scatter.
@ -107,8 +107,8 @@ subroutine psb_mscatterm(globx, locx, desc_a, info, root)
m = desc_a%get_global_rows()
n = desc_a%get_global_cols()
call psb_get_mpicomm(ictxt,icomm)
call psb_get_rank(myrank,ictxt,me)
icomm = psb_get_mpi_comm(ictxt)
myrank = psb_get_mpi_rank(ictxt,me)
if (iroot==-1) then
lda_globx = size(globx, 1)
@ -159,7 +159,7 @@ subroutine psb_mscatterm(globx, locx, desc_a, info, root)
end do
else
call psb_get_rank(rootrank,ictxt,iroot)
rootrank = psb_get_mpi_rank(ictxt,iroot)
!
! This is potentially unsafe when IPK=8
! But then, IPK=8 is highly experimental anyway.
@ -278,7 +278,7 @@ end subroutine psb_mscatterm
! Subroutine: psb_mscatterv
! This subroutine scatters a global vector locally owned by one process
! into pieces that are local to alle the processes.
! into pieces that are local to all the processes.
!
! Arguments:
! globx - integer,dimension(:). The global vector to scatter.
@ -347,8 +347,8 @@ subroutine psb_mscatterv(globx, locx, desc_a, info, root)
iroot = psb_root_
end if
call psb_get_mpicomm(ictxt,icomm)
call psb_get_rank(myrank,ictxt,iam)
icomm = psb_get_mpi_comm(ictxt)
myrank = psb_get_mpi_rank(ictxt,iam)
iglobx = 1
jglobx = 1
@ -394,7 +394,7 @@ subroutine psb_mscatterv(globx, locx, desc_a, info, root)
locx(i)=globx(ltg(i))
end do
else
call psb_get_rank(rootrank,ictxt,iroot)
rootrank = psb_get_mpi_rank(ictxt,iroot)
!
! This is potentially unsafe when IPK=8
! But then, IPK=8 is highly experimental anyway.

@ -31,14 +31,14 @@
!
! File: psb_sgather.f90
!
! Subroutine: psb_sgatherm
! This subroutine gathers pieces of a distributed dense matrix into a local one.
! Subroutine: psb_sgather_vect
! This subroutine gathers pieces of a distributed vector into a local one.
!
! Arguments:
! globx - real,dimension(:,:). The local matrix into which gather
! globx - real,dimension(:). The local array into which to gather
! the distributed pieces.
! locx - real,dimension(:,:). The local piece of the distributed
! matrix to be gathered.
! locx - type(psb_s_vect_type). The local piece of the distributed
! vector to be gathered.
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Error code.
! iroot - integer. The process that has to own the
@ -159,6 +159,8 @@ subroutine psb_sgather_vect(globx, locx, desc_a, info, iroot)
end subroutine psb_sgather_vect
! Subroutine: psb_sgather_multivect
! This subroutine gathers pieces of a distributed multivector into a local one.
subroutine psb_sgather_multivect(globx, locx, desc_a, info, iroot)
use psb_base_mod, psb_protect_name => psb_sgather_multivect

@ -31,7 +31,7 @@
!
! File: psb_sgather.f90
!
! Subroutine: psb_sgatherm
! Subroutine: psb_sgather
! This subroutine gathers pieces of a distributed dense matrix into a local one.
!
! Arguments:

@ -31,17 +31,17 @@
!
! File: psb_shalo.f90
!
! Subroutine: psb_shalom
! Subroutine: psb_shalo_vect
! This subroutine performs the exchange of the halo elements in a
! distributed dense matrix between all the processes.
! distributed vector between all the processes.
!
! Arguments:
! x - real,dimension(:,:). The local part of the dense matrix.
! x - type(psb_s_vect_type) The local part of the vector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! jx - integer(optional). The starting column of the global matrix.
! ik - integer(optional). The number of columns to gather.
! work - real(optional). Work area.
! work - real(optional). Work area.
! tran - character(optional). Transpose exchange.
! mode - integer(optional). Communication mode (see Swapdata)
! data - integer Which index list in desc_a should be used
@ -52,7 +52,6 @@
! psb_comm_mov_ use ovr_mst_idx
!
!
subroutine psb_shalo_vect(x,desc_a,info,work,tran,mode,data)
use psb_base_mod, psb_protect_name => psb_shalo_vect
use psi_mod
@ -185,7 +184,28 @@ subroutine psb_shalo_vect(x,desc_a,info,work,tran,mode,data)
return
end subroutine psb_shalo_vect
!
! Subroutine: psb_shalo_multivect
! This subroutine performs the exchange of the halo elements in a
! distributed multivector between all the processes.
!
! Arguments:
! x - type(psb_s_multivect_type) The local part of the multivector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! jx - integer(optional). The starting column of the global matrix.
! ik - integer(optional). The number of columns to gather.
! work - real(optional). Work area.
! tran - character(optional). Transpose exchange.
! mode - integer(optional). Communication mode (see Swapdata)
! data - integer Which index list in desc_a should be used
! to retrieve rows, default psb_comm_halo_
! psb_comm_halo_ use halo_index
! psb_comm_ext_ use ext_index
! psb_comm_ovrl_ use ovrl_index
! psb_comm_mov_ use ovr_mst_idx
!
!
subroutine psb_shalo_multivect(x,desc_a,info,work,tran,mode,data)
use psb_base_mod, psb_protect_name => psb_shalo_multivect
use psi_mod

@ -32,12 +32,12 @@
!
! File: psb_sovrl.f90
!
! Subroutine: psb_sovrlm
! Subroutine: psb_sovrl_vect
! This subroutine performs the exchange of the overlap elements in a
! distributed dense matrix between all the processes.
! distributed dense vector between all the processes.
!
! Arguments:
! x(:,:) - real The local part of the dense matrix.
! x - type(psb_s_vect_type) The local part of the vector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code.
! jx - integer(optional). The starting column of the global matrix
@ -180,7 +180,38 @@ subroutine psb_sovrl_vect(x,desc_a,info,work,update,mode)
return
end subroutine psb_sovrl_vect
!
! Subroutine: psb_sovrl_multivect
! This subroutine performs the exchange of the overlap elements in a
! distributed multivector between all the processes.
!
! Arguments:
! x - type(psb_s_multivect_type) The local part of the multivector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code.
! jx - integer(optional). The starting column of the global matrix
! ik - integer(optional). The number of columns to gather.
! work - real(optional). A work area.
! update - integer(optional). Type of update:
! psb_none_ do nothing
! psb_sum_ sum of overlaps
! psb_avg_ average of overlaps
! mode - integer(optional). Choose the algorithm for data exchange:
! this is chosen through bit fields.
! - swap_mpi = iand(flag,psb_swap_mpi_) /= 0
! - swap_sync = iand(flag,psb_swap_sync_) /= 0
! - swap_send = iand(flag,psb_swap_send_) /= 0
! - swap_recv = iand(flag,psb_swap_recv_) /= 0
! - if (swap_mpi): use underlying MPI_ALLTOALLV.
! - if (swap_sync): use PSB_SND and PSB_RCV in
! synchronized pairs
! - if (swap_send .and. swap_recv): use mpi_irecv
! and mpi_send
! - if (swap_send): use psb_snd (but need another
! call with swap_recv to complete)
! - if (swap_recv): use psb_rcv (completing a
! previous call with swap_send)
!
subroutine psb_sovrl_multivect(x,desc_a,info,work,update,mode)
use psb_base_mod, psb_protect_name => psb_sovrl_multivect
use psi_mod

@ -31,13 +31,13 @@
!
! File: psb_sscatter.f90
!
! Subroutine: psb_sscatterm
! This subroutine scatters a global matrix locally owned by one process
! into pieces that are local to alle the processes.
! Subroutine: psb_sscatter_vect
! This subroutine scatters a global vector locally owned by one process
! into pieces that are local to all the processes.
!
! Arguments:
! globx - real,dimension(:,:). The global matrix to scatter.
! locx - real,dimension(:,:). The local piece of the distributed matrix.
! globx - real,dimension(:) The global vector to scatter.
! locx - type(psb_s_vect_type) The local piece of the distributed vector.
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Error code.
! iroot - integer(optional). The process that owns the global matrix.

@ -33,7 +33,7 @@
!
! Subroutine: psb_sscatterm
! This subroutine scatters a global matrix locally owned by one process
! into pieces that are local to alle the processes.
! into pieces that are local to all the processes.
!
! Arguments:
! globx - real,dimension(:,:). The global matrix to scatter.
@ -107,8 +107,8 @@ subroutine psb_sscatterm(globx, locx, desc_a, info, root)
m = desc_a%get_global_rows()
n = desc_a%get_global_cols()
call psb_get_mpicomm(ictxt,icomm)
call psb_get_rank(myrank,ictxt,me)
icomm = psb_get_mpi_comm(ictxt)
myrank = psb_get_mpi_rank(ictxt,me)
if (iroot==-1) then
lda_globx = size(globx, 1)
@ -159,7 +159,7 @@ subroutine psb_sscatterm(globx, locx, desc_a, info, root)
end do
else
call psb_get_rank(rootrank,ictxt,iroot)
rootrank = psb_get_mpi_rank(ictxt,iroot)
!
! This is potentially unsafe when IPK=8
! But then, IPK=8 is highly experimental anyway.
@ -278,7 +278,7 @@ end subroutine psb_sscatterm
! Subroutine: psb_sscatterv
! This subroutine scatters a global vector locally owned by one process
! into pieces that are local to alle the processes.
! into pieces that are local to all the processes.
!
! Arguments:
! globx - real,dimension(:). The global vector to scatter.
@ -347,8 +347,8 @@ subroutine psb_sscatterv(globx, locx, desc_a, info, root)
iroot = psb_root_
end if
call psb_get_mpicomm(ictxt,icomm)
call psb_get_rank(myrank,ictxt,iam)
icomm = psb_get_mpi_comm(ictxt)
myrank = psb_get_mpi_rank(ictxt,iam)
iglobx = 1
jglobx = 1
@ -394,7 +394,7 @@ subroutine psb_sscatterv(globx, locx, desc_a, info, root)
locx(i)=globx(ltg(i))
end do
else
call psb_get_rank(rootrank,ictxt,iroot)
rootrank = psb_get_mpi_rank(ictxt,iroot)
!
! This is potentially unsafe when IPK=8
! But then, IPK=8 is highly experimental anyway.

@ -30,6 +30,17 @@
!
!
! File: psb_sspgather.f90
!
! Gathers a sparse matrix onto a single process.
! Two variants:
! 1. Gathers to PSB_s_SPARSE_MAT (i.e. to matrix with IPK_ indices)
! 2. Gathers to PSB_ls_SPARSE_MAT (i.e. to matrix with LPK_ indices)
!
! Note: this routine uses MPI_ALLGATHERV. At this time, the size of the
! resulting matrix must fit within the range of a 4-byte integer, because
! MPI displacements are restricted to 4-byte integers.
!
!
subroutine psb_ssp_allgather(globa, loca, desc_a, info, root, dupl,keepnum,keeploc)
#if defined(HAVE_ISO_FORTRAN_ENV)
use iso_fortran_env

@ -31,14 +31,14 @@
!
! File: psb_zgather.f90
!
! Subroutine: psb_zgatherm
! This subroutine gathers pieces of a distributed dense matrix into a local one.
! Subroutine: psb_zgather_vect
! This subroutine gathers pieces of a distributed vector into a local one.
!
! Arguments:
! globx - complex,dimension(:,:). The local matrix into which gather
! globx - complex,dimension(:). The local array into which to gather
! the distributed pieces.
! locx - complex,dimension(:,:). The local piece of the distributed
! matrix to be gathered.
! locx - type(psb_z_vect_type). The local piece of the distributed
! vector to be gathered.
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Error code.
! iroot - integer. The process that has to own the
@ -159,6 +159,8 @@ subroutine psb_zgather_vect(globx, locx, desc_a, info, iroot)
end subroutine psb_zgather_vect
! Subroutine: psb_zgather_multivect
! This subroutine gathers pieces of a distributed multivector into a local one.
subroutine psb_zgather_multivect(globx, locx, desc_a, info, iroot)
use psb_base_mod, psb_protect_name => psb_zgather_multivect

@ -31,7 +31,7 @@
!
! File: psb_zgather.f90
!
! Subroutine: psb_zgatherm
! Subroutine: psb_zgather
! This subroutine gathers pieces of a distributed dense matrix into a local one.
!
! Arguments:

@ -31,17 +31,17 @@
!
! File: psb_zhalo.f90
!
! Subroutine: psb_zhalom
! Subroutine: psb_zhalo_vect
! This subroutine performs the exchange of the halo elements in a
! distributed dense matrix between all the processes.
! distributed vector between all the processes.
!
! Arguments:
! x - complex,dimension(:,:). The local part of the dense matrix.
! x - type(psb_z_vect_type) The local part of the vector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! jx - integer(optional). The starting column of the global matrix.
! ik - integer(optional). The number of columns to gather.
! work - complex(optional). Work area.
! work - complex(optional). Work area.
! tran - character(optional). Transpose exchange.
! mode - integer(optional). Communication mode (see Swapdata)
! data - integer Which index list in desc_a should be used
@ -52,7 +52,6 @@
! psb_comm_mov_ use ovr_mst_idx
!
!
subroutine psb_zhalo_vect(x,desc_a,info,work,tran,mode,data)
use psb_base_mod, psb_protect_name => psb_zhalo_vect
use psi_mod
@ -185,7 +184,28 @@ subroutine psb_zhalo_vect(x,desc_a,info,work,tran,mode,data)
return
end subroutine psb_zhalo_vect
!
! Subroutine: psb_zhalo_multivect
! This subroutine performs the exchange of the halo elements in a
! distributed multivector between all the processes.
!
! Arguments:
! x - type(psb_z_multivect_type) The local part of the multivector
! desc_a - type(psb_desc_type). The communication descriptor.
! info - integer. Return code
! jx - integer(optional). The starting column of the global matrix.
! ik - integer(optional). The number of columns to gather.
! work - complex(optional). Work area.
! tran - character(optional). Transpose exchange.
! mode - integer(optional). Communication mode (see Swapdata)
! data - integer Which index list in desc_a should be used
! to retrieve rows, default psb_comm_halo_
! psb_comm_halo_ use halo_index
! psb_comm_ext_ use ext_index
! psb_comm_ovrl_ use ovrl_index
! psb_comm_mov_ use ovr_mst_idx
!
!
subroutine psb_zhalo_multivect(x,desc_a,info,work,tran,mode,data)
use psb_base_mod, psb_protect_name => psb_zhalo_multivect
use psi_mod

Some files were not shown because too many files have changed in this diff.