From 3cfdc033fd342c9723a333b771b38f3c6e109dcb Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Thu, 9 Jan 2020 10:44:01 +0000 Subject: [PATCH] New structure for A2A and graph_fnd_owner New implementation for A2A: use mpi_allgather and mpi_reduce_scatter, with two variants: one for variable inputs, one for fixed size inputs. Modified graph_fnd_owner to pass padded and uniform sized input to A2A. --- base/internals/psi_a2a_fnd_owner.F90 | 96 +++++++++++++++++++++++--- base/internals/psi_graph_fnd_owner.F90 | 64 +++++++++++------ base/modules/desc/psb_desc_mod.F90 | 24 ------- base/modules/desc/psb_indx_map_mod.f90 | 55 ++++++++++++++- 4 files changed, 182 insertions(+), 57 deletions(-) diff --git a/base/internals/psi_a2a_fnd_owner.F90 b/base/internals/psi_a2a_fnd_owner.F90 index 99213adf..b9cd9bb2 100644 --- a/base/internals/psi_a2a_fnd_owner.F90 +++ b/base/internals/psi_a2a_fnd_owner.F90 @@ -47,7 +47,7 @@ ! so it goes for an all-to-all by building an auxiliary neighbours list and ! reusing the neighbour version. ! -subroutine psi_a2a_fnd_owner(idx,iprc,idxmap,info) +subroutine psi_a2a_fnd_owner(idx,iprc,idxmap,info,samesize) use psb_serial_mod use psb_const_mod use psb_error_mod @@ -66,16 +66,22 @@ subroutine psi_a2a_fnd_owner(idx,iprc,idxmap,info) integer(psb_ipk_), allocatable, intent(out) :: iprc(:) class(psb_indx_map), intent(in) :: idxmap integer(psb_ipk_), intent(out) :: info + logical, intent(in), optional :: samesize integer(psb_ipk_), allocatable :: tmpadj(:) - integer(psb_mpk_) :: icomm, minfo, iictxt - integer(psb_ipk_) :: i,n_row,n_col,err_act,nv + integer(psb_lpk_), allocatable :: rmtidx(:) + integer(psb_ipk_), allocatable :: tproc(:), lclidx(:) + integer(psb_mpk_), allocatable :: hsz(:),hidx(:), sdidx(:), rvidx(:),& + & sdsz(:), rvsz(:), sdhd(:), rvhd(:), p2pstat(:,:) + integer(psb_mpk_) :: icomm, minfo, iictxt,nv + integer(psb_ipk_) :: i,n_row,n_col,err_act,gsz integer(psb_lpk_) :: mglob, ih integer(psb_ipk_) :: ictxt,np,me, nresp logical, parameter :: use_psi_adj=.true. real(psb_dpk_) :: t0, t1, t2, t3, t4, tamx, tidx character(len=20) :: name + logical :: samesize_ info = psb_success_ name = 'psi_a2a_fnd_owner' @@ -101,22 +107,94 @@ subroutine psi_a2a_fnd_owner(idx,iprc,idxmap,info) goto 9999 end if + if (present(samesize)) then + samesize_ = samesize + else + samesize_ = .false. + end if + nv = size(idx) + ! write(0,*) me,name,' :',use_psi_adj,samesize_,nv if (use_psi_adj) then ! ! Reuse the adjcncy version by tricking it with an adjcncy list ! that contains everybody but ME. ! - nv = size(idx) call psb_realloc(np-1,tmpadj,info) tmpadj(1:me) = [(i,i=0,me-1)] tmpadj(me+1:np-1) = [(i,i=me+1,np-1)] call psi_adjcncy_fnd_owner(idx,iprc,tmpadj,idxmap,info) + else - ! - ! 1. allgetherv - ! 2. local conversion - ! 3. reduce_scatter - ! + if (samesize_) then + ! + ! Variant when IDX is guaranteed to have the same size on all + ! processes. To be tested for performance: is it worth it? + ! Probably yes. + ! + gsz = nv*np + Allocate(rmtidx(gsz),lclidx(gsz),iprc(nv),stat=info) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate') + goto 9999 + end if + call mpi_allgather(idx,nv,psb_mpi_lpk_,rmtidx,nv,psb_mpi_lpk_,icomm,minfo) + call idxmap%g2l(rmtidx(1:gsz),lclidx(1:gsz),info,owned=.true.) + ! + ! Reuse lclidx to encode owning process + ! + do i=1, gsz + if ((1<=lclidx(i)).and.(lclidx(i)<=n_row)) then + lclidx(i) = me + else + lclidx(i) = -1 + end if + end do + call mpi_reduce_scatter_block(lclidx,iprc,nv,psb_mpi_ipk_,mpi_max,icomm,minfo) + + else + ! + ! 1. allgetherv + ! 2. local conversion + ! 3. reduce_scatter + ! + ! + ! The basic idea is very simple. + ! First we collect (to all) all the requests. + Allocate(hidx(np+1),hsz(np),stat=info) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate') + goto 9999 + end if + + call mpi_allgather(nv,1,psb_mpi_mpk_,hsz,1,psb_mpi_mpk_,icomm,minfo) + hidx(1) = 0 + do i=1, np + hidx(i+1) = hidx(i) + hsz(i) + end do + gsz = hidx(np+1) + Allocate(rmtidx(gsz),lclidx(gsz),iprc(nv),stat=info) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate') + goto 9999 + end if + + call mpi_allgatherv(idx,hsz(me+1),psb_mpi_lpk_,& + & rmtidx,hsz,hidx,psb_mpi_lpk_,& + & icomm,minfo) + + call idxmap%g2l(rmtidx(1:gsz),lclidx(1:gsz),info,owned=.true.) + ! + ! Reuse lclidx to encode owning process + ! + do i=1, gsz + if ((1<=lclidx(i)).and.(lclidx(i)<=n_row)) then + lclidx(i) = me + else + lclidx(i) = -1 + end if + end do + call mpi_reduce_scatter(lclidx,iprc,hsz,psb_mpi_ipk_,mpi_max,icomm,minfo) + end if end if call psb_erractionrestore(err_act) diff --git a/base/internals/psi_graph_fnd_owner.F90 b/base/internals/psi_graph_fnd_owner.F90 index f87a5942..219ef25e 100644 --- a/base/internals/psi_graph_fnd_owner.F90 +++ b/base/internals/psi_graph_fnd_owner.F90 @@ -101,8 +101,8 @@ subroutine psi_graph_fnd_owner(idx,iprc,idxmap,info) integer(psb_ipk_), allocatable :: tprc(:), tsmpl(:), ladj(:) integer(psb_mpk_) :: icomm, minfo, iictxt integer(psb_ipk_) :: i,n_row,n_col,err_act,ip,j,ipnt, nsampl_out,& - & nv, n_answers, n_rest, nsampl_in, locr_max, & - & nrest_max, nadj, maxspace, mxnsin + & nv, n_answers, nreqst, nsampl_in, locr_max, & + & nreqst_max, nadj, maxspace, mxnsin integer(psb_lpk_) :: mglob, ih integer(psb_ipk_) :: ictxt,np,me, nresp integer(psb_ipk_), parameter :: nt=4 @@ -164,15 +164,19 @@ subroutine psi_graph_fnd_owner(idx,iprc,idxmap,info) nadj = psb_size(ladj) ! This makes ladj allocated with size 0 just in case call psb_realloc(nadj,ladj,info) - n_rest = nv - n_answers - nrest_max = n_rest + ! + ! Throughout the subroutine, nreqst is the number of local inquiries + ! that have not been answered yet + ! + nreqst = nv - n_answers + nreqst_max = nreqst ! ! Choice of maxspace should be adjusted to account for a default ! "sensible" size and/or a user-specified value ! tmpv(1) = nadj - tmpv(2) = nrest_max + tmpv(2) = nreqst_max tmpv(3) = n_row tmpv(4) = psb_cd_get_maxspace() call psb_max(ictxt,tmpv) @@ -187,12 +191,12 @@ subroutine psi_graph_fnd_owner(idx,iprc,idxmap,info) ! Do a preliminary run on the user-defined adjacency lists ! if (trace.and.(me == 0)) write(0,*) ' Initial sweep on user-defined topology' - nsampl_in = min(n_rest,max(1,(maxspace+max(1,nadj)-1))/(max(1,nadj))) + nsampl_in = min(nreqst,max(1,(maxspace+max(1,nadj)-1))/(max(1,nadj))) call psi_adj_fnd_sweep(idx,iprc,ladj,idxmap,nsampl_in,n_answers) call idxmap%xtnd_p_adjcncy(ladj) - n_rest = nv - n_answers - nrest_max = n_rest - call psb_max(ictxt,nrest_max) + nreqst = nv - n_answers + nreqst_max = nreqst + call psb_max(ictxt,nreqst_max) if (trace.and.(me == 0)) write(0,*) ' After initial sweep:',nrest_max end if if (do_timings) call psb_toc(idx_sweep0) @@ -208,28 +212,32 @@ subroutine psi_graph_fnd_owner(idx,iprc,idxmap,info) ! sample query is then sent to all processes ! ! if (trace.and.(me == 0)) write(0,*) 'Looping in graph_fnd_owner: ', nrest_max - nsampl_in = min(n_rest,max(1,(maxspace+np-1)/np)) - !nsampl_in = min(n_rest,32) + nsampl_in = psb_cd_get_samplesize() + nsampl_in = min(max(1,(maxspace+np-1)/np),nsampl_in) ! ! Choose a sample, should it be done in this simplistic way? ! Note: nsampl_in is a hint, not an absolute, hence nsampl_out ! ipnt = 1 -!!$ write(0,*) me,' Into first sampling ',nsampl_in - call psi_get_sample(ipnt, idx,iprc,tidx,tsmpl,nsampl_in,nsampl_out) + call psi_get_sample(ipnt, idx,iprc,tidx,tsmpl,nsampl_in,nsampl_out, pad=.true.) nsampl_in = min(nsampl_out,nsampl_in) !!$ write(0,*) me,' From first sampling ',nsampl_in ! ! 2. Do a search on all processes; this is supposed to find ! the owning process for all inputs; ! - call psi_a2a_fnd_owner(tidx(1:nsampl_in),tprc,idxmap,info) + call psi_a2a_fnd_owner(tidx(1:nsampl_in),tprc,idxmap,info, samesize=.true.) + ! + ! We might have padded when looking for owners, so the actual samples + ! could be less than they appear. Should be explained better. + ! + nsampl_in = min(nreqst,nsampl_in) call psi_cpy_out(iprc,tprc,tsmpl,nsampl_in,nsampl_out) if (nsampl_out /= nsampl_in) then write(0,*) me,'Warning: indices not found by a2a_fnd_owner ',nsampl_out,nsampl_in end if n_answers = n_answers + nsampl_out - n_rest = nv - n_answers + nreqst = nv - n_answers ! ! 3. Extract the resulting adjacency list and add it to the ! indxmap; @@ -246,16 +254,16 @@ subroutine psi_graph_fnd_owner(idx,iprc,idxmap,info) ! Need to set up a proper loop here to have a complete ! sweep over the input vector. Done inside adj_fnd_sweep. ! -!!$ write(0,*) me,' After a2a ',n_rest - nsampl_in = min(n_rest,max(1,(maxspace+max(1,nadj)-1))/(max(1,nadj))) +!!$ write(0,*) me,' After a2a ',nreqst + nsampl_in = min(nreqst,max(1,(maxspace+max(1,nadj)-1))/(max(1,nadj))) mxnsin = nsampl_in call psb_max(ictxt,mxnsin) !!$ write(0,*) me, ' mxnsin ',mxnsin if (mxnsin>0) call psi_adj_fnd_sweep(idx,iprc,ladj,idxmap,nsampl_in,n_answers) call idxmap%xtnd_p_adjcncy(ladj) - n_rest = nv - n_answers - nrest_max = n_rest + nreqst = nv - n_answers + nrest_max = nreqst call psb_max(ictxt,nrest_max) if (trace.and.(me == 0)) write(0,*) ' fnd_owner_loop remaining:',nrest_max if (do_timings) call psb_toc(idx_loop_neigh) @@ -270,16 +278,23 @@ subroutine psi_graph_fnd_owner(idx,iprc,idxmap,info) contains - subroutine psi_get_sample(ipntidx,idx,iprc,tidx,tsmpl,ns_in,ns_out) + subroutine psi_get_sample(ipntidx,idx,iprc,tidx,tsmpl,ns_in,ns_out,pad) implicit none integer(psb_ipk_), intent(inout) :: ipntidx integer(psb_lpk_), intent(in) :: idx(:) integer(psb_ipk_), intent(in) :: ns_in, iprc(:) integer(psb_lpk_), intent(out) :: tidx(:) integer(psb_ipk_), intent(out) :: tsmpl(:), ns_out + logical, intent(in), optional :: pad ! - integer(psb_ipk_) :: nv, ns + integer(psb_ipk_) :: nv, ns, k + logical :: pad_ + if (present(pad)) then + pad_ = pad + else + pad_ = .false. + end if nv = size(idx) ! ! Choose a sample, should it be done in this simplistic way? @@ -302,6 +317,13 @@ contains ipntidx = ipntidx + 1 if (ns_out >= ns) exit end do + if (pad_) then + do k = ns_out+1, ns + tsmpl(k) = -1 + tidx(k) = -1 + end do + ns_out = ns + end if end subroutine psi_get_sample diff --git a/base/modules/desc/psb_desc_mod.F90 b/base/modules/desc/psb_desc_mod.F90 index fbc64c8c..03ddc7c4 100644 --- a/base/modules/desc/psb_desc_mod.F90 +++ b/base/modules/desc/psb_desc_mod.F90 @@ -285,14 +285,6 @@ module psb_desc_mod module procedure psb_cdfree end interface psb_free - interface psb_cd_set_maxspace - module procedure psb_cd_set_maxspace - end interface psb_cd_set_maxspace - - interface psb_cd_get_maxspace - module procedure psb_cd_get_maxspace - end interface psb_cd_get_maxspace - interface psb_cd_set_large_threshold module procedure psb_i_cd_set_large_threshold end interface psb_cd_set_large_threshold @@ -310,7 +302,6 @@ module psb_desc_mod integer(psb_lpk_), private, save :: cd_large_threshold = psb_default_large_threshold - integer(psb_ipk_), private, save :: cd_maxspace = -1 contains @@ -362,21 +353,6 @@ contains integer(psb_lpk_) :: val val = cd_large_threshold end function psb_cd_get_large_threshold - - - subroutine psb_cd_set_maxspace(ith) - implicit none - integer(psb_ipk_), intent(in) :: ith - if (ith > 0) then - cd_maxspace = ith - end if - end subroutine psb_cd_set_maxspace - - function psb_cd_get_maxspace() result(val) - implicit none - integer(psb_ipk_) :: val - val = cd_maxspace - end function psb_cd_get_maxspace function psb_cd_is_large_size(m) result(val) use psb_penv_mod diff --git a/base/modules/desc/psb_indx_map_mod.f90 b/base/modules/desc/psb_indx_map_mod.f90 index 0f9d15c6..b79be471 100644 --- a/base/modules/desc/psb_indx_map_mod.f90 +++ b/base/modules/desc/psb_indx_map_mod.f90 @@ -29,7 +29,7 @@ ! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ! POSSIBILITY OF SUCH DAMAGE. ! -!!$ +! ! ! ! package: psb_indx_map_mod @@ -278,13 +278,14 @@ module psb_indx_map_mod end interface interface - subroutine psi_a2a_fnd_owner(idx,iprc,idxmap,info) + subroutine psi_a2a_fnd_owner(idx,iprc,idxmap,info,samesize) import :: psb_indx_map, psb_ipk_, psb_lpk_ implicit none integer(psb_lpk_), intent(in) :: idx(:) integer(psb_ipk_), allocatable, intent(out) :: iprc(:) class(psb_indx_map), intent(in) :: idxmap integer(psb_ipk_), intent(out) :: info + logical, intent(in), optional :: samesize end subroutine psi_a2a_fnd_owner end interface @@ -311,6 +312,25 @@ module psb_indx_map_mod end subroutine psi_graph_fnd_owner end interface + interface psb_cd_set_maxspace + module procedure psb_cd_set_maxspace + end interface psb_cd_set_maxspace + + interface psb_cd_get_maxspace + module procedure psb_cd_get_maxspace + end interface psb_cd_get_maxspace + + interface psb_cd_set_samplesize + module procedure psb_cd_set_samplesize + end interface psb_cd_set_samplesize + + interface psb_cd_get_samplesize + module procedure psb_cd_get_samplesize + end interface psb_cd_get_samplesize + + integer(psb_ipk_), private, save :: cd_maxspace = -1 + integer(psb_ipk_), private, save :: samplesize = 32 + integer, parameter :: psi_symm_flag_norv_ = 0 integer, parameter :: psi_symm_flag_inrv_ = 1 interface psi_symm_dep_list @@ -363,7 +383,36 @@ contains val = 'Unknown ?' end select end function psi_get_adj_alg_fmt - + + subroutine psb_cd_set_maxspace(ith) + implicit none + integer(psb_ipk_), intent(in) :: ith + if (ith > 0) then + cd_maxspace = ith + end if + end subroutine psb_cd_set_maxspace + + function psb_cd_get_maxspace() result(val) + implicit none + integer(psb_ipk_) :: val + val = cd_maxspace + end function psb_cd_get_maxspace + + + subroutine psb_cd_set_samplesize(ith) + implicit none + integer(psb_ipk_), intent(in) :: ith + if (ith > 0) then + samplesize = ith + end if + end subroutine psb_cd_set_samplesize + + function psb_cd_get_samplesize() result(val) + implicit none + integer(psb_ipk_) :: val + val = samplesize + end function psb_cd_get_samplesize + !> !! \memberof psb_indx_map