diff --git a/base/internals/psi_bld_glb_dep_list.F90 b/base/internals/psi_bld_glb_dep_list.F90 index fb4f9f32..bf28e49b 100644 --- a/base/internals/psi_bld_glb_dep_list.F90 +++ b/base/internals/psi_bld_glb_dep_list.F90 @@ -181,7 +181,7 @@ subroutine psi_i_bld_glb_csr_dep_list(ictxt,loc_dl,length_dl,c_dep_list,dl_ptr,i goto 9999 end if icomm = psb_get_mpi_comm(iictxt) - call mpi_allgather(loc_dl,myld,psb_mpi_ipk_,& + call mpi_allgatherv(loc_dl,myld,psb_mpi_ipk_,& & c_dep_list,length_dl,dl_ptr,psb_mpi_ipk_,icomm,minfo) info = minfo diff --git a/base/internals/psi_crea_index.f90 b/base/internals/psi_crea_index.f90 index eb22893e..68c4615b 100644 --- a/base/internals/psi_crea_index.f90 +++ b/base/internals/psi_crea_index.f90 @@ -154,8 +154,8 @@ subroutine psi_i_crea_index(desc_a,index_in,index_out,nxch,nsnd,nrcv,info) !!$ & ' avg:',dlavg, choose_sorting(dlmax,dlavg,np) if (choose_sorting(dlmax,dlavg,np)) then - if (.true.) then - call psi_bld_glb_dep_list(ictxt,& + if (.false.) then + call psi_bld_glb_dep_list(ictxt,& & loc_dl,length_dl,dep_list,dl_lda,info) if (info /= psb_success_) then @@ -196,6 +196,9 @@ subroutine psi_i_crea_index(desc_a,index_in,index_out,nxch,nsnd,nrcv,info) !!$ !!$ ! ....now i can sort dependency lists. call psi_sort_dl(dl_ptr,c_dep_list,length_dl,np,info) + ldl = length_dl(me) + loc_dl = c_dep_list(dl_ptr(me):dl_ptr(me)+ldl-1) + !!$ if(info /= psb_success_) then !!$ call psb_errpush(psb_err_from_subroutine_,name,a_err='psi_sort_dl') !!$ goto 9999 @@ -215,6 +218,7 @@ subroutine psi_i_crea_index(desc_a,index_in,index_out,nxch,nsnd,nrcv,info) if (do_timings) call psb_tic(idx_phase3) if(debug_level >= psb_debug_inner_)& & write(debug_unit,*) me,' ',trim(name),': calling psi_desc_index',ldl,':',loc_dl(1:ldl) + ! Do the actual format conversion. call psi_desc_index(desc_a,index_in,loc_dl,ldl,nsnd,nrcv,index_out,info) if(debug_level >= psb_debug_inner_) & @@ -249,6 +253,7 @@ contains logical :: val val = .not.(((dlmax>(26*4)).or.((dlavg>=(26*2)).and.(np>=128)))) + val = .true. end function choose_sorting end subroutine psi_i_crea_index diff --git a/base/internals/psi_sort_dl.f90 b/base/internals/psi_sort_dl.f90 index 60212d30..29a55b49 100644 --- a/base/internals/psi_sort_dl.f90 +++ b/base/internals/psi_sort_dl.f90 @@ -69,7 +69,6 @@ subroutine psi_i_sort_dl(dep_list,l_dep_list,np,info) isz = iich + ndgmx if (debug_level >= psb_debug_inner_)& & write(debug_unit,*) name,': ndgmx ',ndgmx,isz - allocate(work(isz)) ! call srtlist(dep_list, dl_lda, l_dep_list, np, info) call srtlist(dep_list,size(dep_list,1,kind=psb_ipk_),l_dep_list,np,work(idg),& @@ -89,19 +88,193 @@ subroutine psi_i_sort_dl(dep_list,l_dep_list,np,info) end subroutine psi_i_sort_dl +!********************************************************************** +! * +! The communication step among processors at each * +! matrix-vector product is a variable all-to-all * +! collective communication that we reimplement * +! in terms of point-to-point communications. * +! The data in input is a list of dependencies: * +! for each node a list of all the nodes it has to * +! communicate with. The lists are guaranteed to be * +! symmetric, i.e. for each pair (I,J) there is a * +! pair (J,I). The idea is to organize the ordering * +! so that at each communication step as many * +! processors as possible are communicating at the * +! same time, i.e. a step is defined by the fact * +! that all edges (I,J) in it have no common node. * +! * +! Formulation of the problem is: * +! Given an undirected graph (forest): * +! Find the shortest series of steps to cancel all * +! graph edges, where at each step all edges belonging * +! to a matching in the graph are canceled. * +! * +! An obvious lower bound to the optimum number of steps * +! is the largest degree of any node in the graph. * +! * +! The algorithm proceeds as follows: * +! 1. Build a list of all edges, e.g. copy the * +! dependencies lists keeping only (I,J) with I psi_i_csr_sort_dl use psb_const_mod use psb_error_mod + use psb_sort_mod implicit none - integer(psb_ipk_), intent(in) :: c_dep_list(:), dl_ptr(0:) - integer(psb_ipk_), intent(inout) :: l_dep_list(0:) + integer(psb_ipk_), intent(inout) :: c_dep_list(:), dl_ptr(0:), l_dep_list(0:) integer(psb_ipk_), intent(in) :: np integer(psb_ipk_), intent(out) :: info ! Local variables + integer(psb_ipk_), allocatable :: dg(:), dgp(:),& + & idx(:), upd(:), edges(:,:), ich(:) + integer(psb_ipk_) :: i, j, nedges, ip1, ip2, nch, ip, iedge,& + & i1, ix, ist, iswap(2) + nedges = size(c_dep_list) -end subroutine psi_i_csr_sort_dl + allocate(dg(0:np-1),dgp(nedges),edges(2,nedges),upd(0:np-1),& + & idx(nedges),ich(nedges),stat = info) + + if (info /= 0) then + info = -9 + return + end if + ! + ! 1. Compute an auxiliary vector with the degree of + ! each node of the graph. + dg(0:np-1) = l_dep_list(0:np-1) + ! + ! 2. Build a list of all edges, e.g. copy the + ! dependencies lists keeping only (I,J) with I