diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90
index eb6b0eac..516daf4b 100644
--- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90
+++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90
@@ -87,7 +87,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
  integer(psb_ipk_), intent(out)               :: info

  ! Local variables
-  integer(psb_ipk_), allocatable  :: ioffs(:), neigh(:), irow(:), icol(:),&
+  integer(psb_ipk_), allocatable  :: neigh(:), irow(:), icol(:),&
       & ideg(:), idxs(:)
  integer(psb_lpk_), allocatable  :: tmpaggr(:)
  complex(psb_spk_), allocatable  :: val(:), diag(:)
@@ -130,7 +130,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
  nr = a%get_nrows()
  nc = a%get_ncols()
-  allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),&
+  allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),&
       & icol(nc),val(nc),stat=info)
  if(info /= psb_success_) then
    info=psb_err_alloc_request_
@@ -151,19 +151,17 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
  if (do_timings) call psb_toc(idx_soc1_p0)
  if (clean_zeros) call acsr%clean_zeros(info)
  if (iorder == amg_aggr_ord_nat_) then
-    !$omp parallel do private(i)
+    !$omp parallel do private(i) schedule(static)
    do i=1, nr
      ilaggr(i) = -(nr+1)
      idxs(i)   = i
-      ioffs(i)  = 0
    end do
    !$omp end parallel do
  else
-    !$omp parallel do private(i)
+    !$omp parallel do private(i) schedule(static)
    do i=1, nr
      ilaggr(i) = -(nr+1)
      ideg(i)   = acsr%irp(i+1) - acsr%irp(i)
-      ioffs(i)  = 0
    end do
    !$omp end parallel do
    call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
@@ -189,11 +187,12 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
    ! been set because it is strongly connected to an entry J belonging to a
    ! different thread.
-
-    !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) &
+    info = 0
+    !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) &
    !$omp private(icol,val,myth,kk)
    block
      integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
+      integer(psb_lpk_) :: itmp
      nths = omp_get_num_threads()
      myth = omp_get_thread_num()
      rsz = nr/nths
@@ -213,7 +212,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
      !$omp end master
      !$omp barrier
-      !$omp do schedule(static)
+      !$omp do schedule(static) private(disjoint) reduction(max: info)
      do kk=0, nths-1
        step1: do ii=bnds(kk), bnds(kk+1)-1
          if (info /= 0) cycle
@@ -257,23 +256,31 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          ! as yet unconnected, turn it into the next aggregate.
          ! Same if ip==0 (in which case, neighborhood only
          ! contains I even if it does not look like it from matrix)
+          ! Because DISJOINT is private and not evaluated under a lock,
+          ! the result is not exactly repeatable: between the evaluation
+          ! of DISJOINT and the assignments below, another thread may
+          ! alter the values of ILAGGR.
+          ! However, some non-repeatability is already present, because
+          ! the sequence of aggregates is computed in a different order
+          ! than in serial mode.
          !
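+          ! The encoding used below packs the provisional aggregate
+          ! index and the owning thread into a single value,
+          !   ITMP = (BNDS(KK)-1+LOCNAGGR(KK))*NTHS + KK,
+          ! and thus replaces the old IOFFS array: the fixup loop at
+          ! the end of the parallel section recovers the thread as
+          ! MOD(ILAGGR(II),NTHS) and the index as ILAGGR(II)/NTHS.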
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) - ioffs(icol(k)) = kk - end do - ilaggr(i) = bnds(kk)-1+locnaggr(kk) - ioffs(i) = kk + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + info = 12345678 + cycle step1 end if - !$omp end critical(update_ilaggr) + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do end if - end if enddo step1 end do @@ -293,9 +300,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) then - kp = ioffs(ii) - ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) end if end do end do @@ -303,6 +310,12 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end block !$omp end parallel end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else step1: do ii=1, nr if (info /= 0) cycle diff --git a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 index 020cae4b..ed4161a5 100644 --- a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 @@ -71,6 +71,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_c_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,104 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! 
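! The loop below computes, on a copy of A, the scaled measure
!   muij(i,j) = |a(i,j)| / sqrt(|a(i,i)*a(j,j)|)
! for local column indices j <= nr; this is the quantity that is
! compared against THETA to classify links as strong or weak.
!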
call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = sone + s_neigh_coo%val(k) = sone else - s_neigh_coo%val(ip) = -sone + s_neigh_coo%val(k) = -sone end if + else + s_neigh_coo%val(k) = -sone end if end do end do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + icnt = 0 + step1: do ii=1, nr + i = idxs(ii) + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + if (disjoint) then + icnt = icnt + 1 + naggr = naggr + 1 + do k=1, nzcnt + ilaggr(icol(k)) = naggr + end do + ilaggr(i) = naggr + end if + endif + enddo step1 + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,7 +288,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index 241f0568..f2cf9027 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables
-  integer(psb_ipk_), allocatable  :: ioffs(:), neigh(:), irow(:), icol(:),&
+  integer(psb_ipk_), allocatable  :: neigh(:), irow(:), icol(:),&
       & ideg(:), idxs(:)
  integer(psb_lpk_), allocatable  :: tmpaggr(:)
  real(psb_dpk_), allocatable  :: val(:), diag(:)
@@ -130,7 +130,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
  nr = a%get_nrows()
  nc = a%get_ncols()
-  allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),&
+  allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),&
       & icol(nc),val(nc),stat=info)
  if(info /= psb_success_) then
    info=psb_err_alloc_request_
@@ -151,19 +151,17 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
  if (do_timings) call psb_toc(idx_soc1_p0)
  if (clean_zeros) call acsr%clean_zeros(info)
  if (iorder == amg_aggr_ord_nat_) then
-    !$omp parallel do private(i)
+    !$omp parallel do private(i) schedule(static)
    do i=1, nr
      ilaggr(i) = -(nr+1)
      idxs(i)   = i
-      ioffs(i)  = 0
    end do
    !$omp end parallel do
  else
-    !$omp parallel do private(i)
+    !$omp parallel do private(i) schedule(static)
    do i=1, nr
      ilaggr(i) = -(nr+1)
      ideg(i)   = acsr%irp(i+1) - acsr%irp(i)
-      ioffs(i)  = 0
    end do
    !$omp end parallel do
    call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
@@ -189,11 +187,12 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
    ! been set because it is strongly connected to an entry J belonging to a
    ! different thread.
-
-    !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) &
+    info = 0
+    !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) &
    !$omp private(icol,val,myth,kk)
    block
      integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
+      integer(psb_lpk_) :: itmp
      nths = omp_get_num_threads()
      myth = omp_get_thread_num()
      rsz = nr/nths
@@ -213,7 +212,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
      !$omp end master
      !$omp barrier
-      !$omp do schedule(static)
+      !$omp do schedule(static) private(disjoint) reduction(max: info)
      do kk=0, nths-1
        step1: do ii=bnds(kk), bnds(kk+1)-1
          if (info /= 0) cycle
@@ -257,23 +256,31 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          ! as yet unconnected, turn it into the next aggregate.
          ! Same if ip==0 (in which case, neighborhood only
          ! contains I even if it does not look like it from matrix)
+          ! Because DISJOINT is private and not evaluated under a lock,
+          ! the result is not exactly repeatable: between the evaluation
+          ! of DISJOINT and the assignments below, another thread may
+          ! alter the values of ILAGGR.
+          ! However, some non-repeatability is already present, because
+          ! the sequence of aggregates is computed in a different order
+          ! than in serial mode.
          !
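+          ! The encoding used below packs the provisional aggregate
+          ! index and the owning thread into a single value,
+          !   ITMP = (BNDS(KK)-1+LOCNAGGR(KK))*NTHS + KK,
+          ! and thus replaces the old IOFFS array: the fixup loop at
+          ! the end of the parallel section recovers the thread as
+          ! MOD(ILAGGR(II),NTHS) and the index as ILAGGR(II)/NTHS.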
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) - ioffs(icol(k)) = kk - end do - ilaggr(i) = bnds(kk)-1+locnaggr(kk) - ioffs(i) = kk + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + info = 12345678 + cycle step1 end if - !$omp end critical(update_ilaggr) + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do end if - end if enddo step1 end do @@ -293,9 +300,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) then - kp = ioffs(ii) - ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) end if end do end do @@ -303,6 +310,12 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end block !$omp end parallel end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else step1: do ii=1, nr if (info /= 0) cycle diff --git a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 index 1433a670..6047f375 100644 --- a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 @@ -71,6 +71,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_d_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,104 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! 
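! The loop below computes, on a copy of A, the scaled measure
!   muij(i,j) = |a(i,j)| / sqrt(|a(i,i)*a(j,j)|)
! for local column indices j <= nr; this is the quantity that is
! compared against THETA to classify links as strong or weak.
!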
call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = done + s_neigh_coo%val(k) = done else - s_neigh_coo%val(ip) = -done + s_neigh_coo%val(k) = -done end if + else + s_neigh_coo%val(k) = -done end if end do end do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + icnt = 0 + step1: do ii=1, nr + i = idxs(ii) + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + if (disjoint) then + icnt = icnt + 1 + naggr = naggr + 1 + do k=1, nzcnt + ilaggr(icol(k)) = naggr + end do + ilaggr(i) = naggr + end if + endif + enddo step1 + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,7 +288,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 329cd3ba..4d9ab106 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables
-  integer(psb_ipk_), allocatable  :: ioffs(:), neigh(:), irow(:), icol(:),&
+  integer(psb_ipk_), allocatable  :: neigh(:), irow(:), icol(:),&
       & ideg(:), idxs(:)
  integer(psb_lpk_), allocatable  :: tmpaggr(:)
  real(psb_spk_), allocatable  :: val(:), diag(:)
@@ -130,7 +130,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
  nr = a%get_nrows()
  nc = a%get_ncols()
-  allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),&
+  allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),&
       & icol(nc),val(nc),stat=info)
  if(info /= psb_success_) then
    info=psb_err_alloc_request_
@@ -151,19 +151,17 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
  if (do_timings) call psb_toc(idx_soc1_p0)
  if (clean_zeros) call acsr%clean_zeros(info)
  if (iorder == amg_aggr_ord_nat_) then
-    !$omp parallel do private(i)
+    !$omp parallel do private(i) schedule(static)
    do i=1, nr
      ilaggr(i) = -(nr+1)
      idxs(i)   = i
-      ioffs(i)  = 0
    end do
    !$omp end parallel do
  else
-    !$omp parallel do private(i)
+    !$omp parallel do private(i) schedule(static)
    do i=1, nr
      ilaggr(i) = -(nr+1)
      ideg(i)   = acsr%irp(i+1) - acsr%irp(i)
-      ioffs(i)  = 0
    end do
    !$omp end parallel do
    call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
@@ -189,11 +187,12 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
    ! been set because it is strongly connected to an entry J belonging to a
    ! different thread.
-
-    !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) &
+    info = 0
+    !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) &
    !$omp private(icol,val,myth,kk)
    block
      integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
+      integer(psb_lpk_) :: itmp
      nths = omp_get_num_threads()
      myth = omp_get_thread_num()
      rsz = nr/nths
@@ -213,7 +212,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
      !$omp end master
      !$omp barrier
-      !$omp do schedule(static)
+      !$omp do schedule(static) private(disjoint) reduction(max: info)
      do kk=0, nths-1
        step1: do ii=bnds(kk), bnds(kk+1)-1
          if (info /= 0) cycle
@@ -257,23 +256,31 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          ! as yet unconnected, turn it into the next aggregate.
          ! Same if ip==0 (in which case, neighborhood only
          ! contains I even if it does not look like it from matrix)
+          ! Because DISJOINT is private and not evaluated under a lock,
+          ! the result is not exactly repeatable: between the evaluation
+          ! of DISJOINT and the assignments below, another thread may
+          ! alter the values of ILAGGR.
+          ! However, some non-repeatability is already present, because
+          ! the sequence of aggregates is computed in a different order
+          ! than in serial mode.
          !
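+          ! The encoding used below packs the provisional aggregate
+          ! index and the owning thread into a single value,
+          !   ITMP = (BNDS(KK)-1+LOCNAGGR(KK))*NTHS + KK,
+          ! and thus replaces the old IOFFS array: the fixup loop at
+          ! the end of the parallel section recovers the thread as
+          ! MOD(ILAGGR(II),NTHS) and the index as ILAGGR(II)/NTHS.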
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) - ioffs(icol(k)) = kk - end do - ilaggr(i) = bnds(kk)-1+locnaggr(kk) - ioffs(i) = kk + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + info = 12345678 + cycle step1 end if - !$omp end critical(update_ilaggr) + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do end if - end if enddo step1 end do @@ -293,9 +300,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) then - kp = ioffs(ii) - ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) end if end do end do @@ -303,6 +310,12 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end block !$omp end parallel end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else step1: do ii=1, nr if (info /= 0) cycle diff --git a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 index 4bb17a80..e94261a8 100644 --- a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 @@ -71,6 +71,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_s_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,104 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! 
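! The loop below computes, on a copy of A, the scaled measure
!   muij(i,j) = |a(i,j)| / sqrt(|a(i,i)*a(j,j)|)
! for local column indices j <= nr; this is the quantity that is
! compared against THETA to classify links as strong or weak.
!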
call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = sone + s_neigh_coo%val(k) = sone else - s_neigh_coo%val(ip) = -sone + s_neigh_coo%val(k) = -sone end if + else + s_neigh_coo%val(k) = -sone end if end do end do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + icnt = 0 + step1: do ii=1, nr + i = idxs(ii) + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + if (disjoint) then + icnt = icnt + 1 + naggr = naggr + 1 + do k=1, nzcnt + ilaggr(icol(k)) = naggr + end do + ilaggr(i) = naggr + end if + endif + enddo step1 + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,7 +288,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1)) diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index 697a55b3..40a85dae 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -87,7 +87,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_), intent(out) :: info ! 
Local variables
-  integer(psb_ipk_), allocatable  :: ioffs(:), neigh(:), irow(:), icol(:),&
+  integer(psb_ipk_), allocatable  :: neigh(:), irow(:), icol(:),&
       & ideg(:), idxs(:)
  integer(psb_lpk_), allocatable  :: tmpaggr(:)
  complex(psb_dpk_), allocatable  :: val(:), diag(:)
@@ -130,7 +130,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
  nr = a%get_nrows()
  nc = a%get_ncols()
-  allocate(ilaggr(nr),ioffs(nr),neigh(nr),ideg(nr),idxs(nr),&
+  allocate(ilaggr(nr),neigh(nr),ideg(nr),idxs(nr),&
       & icol(nc),val(nc),stat=info)
  if(info /= psb_success_) then
    info=psb_err_alloc_request_
@@ -151,19 +151,17 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
  if (do_timings) call psb_toc(idx_soc1_p0)
  if (clean_zeros) call acsr%clean_zeros(info)
  if (iorder == amg_aggr_ord_nat_) then
-    !$omp parallel do private(i)
+    !$omp parallel do private(i) schedule(static)
    do i=1, nr
      ilaggr(i) = -(nr+1)
      idxs(i)   = i
-      ioffs(i)  = 0
    end do
    !$omp end parallel do
  else
-    !$omp parallel do private(i)
+    !$omp parallel do private(i) schedule(static)
    do i=1, nr
      ilaggr(i) = -(nr+1)
      ideg(i)   = acsr%irp(i+1) - acsr%irp(i)
-      ioffs(i)  = 0
    end do
    !$omp end parallel do
    call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
@@ -189,11 +187,12 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
    ! been set because it is strongly connected to an entry J belonging to a
    ! different thread.
-
-    !$omp parallel shared(bnds,ioffs,locnaggr,ilaggr,nr,naggr,diag,theta,nths) &
+    info = 0
+    !$omp parallel shared(bnds,locnaggr,ilaggr,nr,naggr,diag,theta,nths) &
    !$omp private(icol,val,myth,kk)
    block
      integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
+      integer(psb_lpk_) :: itmp
      nths = omp_get_num_threads()
      myth = omp_get_thread_num()
      rsz = nr/nths
@@ -213,7 +212,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
      !$omp end master
      !$omp barrier
-      !$omp do schedule(static)
+      !$omp do schedule(static) private(disjoint) reduction(max: info)
      do kk=0, nths-1
        step1: do ii=bnds(kk), bnds(kk+1)-1
          if (info /= 0) cycle
@@ -257,23 +256,31 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
          ! as yet unconnected, turn it into the next aggregate.
          ! Same if ip==0 (in which case, neighborhood only
          ! contains I even if it does not look like it from matrix)
+          ! Because DISJOINT is private and not evaluated under a lock,
+          ! the result is not exactly repeatable: between the evaluation
+          ! of DISJOINT and the assignments below, another thread may
+          ! alter the values of ILAGGR.
+          ! However, some non-repeatability is already present, because
+          ! the sequence of aggregates is computed in a different order
+          ! than in serial mode.
          !
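+          ! The encoding used below packs the provisional aggregate
+          ! index and the owning thread into a single value,
+          !   ITMP = (BNDS(KK)-1+LOCNAGGR(KK))*NTHS + KK,
+          ! and thus replaces the old IOFFS array: the fixup loop at
+          ! the end of the parallel section recovers the thread as
+          ! MOD(ILAGGR(II),NTHS) and the index as ILAGGR(II)/NTHS.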
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - !$omp critical(update_ilaggr) - disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) - if (disjoint) then - locnaggr(kk) = locnaggr(kk) + 1 - do k=1, ip - ilaggr(icol(k)) = bnds(kk)-1+locnaggr(kk) - ioffs(icol(k)) = kk - end do - ilaggr(i) = bnds(kk)-1+locnaggr(kk) - ioffs(i) = kk + if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + info = 12345678 + cycle step1 end if - !$omp end critical(update_ilaggr) + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, ip + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do end if - end if enddo step1 end do @@ -293,9 +300,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in !$omp do schedule(static) do kk=0, nths-1 do ii=bnds(kk), bnds(kk+1)-1 - if (ilaggr(ii) > 0) then - kp = ioffs(ii) - ilaggr(ii) = ilaggr(ii)- (bnds(kp)-1) + locnaggr(kp) + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) end if end do end do @@ -303,6 +310,12 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end block !$omp end parallel end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else step1: do ii=1, nr if (info /= 0) cycle diff --git a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 index c1b165b1..e09bcf1e 100644 --- a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 @@ -71,6 +71,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in use psb_base_mod use amg_base_prec_type use amg_z_inner_mod +#if defined(OPENMP) + use omp_lib +#endif implicit none @@ -99,6 +102,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in integer(psb_ipk_) :: np, me integer(psb_ipk_) :: nrow, ncol, n_ne character(len=20) :: name, ch_err + integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1 + integer(psb_ipk_), save :: idx_soc2_p0=-1 + logical, parameter :: do_timings=.true. info=psb_success_ name = 'amg_soc2_map_bld' @@ -114,6 +120,14 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in nrow = desc_a%get_local_rows() ncol = desc_a%get_local_cols() nrglob = desc_a%get_global_rows() + if ((do_timings).and.(idx_soc2_p0==-1)) & + & idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0") + if ((do_timings).and.(idx_soc2_p1==-1)) & + & idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1") + if ((do_timings).and.(idx_soc2_p2==-1)) & + & idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2") + if ((do_timings).and.(idx_soc2_p3==-1)) & + & idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3") nr = a%get_nrows() nc = a%get_ncols() @@ -125,6 +139,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in goto 9999 end if + if (do_timings) call psb_tic(idx_soc2_p0) diag = a%get_diag(info) if(info /= psb_success_) then info=psb_err_from_subroutine_ @@ -137,55 +152,104 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! 
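! The loop below computes, on a copy of A, the scaled measure
!   muij(i,j) = |a(i,j)| / sqrt(|a(i,i)*a(j,j)|)
! for local column indices j <= nr; this is the quantity that is
! compared against THETA to classify links as strong or weak.
!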
call a%cp_to(muij) if (clean_zeros) call muij%clean_zeros(info) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) end do end do - + !$omp end parallel do ! ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) - ip = 0 do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) + s_neigh_coo%ia(k) = i + s_neigh_coo%ja(k) = j if (j<=nr) then - ip = ip + 1 - s_neigh_coo%ia(ip) = i - s_neigh_coo%ja(ip) = j if (real(muij%val(k)) >= theta) then - s_neigh_coo%val(ip) = done + s_neigh_coo%val(k) = done else - s_neigh_coo%val(ip) = -done + s_neigh_coo%val(k) = -done end if + else + s_neigh_coo%val(k) = -done end if end do end do !write(*,*) 'S_NEIGH: ',nr,ip - call s_neigh_coo%set_nzeros(ip) + call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) - if (iorder == amg_aggr_ord_nat_) then + if (iorder == amg_aggr_ord_nat_) then + + !$omp parallel do private(i) shared(ilaggr,idxs) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) idxs(i) = i end do + !$omp end parallel do else + !$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static) do i=1, nr ilaggr(i) = -(nr+1) ideg(i) = muij%irp(i+1) - muij%irp(i) end do + !$omp end parallel do call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) end if + if (do_timings) call psb_toc(idx_soc2_p0) + if (do_timings) call psb_tic(idx_soc2_p1) ! ! Phase one: Start with disjoint groups. ! naggr = 0 +#if defined(OPENMP) + icnt = 0 + step1: do ii=1, nr + i = idxs(ii) + + if (ilaggr(i) == -(nr+1)) then + ! + ! Get the 1-neighbourhood of I + ! + ip1 = s_neigh%irp(i) + nz = s_neigh%irp(i+1)-ip1 + ! + ! If the neighbourhood only contains I, skip it + ! + if (nz ==0) then + ilaggr(i) = 0 + cycle step1 + end if + if ((nz==1).and.(s_neigh%ja(ip1)==i)) then + ilaggr(i) = 0 + cycle step1 + end if + ! + ! If the whole strongly coupled neighborhood of I is + ! as yet unconnected, turn it into the next aggregate. + ! + nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0) + icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0)) + disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1)) + if (disjoint) then + icnt = icnt + 1 + naggr = naggr + 1 + do k=1, nzcnt + ilaggr(icol(k)) = naggr + end do + ilaggr(i) = naggr + end if + endif + enddo step1 + +#else icnt = 0 step1: do ii=1, nr i = idxs(ii) @@ -224,7 +288,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if endif enddo step1 - +#endif if (debug_level >= psb_debug_outer_) then write(debug_unit,*) me,' ',trim(name),& & ' Check 1:',count(ilaggr == -(nr+1))
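A minimal, self-contained sketch (not part of the patch) of the ILAGGR
encode/decode round trip introduced in the soc1 phase-1 loops above; the
integer kind is a stand-in for psb_lpk_ and the sample values stand in
for actual bnds/locnaggr data:

  program check_ilaggr_encoding
    implicit none
    integer, parameter :: lpk = selected_int_kind(12)  ! stand-in for psb_lpk_
    integer(lpk) :: itmp, idx, dec_idx
    integer      :: nths, kk, dec_kk

    nths = 8          ! omp_get_num_threads()
    kk   = 3          ! thread owning the aggregate
    idx  = 1000_lpk   ! plays the role of bnds(kk)-1+locnaggr(kk)

    ! Encode: pack aggregate index and owning thread into one value
    itmp = idx*nths + kk
    ! Overflow test as in the patch: a wrapped product becomes smaller
    if (itmp < idx) stop 'Overflow in encoding ILAGGR'

    ! Decode, as in the fixup loop at the end of the parallel section
    dec_kk  = int(mod(itmp, int(nths, lpk)))
    dec_idx = itmp/nths

    write(*,*) 'round trip ok: ', (dec_idx == idx .and. dec_kk == kk)
  end program check_ilaggr_encoding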