Fixed SOC1 and SOC2 OpenMP

dev-openmp
sfilippone 1 year ago
parent 73495edf09
commit 5bcd36f394

@@ -393,7 +393,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
   tmpaggr = ilaggr
   !$omp end workshare
   !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
-  !$omp private(ii,i,j,k,nz,icol,val,ip)
+  !$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
   step2: do ii=1,nr
     i = idxs(ii)
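
The one-line fix above adds cpling to the PRIVATE clause: under the region's default sharing rules a scalar work variable left out of the list is shared, so concurrent iterations overwrite each other's value. A minimal, self-contained sketch of the same hazard and fix (toy names and data, not the library code):

program private_clause_demo
  implicit none
  integer, parameter :: n = 1000
  integer :: i, cpling, out(n)
  ! cpling is a per-iteration temporary; omitting it from PRIVATE would
  ! make it shared and introduce exactly the race the commit fixes.
  !$omp parallel do schedule(static) shared(out) private(i,cpling)
  do i = 1, n
     cpling = 2*i
     out(i) = cpling + 1
  end do
  !$omp end parallel do
  print *, 'all entries correct: ', all(out == [(2*i+1, i=1,n)])
end program private_clause_demo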

@@ -164,6 +164,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
   ! Compute the 1-neigbour; mark strong links with +1, weak links with -1
   !
   call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
+  !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
   do i=1, nr
     do k=muij%irp(i),muij%irp(i+1)-1
       j = muij%ja(k)
@@ -180,6 +181,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
       end if
     end do
   end do
+  !$omp end parallel do
   !write(*,*) 'S_NEIGH: ',nr,ip
   call s_neigh_coo%set_nzeros(muij%get_nzeros())
   call s_neigh%mv_from_coo(s_neigh_coo,info)
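
The hunk above simply brackets an existing CSR traversal with a parallel do: each row i touches only entries belonging to row i, so rows can be distributed across threads with no synchronization. A hedged, self-contained sketch of the pattern (the names irp/ja/val mirror the CSR convention used here, but this toy program is not the amg_*_soc2_map_bld code):

program csr_row_parallel
  implicit none
  integer, parameter :: nr = 4, nnz = 6
  integer :: irp(nr+1), ja(nnz), i, j, k
  real    :: val(nnz), rowsum(nr)
  irp = [1, 3, 4, 5, 7]           ! row i occupies positions irp(i):irp(i+1)-1
  ja  = [1, 2, 2, 3, 1, 4]        ! column indices
  val = [1., 2., 3., 4., 5., 6.]  ! values
  !$omp parallel do private(i,j,k) shared(irp,ja,val,rowsum) schedule(static)
  do i = 1, nr
     rowsum(i) = 0.
     do k = irp(i), irp(i+1)-1
        j = ja(k)                 ! column of the k-th stored entry
        rowsum(i) = rowsum(i) + val(k)
     end do
  end do
  !$omp end parallel do
  print *, 'row sums: ', rowsum
end program csr_row_parallel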
@@ -210,44 +212,155 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
   !
   naggr = 0
 #if defined(OPENMP)
-  icnt = 0
-  step1: do ii=1, nr
-    i = idxs(ii)
-
-    if (ilaggr(i) == -(nr+1)) then
-      !
-      ! Get the 1-neighbourhood of I
-      !
-      ip1 = s_neigh%irp(i)
-      nz  = s_neigh%irp(i+1)-ip1
-      !
-      ! If the neighbourhood only contains I, skip it
-      !
-      if (nz ==0) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      !
-      ! If the whole strongly coupled neighborhood of I is
-      ! as yet unconnected, turn it into the next aggregate.
-      !
-      nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
-      icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
-      disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
-      if (disjoint) then
-        icnt = icnt + 1
-        naggr = naggr + 1
-        do k=1, nzcnt
-          ilaggr(icol(k)) = naggr
-        end do
-        ilaggr(i) = naggr
-      end if
-    endif
-  enddo step1
+  block
+    integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
+    integer(psb_ipk_) :: myth,nths, kk
+    ! The parallelization makes use of a locaggr(:) array; each thread
+    ! keeps its own version of naggr, and when the loop ends, a prefix is applied
+    ! to locnaggr to determine:
+    ! 1. The total number of aggregaters NAGGR;
+    ! 2. How much should each thread shift its own aggregates
+    ! Part 2 requires to keep track of which thread defined each entry
+    ! of ilaggr(), so that each entry can be adjusted correctly: even
+    ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have
+    ! been set because it is strongly connected to an entry J belonging to a
+    ! different thread.
+    !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
+    !$omp private(icol,val,myth,kk)
+    block
+      integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
+      integer(psb_lpk_) :: itmp
+      !$omp master
+      nths = omp_get_num_threads()
+      allocate(bnds(0:nths),locnaggr(0:nths+1))
+      locnaggr(:) = 0
+      bnds(0) = 1
+      !$omp end master
+      !$omp barrier
+      myth = omp_get_thread_num()
+      rsz = nr/nths
+      if (myth < mod(nr,nths)) rsz = rsz + 1
+      bnds(myth+1) = rsz
+      !$omp barrier
+      !$omp master
+      do i=1,nths
+        bnds(i) = bnds(i) + bnds(i-1)
+      end do
+      info = 0
+      !$omp end master
+      !$omp barrier
+
+      !$omp do schedule(static) private(disjoint)
+      do kk=0, nths-1
+        step1: do ii=bnds(kk), bnds(kk+1)-1
+          i = idxs(ii)
+          if (info /= 0) then
+            write(0,*) ' Step1:',kk,ii,i,info
+            cycle step1
+          end if
+          if ((i<1).or.(i>nr)) then
+            !$omp atomic write
+            info=psb_err_internal_error_
+            !$omp end atomic
+            call psb_errpush(info,name)
+            cycle step1
+            !goto 9999
+          end if
+          if (ilaggr(i) == -(nr+1)) then
+            !
+            ! Get the 1-neighbourhood of I
+            !
+            ip1 = s_neigh%irp(i)
+            nz  = s_neigh%irp(i+1)-ip1
+            !
+            ! If the neighbourhood only contains I, skip it
+            !
+            if (nz ==0) then
+              ilaggr(i) = 0
+              cycle step1
+            end if
+            if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
+              ilaggr(i) = 0
+              cycle step1
+            end if
+            nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
+            icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
+            disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
+            !
+            ! If the whole strongly coupled neighborhood of I is
+            ! as yet unconnected, turn it into the next aggregate.
+            ! Same if ip==0 (in which case, neighborhood only
+            ! contains I even if it does not look like it from matrix)
+            ! The fact that DISJOINT is private and not under lock
+            ! generates a certain un-repeatability, in that between
+            ! computing DISJOINT and assigning, another thread might
+            ! alter the values of ILAGGR.
+            ! However, a certain unrepeatability is already present
+            ! because the sequence of aggregates is computed with a
+            ! different order than in serial mode.
+            ! In any case, even if the enteries of ILAGGR may be
+            ! overwritten, the important thing is that each entry is
+            ! consistent and they generate a correct aggregation map.
+            !
+            if (disjoint) then
+              locnaggr(kk) = locnaggr(kk) + 1
+              itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
+              if (itmp < (bnds(kk)-1+locnaggr(kk))) then
+                !$omp atomic update
+                info = max(12345678,info)
+                !$omp end atomic
+                cycle step1
+              end if
+              !$omp atomic write
+              ilaggr(i) = itmp
+              !$omp end atomic
+              do k=1, nzcnt
+                !$omp atomic write
+                ilaggr(icol(k)) = itmp
+                !$omp end atomic
+              end do
+            end if
+          end if
+        enddo step1
+      end do
+      !$omp end do
+      !$omp master
+      naggr = sum(locnaggr(0:nths-1))
+      do i=1,nths
+        locnaggr(i) = locnaggr(i) + locnaggr(i-1)
+      end do
+      do i=nths+1,1,-1
+        locnaggr(i) = locnaggr(i-1)
+      end do
+      locnaggr(0) = 0
+      !write(0,*) 'LNAG ',locnaggr(nths+1)
+      !$omp end master
+      !$omp barrier
+      !$omp do schedule(static)
+      do kk=0, nths-1
+        do ii=bnds(kk), bnds(kk+1)-1
+          if (ilaggr(ii) > 0) then
+            kp = mod(ilaggr(ii),nths)
+            ilaggr(ii) = (ilaggr(ii)/nths) - (bnds(kp)-1) + locnaggr(kp)
+          end if
+        end do
+      end do
+      !$omp end do
+    end block
+    !$omp end parallel
+  end block
+  if (info /= 0) then
+    if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
+    info=psb_err_internal_error_
+    call psb_errpush(info,name)
+    goto 9999
+  end if
 #else
   icnt = 0
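
The heart of the hunk above is the temporary encoding of aggregate ids: thread kk labels its p-th new aggregate with itmp = (bnds(kk)-1+p)*nths + kk, so mod(itmp,nths) later recovers the owning thread and itmp/nths recovers bnds(kk)-1+p; the final pass subtracts bnds(kp)-1 and adds the exclusive prefix sum of per-thread counts, yielding a contiguous global numbering 1..naggr. The serial worked example below is illustrative only (assumed from the code shown, not part of the commit); shift(:) plays the role of locnaggr(:) after the two master loops:

program encode_decode_demo
  implicit none
  integer, parameter :: nths = 3
  integer :: bnds(0:nths), cnt(0:nths-1), shift(0:nths)
  integer :: kk, p, itmp, kp, gid, i
  bnds = [1, 5, 9, 13]     ! three threads, four rows each (nr = 12)
  cnt  = [2, 3, 1]         ! aggregates created by threads 0, 1, 2
  shift(0) = 0             ! exclusive prefix sum of the counts
  do i = 1, nths
     shift(i) = shift(i-1) + cnt(i-1)
  end do
  do kk = 0, nths-1
     do p = 1, cnt(kk)
        itmp = (bnds(kk)-1+p)*nths + kk            ! encode rank and owner
        kp   = mod(itmp, nths)                     ! decode owner thread
        gid  = itmp/nths - (bnds(kp)-1) + shift(kp)
        print *, 'thread', kk, ' local', p, ' encoded', itmp, ' -> global', gid
     end do
  end do
end program encode_decode_demo

Running it gives globals 1..6 with no gaps, matching naggr = sum(cnt), which is why the renumbering pass needs no further synchronization.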

@@ -393,7 +393,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
   tmpaggr = ilaggr
   !$omp end workshare
   !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
-  !$omp private(ii,i,j,k,nz,icol,val,ip)
+  !$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
   step2: do ii=1,nr
     i = idxs(ii)

@@ -164,6 +164,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
   ! Compute the 1-neigbour; mark strong links with +1, weak links with -1
   !
   call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
+  !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
   do i=1, nr
     do k=muij%irp(i),muij%irp(i+1)-1
       j = muij%ja(k)
@@ -180,6 +181,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
       end if
     end do
   end do
+  !$omp end parallel do
   !write(*,*) 'S_NEIGH: ',nr,ip
   call s_neigh_coo%set_nzeros(muij%get_nzeros())
   call s_neigh%mv_from_coo(s_neigh_coo,info)
@@ -210,44 +212,155 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
   !
   naggr = 0
 #if defined(OPENMP)
-  icnt = 0
-  step1: do ii=1, nr
-    i = idxs(ii)
-
-    if (ilaggr(i) == -(nr+1)) then
-      !
-      ! Get the 1-neighbourhood of I
-      !
-      ip1 = s_neigh%irp(i)
-      nz  = s_neigh%irp(i+1)-ip1
-      !
-      ! If the neighbourhood only contains I, skip it
-      !
-      if (nz ==0) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      !
-      ! If the whole strongly coupled neighborhood of I is
-      ! as yet unconnected, turn it into the next aggregate.
-      !
-      nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
-      icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
-      disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
-      if (disjoint) then
-        icnt = icnt + 1
-        naggr = naggr + 1
-        do k=1, nzcnt
-          ilaggr(icol(k)) = naggr
-        end do
-        ilaggr(i) = naggr
-      end if
-    endif
-  enddo step1
+  block
+    integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
+    integer(psb_ipk_) :: myth,nths, kk
+    ! The parallelization makes use of a locaggr(:) array; each thread
+    ! keeps its own version of naggr, and when the loop ends, a prefix is applied
+    ! to locnaggr to determine:
+    ! 1. The total number of aggregaters NAGGR;
+    ! 2. How much should each thread shift its own aggregates
+    ! Part 2 requires to keep track of which thread defined each entry
+    ! of ilaggr(), so that each entry can be adjusted correctly: even
+    ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have
+    ! been set because it is strongly connected to an entry J belonging to a
+    ! different thread.
+    !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
+    !$omp private(icol,val,myth,kk)
+    block
+      integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
+      integer(psb_lpk_) :: itmp
+      !$omp master
+      nths = omp_get_num_threads()
+      allocate(bnds(0:nths),locnaggr(0:nths+1))
+      locnaggr(:) = 0
+      bnds(0) = 1
+      !$omp end master
+      !$omp barrier
+      myth = omp_get_thread_num()
+      rsz = nr/nths
+      if (myth < mod(nr,nths)) rsz = rsz + 1
+      bnds(myth+1) = rsz
+      !$omp barrier
+      !$omp master
+      do i=1,nths
+        bnds(i) = bnds(i) + bnds(i-1)
+      end do
+      info = 0
+      !$omp end master
+      !$omp barrier
+
+      !$omp do schedule(static) private(disjoint)
+      do kk=0, nths-1
+        step1: do ii=bnds(kk), bnds(kk+1)-1
+          i = idxs(ii)
+          if (info /= 0) then
+            write(0,*) ' Step1:',kk,ii,i,info
+            cycle step1
+          end if
+          if ((i<1).or.(i>nr)) then
+            !$omp atomic write
+            info=psb_err_internal_error_
+            !$omp end atomic
+            call psb_errpush(info,name)
+            cycle step1
+            !goto 9999
+          end if
+          if (ilaggr(i) == -(nr+1)) then
+            !
+            ! Get the 1-neighbourhood of I
+            !
+            ip1 = s_neigh%irp(i)
+            nz  = s_neigh%irp(i+1)-ip1
+            !
+            ! If the neighbourhood only contains I, skip it
+            !
+            if (nz ==0) then
+              ilaggr(i) = 0
+              cycle step1
+            end if
+            if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
+              ilaggr(i) = 0
+              cycle step1
+            end if
+            nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
+            icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
+            disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
+            !
+            ! If the whole strongly coupled neighborhood of I is
+            ! as yet unconnected, turn it into the next aggregate.
+            ! Same if ip==0 (in which case, neighborhood only
+            ! contains I even if it does not look like it from matrix)
+            ! The fact that DISJOINT is private and not under lock
+            ! generates a certain un-repeatability, in that between
+            ! computing DISJOINT and assigning, another thread might
+            ! alter the values of ILAGGR.
+            ! However, a certain unrepeatability is already present
+            ! because the sequence of aggregates is computed with a
+            ! different order than in serial mode.
+            ! In any case, even if the enteries of ILAGGR may be
+            ! overwritten, the important thing is that each entry is
+            ! consistent and they generate a correct aggregation map.
+            !
+            if (disjoint) then
+              locnaggr(kk) = locnaggr(kk) + 1
+              itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
+              if (itmp < (bnds(kk)-1+locnaggr(kk))) then
+                !$omp atomic update
+                info = max(12345678,info)
+                !$omp end atomic
+                cycle step1
+              end if
+              !$omp atomic write
+              ilaggr(i) = itmp
+              !$omp end atomic
+              do k=1, nzcnt
+                !$omp atomic write
+                ilaggr(icol(k)) = itmp
+                !$omp end atomic
+              end do
+            end if
+          end if
+        enddo step1
+      end do
+      !$omp end do
+      !$omp master
+      naggr = sum(locnaggr(0:nths-1))
+      do i=1,nths
+        locnaggr(i) = locnaggr(i) + locnaggr(i-1)
+      end do
+      do i=nths+1,1,-1
+        locnaggr(i) = locnaggr(i-1)
+      end do
+      locnaggr(0) = 0
+      !write(0,*) 'LNAG ',locnaggr(nths+1)
+      !$omp end master
+      !$omp barrier
+      !$omp do schedule(static)
+      do kk=0, nths-1
+        do ii=bnds(kk), bnds(kk+1)-1
+          if (ilaggr(ii) > 0) then
+            kp = mod(ilaggr(ii),nths)
+            ilaggr(ii) = (ilaggr(ii)/nths) - (bnds(kp)-1) + locnaggr(kp)
+          end if
+        end do
+      end do
+      !$omp end do
+    end block
+    !$omp end parallel
+  end block
+  if (info /= 0) then
+    if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
+    info=psb_err_internal_error_
+    call psb_errpush(info,name)
+    goto 9999
+  end if
 #else
   icnt = 0
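
Before the step1 loop, the hunk above partitions rows across threads: each thread computes rsz = nr/nths, plus one if its id is below mod(nr,nths), stores it in bnds(myth+1), and a prefix sum on the master turns the sizes into boundaries so thread kk owns rows bnds(kk)..bnds(kk+1)-1. The sketch below isolates just that idiom with plain default integers (an assumption; the library uses psb_ipk_):

program bnds_demo
  use omp_lib
  implicit none
  integer, parameter :: nr = 10
  integer, allocatable :: bnds(:)
  integer :: myth, nths, rsz, i
  !$omp parallel shared(bnds,nths) private(myth,rsz)
  !$omp master
  nths = omp_get_num_threads()
  allocate(bnds(0:nths))
  bnds(0) = 1
  !$omp end master
  !$omp barrier
  myth = omp_get_thread_num()
  rsz  = nr/nths
  if (myth < mod(nr,nths)) rsz = rsz + 1
  bnds(myth+1) = rsz          ! each thread records its chunk size
  !$omp barrier
  !$omp master
  do i = 1, nths              ! sizes -> boundaries (prefix sum)
     bnds(i) = bnds(i) + bnds(i-1)
  end do
  !$omp end master
  !$omp end parallel
  print *, 'bnds = ', bnds    ! e.g. 1 5 8 11 with 3 threads; bnds(nths)-1 == nr
end program bnds_demo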

@@ -393,7 +393,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
   tmpaggr = ilaggr
   !$omp end workshare
   !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
-  !$omp private(ii,i,j,k,nz,icol,val,ip)
+  !$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
   step2: do ii=1,nr
     i = idxs(ii)

@@ -164,6 +164,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
   ! Compute the 1-neigbour; mark strong links with +1, weak links with -1
   !
   call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
+  !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
   do i=1, nr
     do k=muij%irp(i),muij%irp(i+1)-1
       j = muij%ja(k)
@@ -180,6 +181,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
       end if
     end do
   end do
+  !$omp end parallel do
   !write(*,*) 'S_NEIGH: ',nr,ip
   call s_neigh_coo%set_nzeros(muij%get_nzeros())
   call s_neigh%mv_from_coo(s_neigh_coo,info)
@@ -210,44 +212,155 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
   !
   naggr = 0
 #if defined(OPENMP)
-  icnt = 0
-  step1: do ii=1, nr
-    i = idxs(ii)
-
-    if (ilaggr(i) == -(nr+1)) then
-      !
-      ! Get the 1-neighbourhood of I
-      !
-      ip1 = s_neigh%irp(i)
-      nz  = s_neigh%irp(i+1)-ip1
-      !
-      ! If the neighbourhood only contains I, skip it
-      !
-      if (nz ==0) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      !
-      ! If the whole strongly coupled neighborhood of I is
-      ! as yet unconnected, turn it into the next aggregate.
-      !
-      nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
-      icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
-      disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
-      if (disjoint) then
-        icnt = icnt + 1
-        naggr = naggr + 1
-        do k=1, nzcnt
-          ilaggr(icol(k)) = naggr
-        end do
-        ilaggr(i) = naggr
-      end if
-    endif
-  enddo step1
+  block
+    integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
+    integer(psb_ipk_) :: myth,nths, kk
+    ! The parallelization makes use of a locaggr(:) array; each thread
+    ! keeps its own version of naggr, and when the loop ends, a prefix is applied
+    ! to locnaggr to determine:
+    ! 1. The total number of aggregaters NAGGR;
+    ! 2. How much should each thread shift its own aggregates
+    ! Part 2 requires to keep track of which thread defined each entry
+    ! of ilaggr(), so that each entry can be adjusted correctly: even
+    ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have
+    ! been set because it is strongly connected to an entry J belonging to a
+    ! different thread.
+    !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
+    !$omp private(icol,val,myth,kk)
+    block
+      integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
+      integer(psb_lpk_) :: itmp
+      !$omp master
+      nths = omp_get_num_threads()
+      allocate(bnds(0:nths),locnaggr(0:nths+1))
+      locnaggr(:) = 0
+      bnds(0) = 1
+      !$omp end master
+      !$omp barrier
+      myth = omp_get_thread_num()
+      rsz = nr/nths
+      if (myth < mod(nr,nths)) rsz = rsz + 1
+      bnds(myth+1) = rsz
+      !$omp barrier
+      !$omp master
+      do i=1,nths
+        bnds(i) = bnds(i) + bnds(i-1)
+      end do
+      info = 0
+      !$omp end master
+      !$omp barrier
+
+      !$omp do schedule(static) private(disjoint)
+      do kk=0, nths-1
+        step1: do ii=bnds(kk), bnds(kk+1)-1
+          i = idxs(ii)
+          if (info /= 0) then
+            write(0,*) ' Step1:',kk,ii,i,info
+            cycle step1
+          end if
+          if ((i<1).or.(i>nr)) then
+            !$omp atomic write
+            info=psb_err_internal_error_
+            !$omp end atomic
+            call psb_errpush(info,name)
+            cycle step1
+            !goto 9999
+          end if
+          if (ilaggr(i) == -(nr+1)) then
+            !
+            ! Get the 1-neighbourhood of I
+            !
+            ip1 = s_neigh%irp(i)
+            nz  = s_neigh%irp(i+1)-ip1
+            !
+            ! If the neighbourhood only contains I, skip it
+            !
+            if (nz ==0) then
+              ilaggr(i) = 0
+              cycle step1
+            end if
+            if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
+              ilaggr(i) = 0
+              cycle step1
+            end if
+            nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
+            icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
+            disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
+            !
+            ! If the whole strongly coupled neighborhood of I is
+            ! as yet unconnected, turn it into the next aggregate.
+            ! Same if ip==0 (in which case, neighborhood only
+            ! contains I even if it does not look like it from matrix)
+            ! The fact that DISJOINT is private and not under lock
+            ! generates a certain un-repeatability, in that between
+            ! computing DISJOINT and assigning, another thread might
+            ! alter the values of ILAGGR.
+            ! However, a certain unrepeatability is already present
+            ! because the sequence of aggregates is computed with a
+            ! different order than in serial mode.
+            ! In any case, even if the enteries of ILAGGR may be
+            ! overwritten, the important thing is that each entry is
+            ! consistent and they generate a correct aggregation map.
+            !
+            if (disjoint) then
+              locnaggr(kk) = locnaggr(kk) + 1
+              itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
+              if (itmp < (bnds(kk)-1+locnaggr(kk))) then
+                !$omp atomic update
+                info = max(12345678,info)
+                !$omp end atomic
+                cycle step1
+              end if
+              !$omp atomic write
+              ilaggr(i) = itmp
+              !$omp end atomic
+              do k=1, nzcnt
+                !$omp atomic write
+                ilaggr(icol(k)) = itmp
+                !$omp end atomic
+              end do
+            end if
+          end if
+        enddo step1
+      end do
+      !$omp end do
+      !$omp master
+      naggr = sum(locnaggr(0:nths-1))
+      do i=1,nths
+        locnaggr(i) = locnaggr(i) + locnaggr(i-1)
+      end do
+      do i=nths+1,1,-1
+        locnaggr(i) = locnaggr(i-1)
+      end do
+      locnaggr(0) = 0
+      !write(0,*) 'LNAG ',locnaggr(nths+1)
+      !$omp end master
+      !$omp barrier
+      !$omp do schedule(static)
+      do kk=0, nths-1
+        do ii=bnds(kk), bnds(kk+1)-1
+          if (ilaggr(ii) > 0) then
+            kp = mod(ilaggr(ii),nths)
+            ilaggr(ii) = (ilaggr(ii)/nths) - (bnds(kp)-1) + locnaggr(kp)
+          end if
+        end do
+      end do
+      !$omp end do
+    end block
+    !$omp end parallel
+  end block
+  if (info /= 0) then
+    if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
+    info=psb_err_internal_error_
+    call psb_errpush(info,name)
+    goto 9999
+  end if
 #else
   icnt = 0
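
The commit's comment block concedes that several threads may race to claim the same ILAGGR entry and relies on ATOMIC WRITE only to keep each entry whole: some thread's value wins, none is torn, and any winner yields a consistent map. A small illustration of that discipline (a toy, not the library code; every thread deliberately writes every entry):

program atomic_write_demo
  use omp_lib
  implicit none
  integer, parameter :: n = 8
  integer :: mark(n), i, t
  mark = -1
  !$omp parallel private(t,i) shared(mark)
  t = omp_get_thread_num() + 1
  do i = 1, n
     !$omp atomic write
     mark(i) = t        ! last writer wins; the value is never a torn mix
     !$omp end atomic
  end do
  !$omp end parallel
  print *, 'mark = ', mark  ! whole thread ids; which one is unspecified
end program atomic_write_demo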

@@ -393,7 +393,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
   tmpaggr = ilaggr
   !$omp end workshare
   !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
-  !$omp private(ii,i,j,k,nz,icol,val,ip)
+  !$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
   step2: do ii=1,nr
     i = idxs(ii)

@@ -164,6 +164,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
   ! Compute the 1-neigbour; mark strong links with +1, weak links with -1
   !
   call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
+  !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
   do i=1, nr
     do k=muij%irp(i),muij%irp(i+1)-1
       j = muij%ja(k)
@@ -180,6 +181,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
       end if
     end do
   end do
+  !$omp end parallel do
   !write(*,*) 'S_NEIGH: ',nr,ip
   call s_neigh_coo%set_nzeros(muij%get_nzeros())
   call s_neigh%mv_from_coo(s_neigh_coo,info)
@@ -210,44 +212,155 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
   !
   naggr = 0
 #if defined(OPENMP)
-  icnt = 0
-  step1: do ii=1, nr
-    i = idxs(ii)
-
-    if (ilaggr(i) == -(nr+1)) then
-      !
-      ! Get the 1-neighbourhood of I
-      !
-      ip1 = s_neigh%irp(i)
-      nz  = s_neigh%irp(i+1)-ip1
-      !
-      ! If the neighbourhood only contains I, skip it
-      !
-      if (nz ==0) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      !
-      ! If the whole strongly coupled neighborhood of I is
-      ! as yet unconnected, turn it into the next aggregate.
-      !
-      nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
-      icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
-      disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
-      if (disjoint) then
-        icnt = icnt + 1
-        naggr = naggr + 1
-        do k=1, nzcnt
-          ilaggr(icol(k)) = naggr
-        end do
-        ilaggr(i) = naggr
-      end if
-    endif
-  enddo step1
+  block
+    integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
+    integer(psb_ipk_) :: myth,nths, kk
+    ! The parallelization makes use of a locaggr(:) array; each thread
+    ! keeps its own version of naggr, and when the loop ends, a prefix is applied
+    ! to locnaggr to determine:
+    ! 1. The total number of aggregaters NAGGR;
+    ! 2. How much should each thread shift its own aggregates
+    ! Part 2 requires to keep track of which thread defined each entry
+    ! of ilaggr(), so that each entry can be adjusted correctly: even
+    ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have
+    ! been set because it is strongly connected to an entry J belonging to a
+    ! different thread.
+    !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
+    !$omp private(icol,val,myth,kk)
+    block
+      integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
+      integer(psb_lpk_) :: itmp
+      !$omp master
+      nths = omp_get_num_threads()
+      allocate(bnds(0:nths),locnaggr(0:nths+1))
+      locnaggr(:) = 0
+      bnds(0) = 1
+      !$omp end master
+      !$omp barrier
+      myth = omp_get_thread_num()
+      rsz = nr/nths
+      if (myth < mod(nr,nths)) rsz = rsz + 1
+      bnds(myth+1) = rsz
+      !$omp barrier
+      !$omp master
+      do i=1,nths
+        bnds(i) = bnds(i) + bnds(i-1)
+      end do
+      info = 0
+      !$omp end master
+      !$omp barrier
+
+      !$omp do schedule(static) private(disjoint)
+      do kk=0, nths-1
+        step1: do ii=bnds(kk), bnds(kk+1)-1
+          i = idxs(ii)
+          if (info /= 0) then
+            write(0,*) ' Step1:',kk,ii,i,info
+            cycle step1
+          end if
+          if ((i<1).or.(i>nr)) then
+            !$omp atomic write
+            info=psb_err_internal_error_
+            !$omp end atomic
+            call psb_errpush(info,name)
+            cycle step1
+            !goto 9999
+          end if
+          if (ilaggr(i) == -(nr+1)) then
+            !
+            ! Get the 1-neighbourhood of I
+            !
+            ip1 = s_neigh%irp(i)
+            nz  = s_neigh%irp(i+1)-ip1
+            !
+            ! If the neighbourhood only contains I, skip it
+            !
+            if (nz ==0) then
+              ilaggr(i) = 0
+              cycle step1
+            end if
+            if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
+              ilaggr(i) = 0
+              cycle step1
+            end if
+            nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
+            icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
+            disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
+            !
+            ! If the whole strongly coupled neighborhood of I is
+            ! as yet unconnected, turn it into the next aggregate.
+            ! Same if ip==0 (in which case, neighborhood only
+            ! contains I even if it does not look like it from matrix)
+            ! The fact that DISJOINT is private and not under lock
+            ! generates a certain un-repeatability, in that between
+            ! computing DISJOINT and assigning, another thread might
+            ! alter the values of ILAGGR.
+            ! However, a certain unrepeatability is already present
+            ! because the sequence of aggregates is computed with a
+            ! different order than in serial mode.
+            ! In any case, even if the enteries of ILAGGR may be
+            ! overwritten, the important thing is that each entry is
+            ! consistent and they generate a correct aggregation map.
+            !
+            if (disjoint) then
+              locnaggr(kk) = locnaggr(kk) + 1
+              itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
+              if (itmp < (bnds(kk)-1+locnaggr(kk))) then
+                !$omp atomic update
+                info = max(12345678,info)
+                !$omp end atomic
+                cycle step1
+              end if
+              !$omp atomic write
+              ilaggr(i) = itmp
+              !$omp end atomic
+              do k=1, nzcnt
+                !$omp atomic write
+                ilaggr(icol(k)) = itmp
+                !$omp end atomic
+              end do
+            end if
+          end if
+        enddo step1
+      end do
+      !$omp end do
+      !$omp master
+      naggr = sum(locnaggr(0:nths-1))
+      do i=1,nths
+        locnaggr(i) = locnaggr(i) + locnaggr(i-1)
+      end do
+      do i=nths+1,1,-1
+        locnaggr(i) = locnaggr(i-1)
+      end do
+      locnaggr(0) = 0
+      !write(0,*) 'LNAG ',locnaggr(nths+1)
+      !$omp end master
+      !$omp barrier
+      !$omp do schedule(static)
+      do kk=0, nths-1
+        do ii=bnds(kk), bnds(kk+1)-1
+          if (ilaggr(ii) > 0) then
+            kp = mod(ilaggr(ii),nths)
+            ilaggr(ii) = (ilaggr(ii)/nths) - (bnds(kp)-1) + locnaggr(kp)
+          end if
+        end do
+      end do
+      !$omp end do
+    end block
+    !$omp end parallel
+  end block
+  if (info /= 0) then
+    if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
+    info=psb_err_internal_error_
+    call psb_errpush(info,name)
+    goto 9999
+  end if
 #else
   icnt = 0
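
Finally, the guard `if (itmp < (bnds(kk)-1+locnaggr(kk)))` in the hunks above checks for overflow in the encoding: both itmp and the base value bnds(kk)-1+locnaggr(kk) are nonnegative, so a correctly computed itmp = base*nths+kk can never be smaller than base, and a smaller (wrapped) result raises the 12345678 sentinel that is reported after the parallel region. The sketch below shows the same boundary portably by forming the product in 64 bits first (an illustration with plain integer kinds, not the library's psb_ipk_/psb_lpk_ check verbatim):

program overflow_guard_demo
  use iso_fortran_env, only : int32, int64
  implicit none
  integer, parameter :: nths = 64, kk = 63
  integer(int64), parameter :: big = huge(1_int32)
  integer(int64) :: base, wide
  do base = big/128_int64, big/16_int64, big/64_int64
     wide = base*nths + kk      ! exact in 64 bits
     if (wide > big) then
        print *, 'base', base, ': itmp would wrap in 32 bits (guard fires)'
     else
        print *, 'base', base, ': itmp =', wide, '(fits)'
     end if
  end do
end program overflow_guard_demo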
