From 5bcd36f39411dcd3201072cf48d7ebe2eb3bbf88 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 8 Aug 2023 09:25:15 +0200 Subject: [PATCH] Fixed SOC1 and SOC2 OpenMP --- .../impl/aggregator/amg_c_soc1_map_bld.F90 | 2 +- .../impl/aggregator/amg_c_soc2_map_bld.F90 | 189 ++++++++++++++---- .../impl/aggregator/amg_d_soc1_map_bld.F90 | 2 +- .../impl/aggregator/amg_d_soc2_map_bld.F90 | 189 ++++++++++++++---- .../impl/aggregator/amg_s_soc1_map_bld.F90 | 2 +- .../impl/aggregator/amg_s_soc2_map_bld.F90 | 189 ++++++++++++++---- .../impl/aggregator/amg_z_soc1_map_bld.F90 | 2 +- .../impl/aggregator/amg_z_soc2_map_bld.F90 | 189 ++++++++++++++---- 8 files changed, 608 insertions(+), 156 deletions(-) diff --git a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 index b9110aae..81047953 100644 --- a/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc1_map_bld.F90 @@ -393,7 +393,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in tmpaggr = ilaggr !$omp end workshare !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& - !$omp private(ii,i,j,k,nz,icol,val,ip) + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) diff --git a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 index ed4161a5..3bda8e90 100644 --- a/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_c_soc2_map_bld.F90 @@ -68,7 +68,7 @@ ! subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_c_inner_mod #if defined(OPENMP) @@ -164,6 +164,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) @@ -180,6 +181,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) @@ -209,45 +211,156 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if defined(OPENMP) - icnt = 0 - step1: do ii=1, nr - i = idxs(ii) +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! Part 2 requires to keep track of which thread defined each entry + ! of ilaggr(), so that each entry can be adjusted correctly: even + ! if an entry I belongs to the range BNDS(TH)>BNDS(TH+1)-1, it may have + ! been set because it is strongly connected to an entry J belonging to a + ! different thread. - if (ilaggr(i) == -(nr+1)) then - ! - ! Get the 1-neighbourhood of I - ! - ip1 = s_neigh%irp(i) - nz = s_neigh%irp(i+1)-ip1 - ! - ! If the neighbourhood only contains I, skip it - ! 
-        if (nz ==0) then
-          ilaggr(i) = 0
-          cycle step1
-        end if
-        if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
-          ilaggr(i) = 0
-          cycle step1
-        end if
-        !
-        ! If the whole strongly coupled neighborhood of I is
-        ! as yet unconnected, turn it into the next aggregate.
-        !
-        nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
-        icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
-        disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
-        if (disjoint) then
-          icnt = icnt + 1
-          naggr = naggr + 1
-          do k=1, nzcnt
-            ilaggr(icol(k)) = naggr
+  !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
+  !$omp private(icol,val,myth,kk)
+  block
+    integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
+    integer(psb_lpk_) :: itmp
+    !$omp master
+    nths = omp_get_num_threads()
+    allocate(bnds(0:nths),locnaggr(0:nths+1))
+    locnaggr(:) = 0
+    bnds(0) = 1
+    !$omp end master
+    !$omp barrier
+    myth = omp_get_thread_num()
+    rsz = nr/nths
+    if (myth < mod(nr,nths)) rsz = rsz + 1
+    bnds(myth+1) = rsz
+    !$omp barrier
+    !$omp master
+    do i=1,nths
+      bnds(i) = bnds(i) + bnds(i-1)
+    end do
+    info = 0
+    !$omp end master
+    !$omp barrier
+
+    !$omp do schedule(static) private(disjoint)
+    do kk=0, nths-1
+      step1: do ii=bnds(kk), bnds(kk+1)-1
+        i = idxs(ii)
+        if (info /= 0) then
+          write(0,*) ' Step1:',kk,ii,i,info
+          cycle step1
+        end if
+        if ((i<1).or.(i>nr)) then
+          !$omp atomic write
+          info=psb_err_internal_error_
+          !$omp end atomic
+          call psb_errpush(info,name)
+          cycle step1
+          !goto 9999
+        end if
+
+
+        if (ilaggr(i) == -(nr+1)) then
+          !
+          ! Get the 1-neighbourhood of I
+          !
+          ip1 = s_neigh%irp(i)
+          nz = s_neigh%irp(i+1)-ip1
+          !
+          ! If the neighbourhood only contains I, skip it
+          !
+          if (nz ==0) then
+            ilaggr(i) = 0
+            cycle step1
+          end if
+          if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
+            ilaggr(i) = 0
+            cycle step1
+          end if
+
+          nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
+          icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
+          disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
+
+          !
+          ! If the whole strongly coupled neighborhood of I is
+          ! as yet unconnected, turn it into the next aggregate.
+          ! Same if ip==0 (in which case, neighborhood only
+          ! contains I even if it does not look like it from the matrix)
+          ! The fact that DISJOINT is private and not under lock
+          ! generates a certain un-repeatability, in that between
+          ! computing DISJOINT and assigning, another thread might
+          ! alter the values of ILAGGR.
+          ! However, a certain unrepeatability is already present
+          ! because the sequence of aggregates is computed with a
+          ! different order than in serial mode.
+          ! In any case, even if the entries of ILAGGR may be
+          ! overwritten, the important thing is that each entry is
+          ! consistent and they generate a correct aggregation map.
+          !
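+          !
+          ! Worked example of the encoding below (illustrative values only,
+          ! not taken from an actual run): with NTHS=4 and thread KK=1 owning
+          ! rows BNDS(1)=11 to BNDS(2)-1, its first aggregate (LOCNAGGR(1)=1)
+          ! is encoded as ITMP = (BNDS(1)-1+LOCNAGGR(1))*NTHS+KK = 11*4+1 = 45.
+          ! The renumbering loop after the prefix sum then recovers the owner
+          ! as KP = MOD(45,NTHS) = 1 and the local index as 45/NTHS = 11,
+          ! which is shifted by -(BNDS(KP)-1)+LOCNAGGR(KP) into the global
+          ! aggregate numbering.
+          !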
+ if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if end do - ilaggr(i) = naggr - end if - endif - enddo step1 + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else icnt = 0 diff --git a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 index 2b01f3e5..c83dfe3b 100644 --- a/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc1_map_bld.F90 @@ -393,7 +393,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in tmpaggr = ilaggr !$omp end workshare !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& - !$omp private(ii,i,j,k,nz,icol,val,ip) + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) diff --git a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 index 6047f375..b4602378 100644 --- a/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_d_soc2_map_bld.F90 @@ -68,7 +68,7 @@ ! subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_d_inner_mod #if defined(OPENMP) @@ -164,6 +164,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) @@ -180,6 +181,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) @@ -209,45 +211,156 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if defined(OPENMP) - icnt = 0 - step1: do ii=1, nr - i = idxs(ii) +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! 
Part 2 requires keeping track of which thread defined each entry
+  ! of ilaggr(), so that each entry can be adjusted correctly: even
+  ! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
+  ! been set because it is strongly connected to an entry J belonging to a
+  ! different thread.
-    if (ilaggr(i) == -(nr+1)) then
-      !
-      ! Get the 1-neighbourhood of I
-      !
-      ip1 = s_neigh%irp(i)
-      nz = s_neigh%irp(i+1)-ip1
-      !
-      ! If the neighbourhood only contains I, skip it
-      !
-      if (nz ==0) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      !
-      ! If the whole strongly coupled neighborhood of I is
-      ! as yet unconnected, turn it into the next aggregate.
-      !
-      nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
-      icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
-      disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
-      if (disjoint) then
-        icnt = icnt + 1
-        naggr = naggr + 1
-        do k=1, nzcnt
-          ilaggr(icol(k)) = naggr
+  !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
+  !$omp private(icol,val,myth,kk)
+  block
+    integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
+    integer(psb_lpk_) :: itmp
+    !$omp master
+    nths = omp_get_num_threads()
+    allocate(bnds(0:nths),locnaggr(0:nths+1))
+    locnaggr(:) = 0
+    bnds(0) = 1
+    !$omp end master
+    !$omp barrier
+    myth = omp_get_thread_num()
+    rsz = nr/nths
+    if (myth < mod(nr,nths)) rsz = rsz + 1
+    bnds(myth+1) = rsz
+    !$omp barrier
+    !$omp master
+    do i=1,nths
+      bnds(i) = bnds(i) + bnds(i-1)
+    end do
+    info = 0
+    !$omp end master
+    !$omp barrier
+
+    !$omp do schedule(static) private(disjoint)
+    do kk=0, nths-1
+      step1: do ii=bnds(kk), bnds(kk+1)-1
+        i = idxs(ii)
+        if (info /= 0) then
+          write(0,*) ' Step1:',kk,ii,i,info
+          cycle step1
+        end if
+        if ((i<1).or.(i>nr)) then
+          !$omp atomic write
+          info=psb_err_internal_error_
+          !$omp end atomic
+          call psb_errpush(info,name)
+          cycle step1
+          !goto 9999
+        end if
+
+
+        if (ilaggr(i) == -(nr+1)) then
+          !
+          ! Get the 1-neighbourhood of I
+          !
+          ip1 = s_neigh%irp(i)
+          nz = s_neigh%irp(i+1)-ip1
+          !
+          ! If the neighbourhood only contains I, skip it
+          !
+          if (nz ==0) then
+            ilaggr(i) = 0
+            cycle step1
+          end if
+          if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
+            ilaggr(i) = 0
+            cycle step1
+          end if
+
+          nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
+          icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
+          disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
+
+          !
+          ! If the whole strongly coupled neighborhood of I is
+          ! as yet unconnected, turn it into the next aggregate.
+          ! Same if ip==0 (in which case, neighborhood only
+          ! contains I even if it does not look like it from the matrix)
+          ! The fact that DISJOINT is private and not under lock
+          ! generates a certain un-repeatability, in that between
+          ! computing DISJOINT and assigning, another thread might
+          ! alter the values of ILAGGR.
+          ! However, a certain unrepeatability is already present
+          ! because the sequence of aggregates is computed with a
+          ! different order than in serial mode.
+          ! In any case, even if the entries of ILAGGR may be
+          ! overwritten, the important thing is that each entry is
+          ! consistent and they generate a correct aggregation map.
+          !
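+          !
+          ! Worked example of the encoding below (illustrative values only,
+          ! not taken from an actual run): with NTHS=4 and thread KK=1 owning
+          ! rows BNDS(1)=11 to BNDS(2)-1, its first aggregate (LOCNAGGR(1)=1)
+          ! is encoded as ITMP = (BNDS(1)-1+LOCNAGGR(1))*NTHS+KK = 11*4+1 = 45.
+          ! The renumbering loop after the prefix sum then recovers the owner
+          ! as KP = MOD(45,NTHS) = 1 and the local index as 45/NTHS = 11,
+          ! which is shifted by -(BNDS(KP)-1)+LOCNAGGR(KP) into the global
+          ! aggregate numbering.
+          !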
+ if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if end do - ilaggr(i) = naggr - end if - endif - enddo step1 + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else icnt = 0 diff --git a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 index 069c924e..59a7c03b 100644 --- a/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc1_map_bld.F90 @@ -393,7 +393,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in tmpaggr = ilaggr !$omp end workshare !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& - !$omp private(ii,i,j,k,nz,icol,val,ip) + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) diff --git a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 index e94261a8..8dac2dd5 100644 --- a/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_s_soc2_map_bld.F90 @@ -68,7 +68,7 @@ ! subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_s_inner_mod #if defined(OPENMP) @@ -164,6 +164,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) @@ -180,6 +181,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) @@ -209,45 +211,156 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if defined(OPENMP) - icnt = 0 - step1: do ii=1, nr - i = idxs(ii) +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! 
Part 2 requires keeping track of which thread defined each entry
+  ! of ilaggr(), so that each entry can be adjusted correctly: even
+  ! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
+  ! been set because it is strongly connected to an entry J belonging to a
+  ! different thread.
-    if (ilaggr(i) == -(nr+1)) then
-      !
-      ! Get the 1-neighbourhood of I
-      !
-      ip1 = s_neigh%irp(i)
-      nz = s_neigh%irp(i+1)-ip1
-      !
-      ! If the neighbourhood only contains I, skip it
-      !
-      if (nz ==0) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      !
-      ! If the whole strongly coupled neighborhood of I is
-      ! as yet unconnected, turn it into the next aggregate.
-      !
-      nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
-      icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
-      disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
-      if (disjoint) then
-        icnt = icnt + 1
-        naggr = naggr + 1
-        do k=1, nzcnt
-          ilaggr(icol(k)) = naggr
+  !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
+  !$omp private(icol,val,myth,kk)
+  block
+    integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
+    integer(psb_lpk_) :: itmp
+    !$omp master
+    nths = omp_get_num_threads()
+    allocate(bnds(0:nths),locnaggr(0:nths+1))
+    locnaggr(:) = 0
+    bnds(0) = 1
+    !$omp end master
+    !$omp barrier
+    myth = omp_get_thread_num()
+    rsz = nr/nths
+    if (myth < mod(nr,nths)) rsz = rsz + 1
+    bnds(myth+1) = rsz
+    !$omp barrier
+    !$omp master
+    do i=1,nths
+      bnds(i) = bnds(i) + bnds(i-1)
+    end do
+    info = 0
+    !$omp end master
+    !$omp barrier
+
+    !$omp do schedule(static) private(disjoint)
+    do kk=0, nths-1
+      step1: do ii=bnds(kk), bnds(kk+1)-1
+        i = idxs(ii)
+        if (info /= 0) then
+          write(0,*) ' Step1:',kk,ii,i,info
+          cycle step1
+        end if
+        if ((i<1).or.(i>nr)) then
+          !$omp atomic write
+          info=psb_err_internal_error_
+          !$omp end atomic
+          call psb_errpush(info,name)
+          cycle step1
+          !goto 9999
+        end if
+
+
+        if (ilaggr(i) == -(nr+1)) then
+          !
+          ! Get the 1-neighbourhood of I
+          !
+          ip1 = s_neigh%irp(i)
+          nz = s_neigh%irp(i+1)-ip1
+          !
+          ! If the neighbourhood only contains I, skip it
+          !
+          if (nz ==0) then
+            ilaggr(i) = 0
+            cycle step1
+          end if
+          if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
+            ilaggr(i) = 0
+            cycle step1
+          end if
+
+          nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
+          icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
+          disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
+
+          !
+          ! If the whole strongly coupled neighborhood of I is
+          ! as yet unconnected, turn it into the next aggregate.
+          ! Same if ip==0 (in which case, neighborhood only
+          ! contains I even if it does not look like it from the matrix)
+          ! The fact that DISJOINT is private and not under lock
+          ! generates a certain un-repeatability, in that between
+          ! computing DISJOINT and assigning, another thread might
+          ! alter the values of ILAGGR.
+          ! However, a certain unrepeatability is already present
+          ! because the sequence of aggregates is computed with a
+          ! different order than in serial mode.
+          ! In any case, even if the entries of ILAGGR may be
+          ! overwritten, the important thing is that each entry is
+          ! consistent and they generate a correct aggregation map.
+          !
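+          !
+          ! Worked example of the encoding below (illustrative values only,
+          ! not taken from an actual run): with NTHS=4 and thread KK=1 owning
+          ! rows BNDS(1)=11 to BNDS(2)-1, its first aggregate (LOCNAGGR(1)=1)
+          ! is encoded as ITMP = (BNDS(1)-1+LOCNAGGR(1))*NTHS+KK = 11*4+1 = 45.
+          ! The renumbering loop after the prefix sum then recovers the owner
+          ! as KP = MOD(45,NTHS) = 1 and the local index as 45/NTHS = 11,
+          ! which is shifted by -(BNDS(KP)-1)+LOCNAGGR(KP) into the global
+          ! aggregate numbering.
+          !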
+ if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if end do - ilaggr(i) = naggr - end if - endif - enddo step1 + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else icnt = 0 diff --git a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 index d618fe1c..66c8e4e2 100644 --- a/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc1_map_bld.F90 @@ -393,7 +393,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in tmpaggr = ilaggr !$omp end workshare !$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)& - !$omp private(ii,i,j,k,nz,icol,val,ip) + !$omp private(ii,i,j,k,nz,icol,val,ip,cpling) step2: do ii=1,nr i = idxs(ii) diff --git a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 index e09bcf1e..19956309 100644 --- a/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 +++ b/amgprec/impl/aggregator/amg_z_soc2_map_bld.F90 @@ -68,7 +68,7 @@ ! subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,info) - use psb_base_mod + use psb_base_mod use amg_base_prec_type use amg_z_inner_mod #if defined(OPENMP) @@ -164,6 +164,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Compute the 1-neigbour; mark strong links with +1, weak links with -1 ! call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) + !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static) do i=1, nr do k=muij%irp(i),muij%irp(i+1)-1 j = muij%ja(k) @@ -180,6 +181,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in end if end do end do + !$omp end parallel do !write(*,*) 'S_NEIGH: ',nr,ip call s_neigh_coo%set_nzeros(muij%get_nzeros()) call s_neigh%mv_from_coo(s_neigh_coo,info) @@ -209,45 +211,156 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in ! Phase one: Start with disjoint groups. ! naggr = 0 -#if defined(OPENMP) - icnt = 0 - step1: do ii=1, nr - i = idxs(ii) +#if defined(OPENMP) + block + integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:) + integer(psb_ipk_) :: myth,nths, kk + ! The parallelization makes use of a locaggr(:) array; each thread + ! keeps its own version of naggr, and when the loop ends, a prefix is applied + ! to locnaggr to determine: + ! 1. The total number of aggregaters NAGGR; + ! 2. How much should each thread shift its own aggregates + ! 
Part 2 requires keeping track of which thread defined each entry
+  ! of ilaggr(), so that each entry can be adjusted correctly: even
+  ! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
+  ! been set because it is strongly connected to an entry J belonging to a
+  ! different thread.
-    if (ilaggr(i) == -(nr+1)) then
-      !
-      ! Get the 1-neighbourhood of I
-      !
-      ip1 = s_neigh%irp(i)
-      nz = s_neigh%irp(i+1)-ip1
-      !
-      ! If the neighbourhood only contains I, skip it
-      !
-      if (nz ==0) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
-        ilaggr(i) = 0
-        cycle step1
-      end if
-      !
-      ! If the whole strongly coupled neighborhood of I is
-      ! as yet unconnected, turn it into the next aggregate.
-      !
-      nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
-      icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
-      disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
-      if (disjoint) then
-        icnt = icnt + 1
-        naggr = naggr + 1
-        do k=1, nzcnt
-          ilaggr(icol(k)) = naggr
+  !$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
+  !$omp private(icol,val,myth,kk)
+  block
+    integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
+    integer(psb_lpk_) :: itmp
+    !$omp master
+    nths = omp_get_num_threads()
+    allocate(bnds(0:nths),locnaggr(0:nths+1))
+    locnaggr(:) = 0
+    bnds(0) = 1
+    !$omp end master
+    !$omp barrier
+    myth = omp_get_thread_num()
+    rsz = nr/nths
+    if (myth < mod(nr,nths)) rsz = rsz + 1
+    bnds(myth+1) = rsz
+    !$omp barrier
+    !$omp master
+    do i=1,nths
+      bnds(i) = bnds(i) + bnds(i-1)
+    end do
+    info = 0
+    !$omp end master
+    !$omp barrier
+
+    !$omp do schedule(static) private(disjoint)
+    do kk=0, nths-1
+      step1: do ii=bnds(kk), bnds(kk+1)-1
+        i = idxs(ii)
+        if (info /= 0) then
+          write(0,*) ' Step1:',kk,ii,i,info
+          cycle step1
+        end if
+        if ((i<1).or.(i>nr)) then
+          !$omp atomic write
+          info=psb_err_internal_error_
+          !$omp end atomic
+          call psb_errpush(info,name)
+          cycle step1
+          !goto 9999
+        end if
+
+
+        if (ilaggr(i) == -(nr+1)) then
+          !
+          ! Get the 1-neighbourhood of I
+          !
+          ip1 = s_neigh%irp(i)
+          nz = s_neigh%irp(i+1)-ip1
+          !
+          ! If the neighbourhood only contains I, skip it
+          !
+          if (nz ==0) then
+            ilaggr(i) = 0
+            cycle step1
+          end if
+          if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
+            ilaggr(i) = 0
+            cycle step1
+          end if
+
+          nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
+          icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
+          disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
+
+          !
+          ! If the whole strongly coupled neighborhood of I is
+          ! as yet unconnected, turn it into the next aggregate.
+          ! Same if ip==0 (in which case, neighborhood only
+          ! contains I even if it does not look like it from the matrix)
+          ! The fact that DISJOINT is private and not under lock
+          ! generates a certain un-repeatability, in that between
+          ! computing DISJOINT and assigning, another thread might
+          ! alter the values of ILAGGR.
+          ! However, a certain unrepeatability is already present
+          ! because the sequence of aggregates is computed with a
+          ! different order than in serial mode.
+          ! In any case, even if the entries of ILAGGR may be
+          ! overwritten, the important thing is that each entry is
+          ! consistent and they generate a correct aggregation map.
+          !
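+          !
+          ! Worked example of the encoding below (illustrative values only,
+          ! not taken from an actual run): with NTHS=4 and thread KK=1 owning
+          ! rows BNDS(1)=11 to BNDS(2)-1, its first aggregate (LOCNAGGR(1)=1)
+          ! is encoded as ITMP = (BNDS(1)-1+LOCNAGGR(1))*NTHS+KK = 11*4+1 = 45.
+          ! The renumbering loop after the prefix sum then recovers the owner
+          ! as KP = MOD(45,NTHS) = 1 and the local index as 45/NTHS = 11,
+          ! which is shifted by -(BNDS(KP)-1)+LOCNAGGR(KP) into the global
+          ! aggregate numbering.
+          !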
+ if (disjoint) then + locnaggr(kk) = locnaggr(kk) + 1 + itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk + if (itmp < (bnds(kk)-1+locnaggr(kk))) then + !$omp atomic update + info = max(12345678,info) + !$omp end atomic + cycle step1 + end if + !$omp atomic write + ilaggr(i) = itmp + !$omp end atomic + do k=1, nzcnt + !$omp atomic write + ilaggr(icol(k)) = itmp + !$omp end atomic + end do + end if + end if + enddo step1 + end do + !$omp end do + + !$omp master + naggr = sum(locnaggr(0:nths-1)) + do i=1,nths + locnaggr(i) = locnaggr(i) + locnaggr(i-1) + end do + do i=nths+1,1,-1 + locnaggr(i) = locnaggr(i-1) + end do + locnaggr(0) = 0 + !write(0,*) 'LNAG ',locnaggr(nths+1) + !$omp end master + !$omp barrier + !$omp do schedule(static) + do kk=0, nths-1 + do ii=bnds(kk), bnds(kk+1)-1 + if (ilaggr(ii) > 0) then + kp = mod(ilaggr(ii),nths) + ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp) + end if end do - ilaggr(i) = naggr - end if - endif - enddo step1 + end do + !$omp end do + end block + !$omp end parallel + end block + if (info /= 0) then + if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR' + info=psb_err_internal_error_ + call psb_errpush(info,name) + goto 9999 + end if #else icnt = 0