Merge branch 'dev-openmp' into development

sfilippone committed 0bcc9d7b55 (1 year ago)

@ -1,4 +1,3 @@
AMG4PSBLAS
Algebraic Multigrid Package based on PSBLAS (Parallel Sparse BLAS version 3.8)

@ -143,9 +143,10 @@ contains
type(psb_ld_coo_sparse_mat) :: tmpcoo type(psb_ld_coo_sparse_mat) :: tmpcoo
logical :: display_out_, print_out_, reproducible_ logical :: display_out_, print_out_, reproducible_
logical, parameter :: dump=.false., debug=.false., dump_mate=.false., & logical, parameter :: dump=.false., debug=.false., dump_mate=.false., &
& debug_ilaggr=.false., debug_sync=.false. & debug_ilaggr=.false., debug_sync=.false., debug_mate=.false.
integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1
logical, parameter :: do_timings=.true. logical, parameter :: do_timings=.true.
integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2
ictxt = desc_a%get_ctxt() ictxt = desc_a%get_ctxt()
call psb_info(ictxt,iam,np) call psb_info(ictxt,iam,np)
@ -187,7 +188,7 @@ contains
call desc_a%l2gip(ilv,info,owned=.false.) call desc_a%l2gip(ilv,info,owned=.false.)
call psb_geall(ilaggr,desc_a,info) call psb_geall(ilaggr,desc_a,info)
ilaggr = -1 ilaggr = ilaggr_neginit
call psb_geasb(ilaggr,desc_a,info) call psb_geasb(ilaggr,desc_a,info)
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -213,6 +214,19 @@ contains
call psb_barrier(ictxt) call psb_barrier(ictxt)
if (iam == 0) write(0,*)' out from buildmatching:', info if (iam == 0) write(0,*)' out from buildmatching:', info
end if end if
if (debug_mate) then
block
integer(psb_lpk_), allocatable :: ckmate(:)
allocate(ckmate(nr))
ckmate(1:nr) = mate(1:nr)
call psb_msort(ckmate(1:nr))
do i=1,nr-1
if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then
write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i)
end if
end do
end block
end if
if (info == 0) then if (info == 0) then
if (do_timings) call psb_tic(idx_phase2) if (do_timings) call psb_tic(idx_phase2)
@ -259,7 +273,7 @@ contains
cycle cycle
else else
if (ilaggr(k) == -1) then if (ilaggr(k) == ilaggr_neginit) then
wk = w(k) wk = w(k)
widx = w(idx) widx = w(idx)
@ -267,7 +281,7 @@ contains
nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2) nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2)
if (nrmagg > epsilon(nrmagg)) then if (nrmagg > epsilon(nrmagg)) then
if (idx <= nr) then if (idx <= nr) then
if (ilaggr(idx) == -1) then if (ilaggr(idx) == ilaggr_neginit) then
! Now, if both vertices are local, the aggregate is local ! Now, if both vertices are local, the aggregate is local
! (kinda obvious). ! (kinda obvious).
nlaggr(iam) = nlaggr(iam) + 1 nlaggr(iam) = nlaggr(iam) + 1
@ -275,6 +289,9 @@ contains
ilaggr(idx) = nlaggr(iam) ilaggr(idx) = nlaggr(iam)
wtemp(k) = w(k)/nrmagg wtemp(k) = w(k)/nrmagg
wtemp(idx) = w(idx)/nrmagg wtemp(idx) = w(idx)/nrmagg
else
write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,&
&mate(idx),ilaggr(idx)
end if end if
nlpairs = nlpairs+1 nlpairs = nlpairs+1
else if (idx <= nc) then else if (idx <= nc) then
@ -294,7 +311,7 @@ contains
ilaggr(k) = nlaggr(iam) ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1 nlpairs = nlpairs+1
else else
ilaggr(k) = -2 ilaggr(k) = ilaggr_nonlocal
end if end if
else else
! Use a statistically unbiased tie-breaking rule, ! Use a statistically unbiased tie-breaking rule,
@ -309,7 +326,7 @@ contains
ilaggr(k) = nlaggr(iam) ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1 nlpairs = nlpairs+1
else else
ilaggr(k) = -2 ilaggr(k) = ilaggr_nonlocal
end if end if
end if end if
end if end if
@ -325,6 +342,12 @@ contains
nlsingl = nlsingl + 1 nlsingl = nlsingl + 1
end if end if
end if end if
if (ilaggr(k) == ilaggr_neginit) then
write(0,*) iam,' Error: no update to ',k,mate(k),&
& abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k)
end if
else
if (ilaggr(k)<0) write(0,*) 'Strange? ',k,ilaggr(k)
end if end if
end if end if
end do end do
@ -332,7 +355,7 @@ contains
if (do_timings) call psb_tic(idx_phase3) if (do_timings) call psb_tic(idx_phase3)
! Ok, now compute offsets, gather halo and fix non-local ! Ok, now compute offsets, gather halo and fix non-local
! aggregates (those where ilaggr == -2) ! aggregates (those where ilaggr == ilaggr_nonlocal)
call psb_sum(ictxt,nlaggr) call psb_sum(ictxt,nlaggr)
ntaggr = sum(nlaggr(0:np-1)) ntaggr = sum(nlaggr(0:np-1))
naggrm1 = sum(nlaggr(0:iam-1)) naggrm1 = sum(nlaggr(0:iam-1))
@ -347,7 +370,7 @@ contains
call psb_halo(wtemp,desc_a,info) call psb_halo(wtemp,desc_a,info)
! Cleanup as yet unmarked entries ! Cleanup as yet unmarked entries
do k=1,nr do k=1,nr
if (ilaggr(k) == -2) then if (ilaggr(k) == ilaggr_nonlocal) then
idx = mate(k) idx = mate(k)
if (idx > nr) then if (idx > nr) then
i = ilaggr(idx) i = ilaggr(idx)
@ -359,9 +382,14 @@ contains
else else
write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx) write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx)
end if end if
end if else if (ilaggr(k) <0) then
if (ilaggr(k) <0) then write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k)
write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) write(0,*) iam,' : : ',nr,nc,mate(k)
if (mate(k) <= nr) then
write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),&
& ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k)))
end if
flush(0)
end if end if
end do end do
if (debug_sync) then if (debug_sync) then
@ -414,7 +442,7 @@ contains
end block end block
if (iam == 0) then if (iam == 0) then
write(0,*) 'Matching statistics: Unmatched nodes ',& write(0,*) iam,'Matching statistics: Unmatched nodes ',&
& nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs & nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs
end if end if
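The hunk above (and its single-precision twin below) tracks the aggregation state of each local vertex through the two named markers introduced at the top of the routine: ilaggr_neginit for "not yet aggregated" and ilaggr_nonlocal for "paired with a halo vertex, to be resolved after the halo exchange of ilaggr". A minimal C++ sketch of that marker scheme, for illustration only (the library code is the Fortran shown here); the weight threshold for singletons and the tie-breaking rule for cross-process pairs are omitted:

#include <cstdint>
#include <vector>

constexpr int64_t ILAGGR_NEGINIT  = -1;  // not yet assigned to an aggregate
constexpr int64_t ILAGGR_NONLOCAL = -2;  // mate is owned by another process; fixed up later

// mate and ilaggr are 1-based (index 0 unused); vertices 1..nr are owned,
// nr+1..nc are halo copies. mate[k] <= 0 means vertex k is unmatched.
void pair_aggregates(int64_t nr, int64_t nc,
                     const std::vector<int64_t>& mate,
                     std::vector<int64_t>& ilaggr,
                     int64_t& naggr_local)
{
    for (int64_t k = 1; k <= nr; ++k) {
        if (ilaggr[k] != ILAGGR_NEGINIT) continue;   // already assigned via its mate
        const int64_t idx = mate[k];
        if (idx >= 1 && idx <= nr) {
            // Both endpoints are owned locally: the pair becomes a new local aggregate.
            ilaggr[k] = ilaggr[idx] = ++naggr_local;
        } else if (idx > nr && idx <= nc) {
            // The mate is a halo vertex: either this rank wins the (omitted) tie-break
            // and creates the aggregate, or the entry is marked and resolved after
            // the halo exchange of ilaggr.
            ilaggr[k] = ILAGGR_NONLOCAL;
        } else {
            // Unmatched vertex: it becomes a singleton aggregate
            // (the library also checks the weight before doing so).
            ilaggr[k] = ++naggr_local;
        }
    }
}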

@ -143,9 +143,10 @@ contains
type(psb_ls_coo_sparse_mat) :: tmpcoo type(psb_ls_coo_sparse_mat) :: tmpcoo
logical :: display_out_, print_out_, reproducible_ logical :: display_out_, print_out_, reproducible_
logical, parameter :: dump=.false., debug=.false., dump_mate=.false., & logical, parameter :: dump=.false., debug=.false., dump_mate=.false., &
& debug_ilaggr=.false., debug_sync=.false. & debug_ilaggr=.false., debug_sync=.false., debug_mate=.false.
integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1 integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1
logical, parameter :: do_timings=.true. logical, parameter :: do_timings=.true.
integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2
ictxt = desc_a%get_ctxt() ictxt = desc_a%get_ctxt()
call psb_info(ictxt,iam,np) call psb_info(ictxt,iam,np)
@ -187,7 +188,7 @@ contains
call desc_a%l2gip(ilv,info,owned=.false.) call desc_a%l2gip(ilv,info,owned=.false.)
call psb_geall(ilaggr,desc_a,info) call psb_geall(ilaggr,desc_a,info)
ilaggr = -1 ilaggr = ilaggr_neginit
call psb_geasb(ilaggr,desc_a,info) call psb_geasb(ilaggr,desc_a,info)
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -213,6 +214,19 @@ contains
call psb_barrier(ictxt) call psb_barrier(ictxt)
if (iam == 0) write(0,*)' out from buildmatching:', info if (iam == 0) write(0,*)' out from buildmatching:', info
end if end if
if (debug_mate) then
block
integer(psb_lpk_), allocatable :: ckmate(:)
allocate(ckmate(nr))
ckmate(1:nr) = mate(1:nr)
call psb_msort(ckmate(1:nr))
do i=1,nr-1
if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then
write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i)
end if
end do
end block
end if
if (info == 0) then if (info == 0) then
if (do_timings) call psb_tic(idx_phase2) if (do_timings) call psb_tic(idx_phase2)
@ -259,7 +273,7 @@ contains
cycle cycle
else else
if (ilaggr(k) == -1) then if (ilaggr(k) == ilaggr_neginit) then
wk = w(k) wk = w(k)
widx = w(idx) widx = w(idx)
@ -267,7 +281,7 @@ contains
nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2) nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2)
if (nrmagg > epsilon(nrmagg)) then if (nrmagg > epsilon(nrmagg)) then
if (idx <= nr) then if (idx <= nr) then
if (ilaggr(idx) == -1) then if (ilaggr(idx) == ilaggr_neginit) then
! Now, if both vertices are local, the aggregate is local ! Now, if both vertices are local, the aggregate is local
! (kinda obvious). ! (kinda obvious).
nlaggr(iam) = nlaggr(iam) + 1 nlaggr(iam) = nlaggr(iam) + 1
@ -275,6 +289,9 @@ contains
ilaggr(idx) = nlaggr(iam) ilaggr(idx) = nlaggr(iam)
wtemp(k) = w(k)/nrmagg wtemp(k) = w(k)/nrmagg
wtemp(idx) = w(idx)/nrmagg wtemp(idx) = w(idx)/nrmagg
else
write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,&
&mate(idx),ilaggr(idx)
end if end if
nlpairs = nlpairs+1 nlpairs = nlpairs+1
else if (idx <= nc) then else if (idx <= nc) then
@ -294,7 +311,7 @@ contains
ilaggr(k) = nlaggr(iam) ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1 nlpairs = nlpairs+1
else else
ilaggr(k) = -2 ilaggr(k) = ilaggr_nonlocal
end if end if
else else
! Use a statistically unbiased tie-breaking rule, ! Use a statistically unbiased tie-breaking rule,
@ -309,7 +326,7 @@ contains
ilaggr(k) = nlaggr(iam) ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1 nlpairs = nlpairs+1
else else
ilaggr(k) = -2 ilaggr(k) = ilaggr_nonlocal
end if end if
end if end if
end if end if
@ -325,6 +342,12 @@ contains
nlsingl = nlsingl + 1 nlsingl = nlsingl + 1
end if end if
end if end if
if (ilaggr(k) == ilaggr_neginit) then
write(0,*) iam,' Error: no update to ',k,mate(k),&
& abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k)
end if
else
if (ilaggr(k)<0) write(0,*) 'Strange? ',k,ilaggr(k)
end if end if
end if end if
end do end do
@ -332,7 +355,7 @@ contains
if (do_timings) call psb_tic(idx_phase3) if (do_timings) call psb_tic(idx_phase3)
! Ok, now compute offsets, gather halo and fix non-local ! Ok, now compute offsets, gather halo and fix non-local
! aggregates (those where ilaggr == -2) ! aggregates (those where ilaggr == ilaggr_nonlocal)
call psb_sum(ictxt,nlaggr) call psb_sum(ictxt,nlaggr)
ntaggr = sum(nlaggr(0:np-1)) ntaggr = sum(nlaggr(0:np-1))
naggrm1 = sum(nlaggr(0:iam-1)) naggrm1 = sum(nlaggr(0:iam-1))
@ -347,7 +370,7 @@ contains
call psb_halo(wtemp,desc_a,info) call psb_halo(wtemp,desc_a,info)
! Cleanup as yet unmarked entries ! Cleanup as yet unmarked entries
do k=1,nr do k=1,nr
if (ilaggr(k) == -2) then if (ilaggr(k) == ilaggr_nonlocal) then
idx = mate(k) idx = mate(k)
if (idx > nr) then if (idx > nr) then
i = ilaggr(idx) i = ilaggr(idx)
@ -359,9 +382,14 @@ contains
else else
write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx) write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx)
end if end if
end if else if (ilaggr(k) <0) then
if (ilaggr(k) <0) then write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k)
write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k) write(0,*) iam,' : : ',nr,nc,mate(k)
if (mate(k) <= nr) then
write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),&
& ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k)))
end if
flush(0)
end if end if
end do end do
if (debug_sync) then if (debug_sync) then
@ -414,7 +442,7 @@ contains
end block end block
if (iam == 0) then if (iam == 0) then
write(0,*) 'Matching statistics: Unmatched nodes ',& write(0,*) iam,'Matching statistics: Unmatched nodes ',&
& nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs & nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs
end if end if

@ -62,7 +62,22 @@ amg_s_parmatch_smth_bld.o \
amg_s_parmatch_spmm_bld_inner.o amg_s_parmatch_spmm_bld_inner.o
MPCOBJS=MatchBoxPC.o \ MPCOBJS=MatchBoxPC.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o sendBundledMessages.o \
initialize.o \
extractUChunk.o \
isAlreadyMatched.o \
findOwnerOfGhost.o \
clean.o \
computeCandidateMate.o \
parallelComputeCandidateMateB.o \
processMatchedVertices.o \
processMatchedVerticesAndSendMessages.o \
processCrossEdge.o \
queueTransfer.o \
processMessages.o \
processExposedVertex.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o
OBJS = $(FOBJS) $(MPCOBJS) OBJS = $(FOBJS) $(MPCOBJS)

@ -60,17 +60,43 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ) { MilanLongInt* ph1_card, MilanLongInt* ph2_card ) {
#if !defined(SERIAL_MPI) #if !defined(SERIAL_MPI)
MPI_Comm C_comm=MPI_Comm_f2c(icomm); MPI_Comm C_comm=MPI_Comm_f2c(icomm);
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr,"MatchBoxPC: rank %d nlver %ld nledge %ld [ %ld %ld ]\n", fprintf(stderr,"MatchBoxPC: rank %d nlver %ld nledge %ld [ %ld %ld ]\n",
myRank,NLVer, NLEdge,verDistance[0],verDistance[1]); myRank,NLVer, NLEdge,verDistance[0],verDistance[1]);
#endif #endif
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge,
#define TIME_TRACKER
#ifdef TIME_TRACKER
double tmr = MPI_Wtime();
#endif
#define OMP
#ifdef OMP
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(NLVer, NLEdge,
verLocPtr, verLocInd, edgeLocWeight,
verDistance, Mate,
myRank, numProcs, C_comm,
msgIndSent, msgActualSent, msgPercent,
ph0_time, ph1_time, ph2_time,
ph1_card, ph2_card );
#else
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge,
verLocPtr, verLocInd, edgeLocWeight, verLocPtr, verLocInd, edgeLocWeight,
verDistance, Mate, verDistance, Mate,
myRank, numProcs, C_comm, myRank, numProcs, C_comm,
msgIndSent, msgActualSent, msgPercent, msgIndSent, msgActualSent, msgPercent,
ph0_time, ph1_time, ph2_time, ph0_time, ph1_time, ph2_time,
ph1_card, ph2_card ); ph1_card, ph2_card );
#endif
#ifdef TIME_TRACKER
tmr = MPI_Wtime() - tmr;
fprintf(stderr, "Elaboration time: %f for %ld nodes\n", tmr, NLVer);
#endif
#endif #endif
} }

@ -52,145 +52,412 @@
#ifndef _matchboxpC_H_ #ifndef _matchboxpC_H_
#define _matchboxpC_H_ #define _matchboxpC_H_
//Turn on a lot of debugging information with this switch: // Turn on a lot of debugging information with this switch:
//#define PRINT_DEBUG_INFO_ //#define PRINT_DEBUG_INFO_
#include <stdio.h> #include <stdio.h>
#include <iostream> #include <iostream>
#include <assert.h> #include <assert.h>
#include <map> #include <map>
#include <vector> #include <vector>
// #include "matchboxp.h" #include "omp.h"
#include "primitiveDataTypeDefinitions.h" #include "primitiveDataTypeDefinitions.h"
#include "dataStrStaticQueue.h" #include "dataStrStaticQueue.h"
using namespace std; using namespace std;
const int NUM_THREAD = 4;
const int UCHUNK = 10;
const MilanLongInt REQUEST = 1;
const MilanLongInt SUCCESS = 2;
const MilanLongInt FAILURE = 3;
const MilanLongInt SIZEINFO = 4;
const int ComputeTag = 7; // Predefined tag
const int BundleTag = 9; // Predefined tag
static vector<MilanLongInt> DEFAULT_VECTOR;
// MPI type map
template <typename T>
MPI_Datatype TypeMap();
template <>
inline MPI_Datatype TypeMap<int64_t>() { return MPI_LONG_LONG; }
template <>
inline MPI_Datatype TypeMap<int>() { return MPI_INT; }
template <>
inline MPI_Datatype TypeMap<double>() { return MPI_DOUBLE; }
template <>
inline MPI_Datatype TypeMap<float>() { return MPI_FLOAT; }
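As a usage note (not part of the library): the TypeMap<> specializations above let the message-passing code pick the MPI datatype matching the build-time integer width, and the constants REQUEST/SUCCESS/FAILURE are the third field of each [u, v, message_type] triple. A hedged sketch of sending one such triple, assuming a 64-bit build (MilanLongInt == int64_t); the use of ComputeTag as the message tag is an assumption of this sketch:

// Illustrative only; error handling omitted.
inline void sendOneTriple(int64_t u, int64_t v, int64_t msgType,
                          int owner, MPI_Comm comm)
{
    int64_t message[3] = {u, v, msgType};        // msgType: REQUEST, SUCCESS or FAILURE
    MPI_Send(message, 3, TypeMap<int64_t>(),     // resolves to MPI_LONG_LONG
             owner, ComputeTag, comm);
}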
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C"
{
#endif #endif
#if !defined(SERIAL_MPI) #if !defined(SERIAL_MPI)
#define MilanMpiLongInt MPI_LONG_LONG #define MilanMpiLongInt MPI_LONG_LONG
#ifndef _primitiveDataType_Definition_ #ifndef _primitiveDataType_Definition_
#define _primitiveDataType_Definition_ #define _primitiveDataType_Definition_
//Regular integer: // Regular integer:
#ifndef INTEGER_H #ifndef INTEGER_H
#define INTEGER_H #define INTEGER_H
typedef int32_t MilanInt; typedef int32_t MilanInt;
#endif #endif
//Regular long integer: // Regular long integer:
#ifndef LONG_INT_H #ifndef LONG_INT_H
#define LONG_INT_H #define LONG_INT_H
#ifdef BIT64 #ifdef BIT64
typedef int64_t MilanLongInt; typedef int64_t MilanLongInt;
typedef MPI_LONG MilanMpiLongInt; typedef MPI_LONG MilanMpiLongInt;
#else #else
typedef int32_t MilanLongInt; typedef int32_t MilanLongInt;
typedef MPI_INT MilanMpiLongInt; typedef MPI_INT MilanMpiLongInt;
#endif #endif
#endif #endif
//Regular boolean // Regular boolean
#ifndef BOOL_H #ifndef BOOL_H
#define BOOL_H #define BOOL_H
typedef bool MilanBool; typedef bool MilanBool;
#endif #endif
//Regular double and absolute value computation: // Regular double and absolute value computation:
#ifndef REAL_H #ifndef REAL_H
#define REAL_H #define REAL_H
typedef double MilanReal; typedef double MilanReal;
typedef MPI_DOUBLE MilanMpiReal; typedef MPI_DOUBLE MilanMpiReal;
inline MilanReal MilanAbs(MilanReal value) inline MilanReal MilanAbs(MilanReal value)
{ {
return fabs(value); return fabs(value);
} }
#endif #endif
//Regular float and absolute value computation: // Regular float and absolute value computation:
#ifndef FLOAT_H #ifndef FLOAT_H
#define FLOAT_H #define FLOAT_H
typedef float MilanFloat; typedef float MilanFloat;
typedef MPI_FLOAT MilanMpiFloat; typedef MPI_FLOAT MilanMpiFloat;
inline MilanFloat MilanAbsFloat(MilanFloat value) inline MilanFloat MilanAbsFloat(MilanFloat value)
{ {
return fabs(value); return fabs(value);
} }
#endif #endif
//// Define the limits: //// Define the limits:
#ifndef LIMITS_H #ifndef LIMITS_H
#define LIMITS_H #define LIMITS_H
//Integer Maximum and Minimum: // Integer Maximum and Minimum:
// #define MilanIntMax INT_MAX // #define MilanIntMax INT_MAX
// #define MilanIntMin INT_MIN // #define MilanIntMin INT_MIN
#define MilanIntMax INT32_MAX #define MilanIntMax INT32_MAX
#define MilanIntMin INT32_MIN #define MilanIntMin INT32_MIN
#ifdef BIT64 #ifdef BIT64
#define MilanLongIntMax INT64_MAX #define MilanLongIntMax INT64_MAX
#define MilanLongIntMin -INT64_MAX #define MilanLongIntMin -INT64_MAX
#else #else
#define MilanLongIntMax INT32_MAX #define MilanLongIntMax INT32_MAX
#define MilanLongIntMin -INT32_MAX #define MilanLongIntMin -INT32_MAX
#endif #endif
#endif #endif
// +INFINITY // +INFINITY
const double PLUS_INFINITY = numeric_limits<int>::infinity(); const double PLUS_INFINITY = numeric_limits<int>::infinity();
const double MINUS_INFINITY = -PLUS_INFINITY; const double MINUS_INFINITY = -PLUS_INFINITY;
//#define MilanRealMax LDBL_MAX //#define MilanRealMax LDBL_MAX
#define MilanRealMax PLUS_INFINITY #define MilanRealMax PLUS_INFINITY
#define MilanRealMin MINUS_INFINITY #define MilanRealMin MINUS_INFINITY
#endif #endif
//Function of find the owner of a ghost vertex using binary search: // Function of find the owner of a ghost vertex using binary search:
inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance, MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance,
MilanInt myRank, MilanInt numProcs); MilanInt myRank, MilanInt numProcs);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC MilanLongInt firstComputeCandidateMate(MilanLongInt adj1,
( MilanLongInt adj2,
MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt *verLocInd,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, MilanReal *edgeLocWeight);
MilanLongInt* verDistance,
MilanLongInt* Mate, void queuesTransfer(vector<MilanLongInt> &U,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm, vector<MilanLongInt> &privateU,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, vector<MilanLongInt> &QLocalVtx,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, vector<MilanLongInt> &QGhostVtx,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ); vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC vector<MilanLongInt> &privateQLocalVtx,
( vector<MilanLongInt> &privateQGhostVtx,
MilanLongInt NLVer, MilanLongInt NLEdge, vector<MilanLongInt> &privateQMsgType,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight, vector<MilanInt> &privateQOwner);
MilanLongInt* verDistance,
MilanLongInt* Mate, bool isAlreadyMatched(MilanLongInt node,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm, MilanLongInt StartIndex,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, MilanLongInt EndIndex,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, vector<MilanLongInt> &GMate,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ); MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap);
void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight, MilanLongInt computeCandidateMate(MilanLongInt adj1,
MilanLongInt* verDistance, MilanLongInt adj2,
MilanLongInt* Mate, MilanReal *edgeLocWeight,
MilanInt myRank, MilanInt numProcs, MilanInt icomm, MilanLongInt k,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, MilanLongInt *verLocInd,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, MilanLongInt StartIndex,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ); MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge, MilanLongInt *Mate,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight, map<MilanLongInt, MilanLongInt> &Ghost2LocalMap);
MilanLongInt* verDistance,
MilanLongInt* Mate, void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanInt myRank, MilanInt numProcs, MilanInt icomm, MilanLongInt StartIndex, MilanLongInt EndIndex,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent, MilanLongInt *numGhostEdgesPtr,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time, MilanLongInt *numGhostVerticesPtr,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ); MilanLongInt *S,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &Counter,
vector<MilanLongInt> &verGhostPtr,
vector<MilanLongInt> &verGhostInd,
vector<MilanLongInt> &tempCounter,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Message,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
MilanLongInt *&candidateMate,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void clean(MilanLongInt NLVer,
MilanInt myRank,
MilanLongInt MessageIndex,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus,
MilanInt BufferSize,
MilanLongInt *Buffer,
MilanLongInt msgActual,
MilanLongInt *msgActualSent,
MilanLongInt msgInd,
MilanLongInt *msgIndSent,
MilanLongInt NumMessagesBundled,
MilanReal *msgPercent);
void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanInt myRank,
MilanReal *edgeLocWeight,
MilanLongInt *candidateMate);
void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
MilanLongInt *candidateMate,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *Mate,
vector<MilanLongInt> &GMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void PROCESS_CROSS_EDGE(MilanLongInt *edge,
MilanLongInt *SPtr);
void processMatchedVertices(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void processMatchedVerticesAndSendMessages(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner,
MPI_Comm comm,
MilanLongInt *msgActual,
vector<MilanLongInt> &Message);
void sendBundledMessages(MilanLongInt *numGhostEdgesPtr,
MilanInt *BufferSizePtr,
MilanLongInt *Buffer,
vector<MilanLongInt> &PCumulative,
vector<MilanLongInt> &PMessageBundle,
vector<MilanLongInt> &PSizeInfoMessages,
MilanLongInt *PCounter,
MilanLongInt NumMessagesBundled,
MilanLongInt *msgActualPtr,
MilanLongInt *MessageIndexPtr,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus);
void processMessages(
MilanLongInt NLVer,
MilanLongInt *Mate,
MilanLongInt *candidateMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Counter,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *msgActualPtr,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *verLocPtr,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &Message,
MilanLongInt numGhostEdges,
MilanLongInt u,
MilanLongInt v,
MilanLongInt *SPtr,
vector<MilanLongInt> &U);
void extractUChunk(
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus

@ -72,12 +72,6 @@
#ifdef SERIAL_MPI #ifdef SERIAL_MPI
#else #else
//MPI type map
template<typename T> MPI_Datatype TypeMap();
template<> inline MPI_Datatype TypeMap<int64_t>() { return MPI_LONG_LONG; }
template<> inline MPI_Datatype TypeMap<int>() { return MPI_INT; }
template<> inline MPI_Datatype TypeMap<double>() { return MPI_DOUBLE; }
template<> inline MPI_Datatype TypeMap<float>() { return MPI_FLOAT; }
// DOUBLE PRECISION VERSION // DOUBLE PRECISION VERSION
//WARNING: The vertex block on a given rank is contiguous //WARNING: The vertex block on a given rank is contiguous

@ -0,0 +1,554 @@
#include "MatchBoxPC.h"
// ***********************************************************************
//
// MatchboxP: A C++ library for approximate weighted matching
// Mahantesh Halappanavar (hala@pnnl.gov)
// Pacific Northwest National Laboratory
//
// ***********************************************************************
//
// Copyright (2021) Battelle Memorial Institute
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// ************************************************************************
//////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// DOMINATING EDGES MODEL ///////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
/* Function : algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate()
*
* Date : New update: Feb 17, 2019, Richland, Washington.
* Date : Original development: May 17, 2009, E&CS Bldg.
*
* Purpose : Compute Approximate Maximum Weight Matching in Linear Time
*
* Args : inputMatrix - instance of Compressed-Col format of Matrix
* Mate - The Mate array
*
* Returns : By Value: (void)
* By Reference: Mate
*
* Comments : 1/2 Approx Algorithm. Picks the locally available heaviest edge.
* Assumption: The Mate Array is empty.
*/
/*
NLVer = #of vertices, NLEdge = #of edges
CSR/CSC/Compressed format: verLocPtr = Pointer, verLocInd = Index, edgeLocWeight = edge weights (positive real numbers)
verDistance = A vector of size |P|+1 containing the cumulative number of vertices per process
Mate = A vector of size |V_p| (local subgraph) to store the output (matching)
MPI: myRank, numProcs, comm,
Statistics: msgIndSent, msgActualSent, msgPercent : Size: |P| number of processes in the comm-world
Statistics: ph0_time, ph1_time, ph2_time: Runtimes
Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2)
*/
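/*
 * Illustrative example (not part of the library): for a path graph with 4 vertices
 * split across 2 ranks (2 vertices each), the arguments above could look like this
 * on rank 0, assuming 0-based global numbering and that NLEdge counts the local
 * adjacency entries (both assumptions of this sketch, not statements about the API):
 *
 *   verDistance   = { 0, 2, 4 }        // rank 0 owns vertices 0..1, rank 1 owns 2..3
 *   NLVer         = 2
 *   verLocPtr     = { 0, 1, 3 }        // CSR row pointers, NLVer+1 entries
 *   verLocInd     = { 1,   0, 2 }      // neighbours of vertex 0: {1}; of vertex 1: {0, 2}
 *   edgeLocWeight = { 1.0, 1.0, 2.0 }  // positive weights, one per adjacency entry
 *   NLEdge        = 3
 *   Mate          = { -1, -1 }         // output: one entry per local vertex
 *
 * Vertex 2 is a ghost vertex for rank 0; its owner is found via findOwnerOfGhost()
 * using verDistance.
 */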
//#define DEBUG_HANG_
#ifdef SERIAL_MPI
#else
// DOUBLE PRECISION VERSION
// WARNING: The vertex block on a given rank is contiguous
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent,
MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card)
{
/*
* verDistance: a vector of length numProcs+1;
* verDistance[i] is the first vertex index owned by the i-th process,
* verDistance[i + 1] - 1 is the last vertex index owned by the i-th process.
* NLVer: number of local vertices (verLocPtr has NLVer+1 entries)
* NLEdge: number of edges assigned to the current process
*
* The portion of the matrix assigned to the process is stored in
* Yale (CSR) notation:
* verLocInd: column indices of the nonzero entries
* verLocPtr: the i-th value is the position of the first element of the i-th row,
* and the (i+1)-th value is the position of the first element of the (i+1)-th row
*/
#if !defined(SERIAL_MPI)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Within algoEdgeApproxDominatingEdgesLinearSearchMessageBundling()";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") verDistance [" ;
for (int i = 0; i < numProcs; i++)
cout << verDistance[i] << "," << verDistance[i+1];
cout << "]\n";
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0) {
cout << "\n(" << myRank << ") verDistance [" ;
for (int i = 0; i < numProcs; i++)
cout << verDistance[i] << "," ;
cout << verDistance[numProcs]<< "]\n";
}
fflush(stdout);
#endif
MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank
MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank
MPI_Status computeStatus;
MilanLongInt msgActual = 0, msgInd = 0;
MilanReal heaviestEdgeWt = 0.0f; // Assumes positive weight
MilanReal startTime, finishTime;
startTime = MPI_Wtime();
// Data structures for sending and receiving messages:
vector<MilanLongInt> Message; // [ u, v, message_type ]
Message.resize(3, -1);
// Data structures for Message Bundling:
// Although up to two messages can be sent along any cross edge,
// only one message will be sent in the initialization phase -
// one of: REQUEST/FAILURE/SUCCESS
vector<MilanLongInt> QLocalVtx, QGhostVtx, QMsgType;
vector<MilanInt> QOwner; // Changed by Fabio to be an integer: addresses need to be integers!
MilanLongInt *PCounter = new MilanLongInt[numProcs];
for (int i = 0; i < numProcs; i++)
PCounter[i] = 0;
MilanLongInt NumMessagesBundled = 0;
// TODO: when the last computational section is refactored, this could be eliminated
MilanInt ghostOwner = 0; // Changed by Fabio to be an integer: addresses need to be integers!
MilanLongInt *candidateMate = nullptr;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")NV: " << NLVer << " Edges: " << NLEdge;
fflush(stdout);
cout << "\n(" << myRank << ")StartIndex: " << StartIndex << " EndIndex: " << EndIndex;
fflush(stdout);
#endif
// Other Variables:
MilanLongInt u = -1, v = -1, w = -1, i = 0;
MilanLongInt k = -1, adj1 = -1, adj2 = -1;
MilanLongInt k1 = -1, adj11 = -1, adj12 = -1;
MilanLongInt myCard = 0;
// Build the Ghost Vertex Set: Vg
map<MilanLongInt, MilanLongInt> Ghost2LocalMap; // Map each ghost vertex to a local vertex
vector<MilanLongInt> Counter; // Store the edge count for each ghost vertex
MilanLongInt numGhostVertices = 0, numGhostEdges = 0; // Number of ghost vertices and ghost edges
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")About to compute Ghost Vertices...";
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ")About to compute Ghost Vertices...";
fflush(stdout);
#endif
// Define Adjacency Lists for Ghost Vertices:
// cout<<"Building Ghost data structures ... \n\n";
vector<MilanLongInt> verGhostPtr, verGhostInd, tempCounter;
// Mate array for ghost vertices:
vector<MilanLongInt> GMate; // Proportional to the number of ghost vertices
MilanLongInt S;
MilanLongInt privateMyCard = 0;
vector<MilanLongInt> PCumulative, PMessageBundle, PSizeInfoMessages;
vector<MPI_Request> SRequest; // Requests that are used for each send message
vector<MPI_Status> SStatus; // Status of sent messages, used in MPI_Wait
MilanLongInt MessageIndex = 0; // Pointer for current message
MilanInt BufferSize;
MilanLongInt *Buffer;
vector<MilanLongInt> privateQLocalVtx, privateQGhostVtx, privateQMsgType;
vector<MilanInt> privateQOwner;
vector<MilanLongInt> U, privateU;
initialize(NLVer, NLEdge, StartIndex,
EndIndex, &numGhostEdges,
&numGhostVertices, &S,
verLocInd, verLocPtr,
Ghost2LocalMap, Counter,
verGhostPtr, verGhostInd,
tempCounter, GMate,
Message, QLocalVtx,
QGhostVtx, QMsgType, QOwner,
candidateMate, U,
privateU,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
finishTime = MPI_Wtime();
*ph0_time = finishTime - startTime; // Time taken for Phase-0: Initialization
#ifdef DEBUG_HANG_
cout << myRank << " Finished initialization" << endl;
fflush(stdout);
#endif
startTime = MPI_Wtime();
/////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// INITIALIZATION /////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
// Compute the Initial Matching Set:
/*
* OMP: PARALLEL_COMPUTE_CANDIDATE_MATE_B has been split off from
* PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to better parallelize
* the two.
* PARALLEL_COMPUTE_CANDIDATE_MATE_B is now fully parallel.
*/
PARALLEL_COMPUTE_CANDIDATE_MATE_B(NLVer,
verLocPtr,
verLocInd,
myRank,
edgeLocWeight,
candidateMate);
#ifdef DEBUG_HANG_
cout << myRank << " Finished Exposed Vertex" << endl;
fflush(stdout);
#if 0
cout << myRank << " candidateMate after parallelCompute " <<endl;
for (int i=0; i<NLVer; i++) {
cout << candidateMate[i] << " " ;
}
cout << endl;
#endif
#endif
/*
* PARALLEL_PROCESS_EXPOSED_VERTEX_B
* TODO: write comment
*
* TODO: Test when it's actually more efficient to execute this code
* in parallel.
*/
PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer,
candidateMate,
verLocInd,
verLocPtr,
StartIndex,
EndIndex,
Mate,
GMate,
Ghost2LocalMap,
edgeLocWeight,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
U,
privateU,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
tempCounter.clear(); // Do not need this any more
#ifdef DEBUG_HANG_
cout << myRank << " Finished Exposed Vertex" << endl;
fflush(stdout);
#if 0
cout << myRank << " Mate after Exposed Vertices " <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MATCHED VERTICES //////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
// TODO: what is the optimal UCHUNK size?
vector<MilanLongInt> UChunkBeingProcessed;
UChunkBeingProcessed.reserve(UCHUNK);
processMatchedVertices(NLVer,
UChunkBeingProcessed,
U,
privateU,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verLocPtr,
verLocInd,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
candidateMate,
GMate,
Mate,
Ghost2LocalMap,
edgeLocWeight,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
#ifdef DEBUG_HANG_
cout << myRank << " Finished Process Vertices" << endl;
fflush(stdout);
#if 0
cout << myRank << " Mate after Matched Vertices " <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
/////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////// SEND BUNDLED MESSAGES /////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
sendBundledMessages(&numGhostEdges,
&BufferSize,
Buffer,
PCumulative,
PMessageBundle,
PSizeInfoMessages,
PCounter,
NumMessagesBundled,
&msgActual,
&MessageIndex,
numProcs,
myRank,
comm,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
SRequest,
SStatus);
///////////////////////// END OF SEND BUNDLED MESSAGES //////////////////////////////////
finishTime = MPI_Wtime();
*ph1_time = finishTime - startTime; // Time taken for Phase-1
#ifdef DEBUG_HANG_
cout << myRank << " Finished sendBundles" << endl;
fflush(stdout);
#endif
*ph1_card = myCard; // Cardinality at the end of Phase-1
startTime = MPI_Wtime();
/////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// MAIN LOOP //////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
// Main While Loop:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Entering While(true) loop..";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
while (true) {
#ifdef DEBUG_HANG_
//if (myRank == 0)
cout << "\n(" << myRank << ") Main loop" << endl;
fflush(stdout);
#endif
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MATCHED VERTICES //////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
processMatchedVerticesAndSendMessages(NLVer,
UChunkBeingProcessed,
U,
privateU,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verLocPtr,
verLocInd,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
candidateMate,
GMate,
Mate,
Ghost2LocalMap,
edgeLocWeight,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner,
comm,
&msgActual,
Message);
///////////////////////// END OF PROCESS MATCHED VERTICES /////////////////////////
//// BREAK IF NO MESSAGES EXPECTED /////////
#ifdef DEBUG_HANG_
#if 0
cout << myRank << " Mate after ProcessMatchedAndSend phase "<<S <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Deciding whether to break: S= " << S << endl;
#endif
if (S == 0) {
#ifdef DEBUG_HANG_
cout << "\n(" << myRank << ") Breaking out" << endl;
fflush(stdout);
#endif
break;
}
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MESSAGES //////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
processMessages(NLVer,
Mate,
candidateMate,
Ghost2LocalMap,
GMate,
Counter,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&msgActual,
edgeLocWeight,
verDistance,
verLocPtr,
k,
verLocInd,
numProcs,
myRank,
comm,
Message,
numGhostEdges,
u,
v,
&S,
U);
///////////////////////// END OF PROCESS MESSAGES /////////////////////////////////
#ifdef DEBUG_HANG_
#if 0
cout << myRank << " Mate after ProcessMessages phase "<<S <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S;
fflush(stdout);
cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual;
fflush(stdout);
cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl;
fflush(stdout);
#endif
} // End of while (true)
clean(NLVer,
myRank,
MessageIndex,
SRequest,
SStatus,
BufferSize,
Buffer,
msgActual,
msgActualSent,
msgInd,
msgIndSent,
NumMessagesBundled,
msgPercent);
finishTime = MPI_Wtime();
*ph2_time = finishTime - startTime; // Time taken for Phase-2
*ph2_card = myCard; // Cardinality at the end of Phase-2
}
// End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate
#endif
#endif

@ -97,6 +97,8 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_c_dec_aggregator_tprol' name='amg_c_dec_aggregator_tprol'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',& call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle) & amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores ! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
! !
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then if (info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1 integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld' name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) & if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1) naggr = nlaggr(me+1)
ntaggr = sum(nlaggr) ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done. ! Ok first product done.
if (present(desc_ax)) then if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block block
call coo_prol%cp_to_coo(coo_restr,info) call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols()) call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols()) call coo_restr%set_ncols(desc_ax%get_local_cols())
end block end block
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999
@ -168,26 +173,27 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp() call coo_restr%transp()
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows() nrl = desc_ac%get_local_rows()
i=0 call coo_restr%fix(info)
i=coo_restr%get_nzeros()
! !
! Only keep local rows ! Only keep local rows
! !
do k=1, nzl search: do k=i,1,-1
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then if (coo_restr%ia(k) <= nrl) then
i = i+1 call coo_restr%set_nzeros(k)
coo_restr%val(i) = coo_restr%val(k) exit search
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
end if end if
end do end do search
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols()) call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999
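The rewritten filter in this hunk replaces the element-by-element compaction with a single backward scan: after coo_restr%fix() the COO entries are sorted by row index, so the rows to keep (row index <= nrl, i.e. the locally owned rows) form a prefix of the entry list, and it is enough to locate the last entry of that prefix and shrink the nonzero count. A hedged C++ sketch of the same idea on a generic, row-sorted COO triple (illustration only):

#include <cstdint>
#include <vector>

struct Coo {                       // minimal COO container, assumed sorted by row index
    std::vector<int64_t> ia, ja;
    std::vector<double>  val;
};

// Keep only the entries whose row index is <= nrl, assuming ia is already
// sorted in non-decreasing order (as after coo_restr%fix() in the hunk above).
void keep_local_rows(Coo& c, int64_t nrl)
{
    std::size_t nz = c.ia.size();
    while (nz > 0 && c.ia[nz - 1] > nrl)   // backward scan: drop the tail of non-local rows
        --nz;
    c.ia.resize(nz);
    c.ja.resize(nz);
    c.val.resize(nz);
}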

@ -72,7 +72,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_c_inner_mod use amg_c_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! Arguments ! Arguments
@ -85,7 +87,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info integer(psb_ipk_), intent(out) :: info
! Local variables ! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:) & ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:) integer(psb_lpk_), allocatable :: tmpaggr(:)
complex(psb_spk_), allocatable :: val(:), diag(:) complex(psb_spk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc1_map_bld' name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr) call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info) if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i) ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_tic(idx_soc1_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
icnt = 0 #if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. the total number of aggregates NAGGR;
! 2. how much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
! (An illustrative sketch of this encoding/renumbering appears after the
! end of this OpenMP block.)
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighbourhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain non-reproducibility, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-reproducibility is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii) i = idxs(ii)
if ((i<1).or.(i>nr)) then if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
if (ilaggr(i) == -(nr+1)) then if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
do k=1, ip do k=1, ip
ilaggr(icol(k)) = naggr ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -274,7 +461,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
enddo enddo
if (ip > 0) then if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
ilaggr(i) = naggr ilaggr(i) = naggr
do k=1, ip do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i)) nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999 goto 9999
@ -336,9 +528,13 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0 nlaggr(:) = 0
nlaggr(me+1) = naggr nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np)) call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free() call acsr%free()
call psb_erractionrestore(err_act) call psb_erractionrestore(err_act)
return return
@ -71,6 +71,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_c_inner_mod use amg_c_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
@ -99,6 +102,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc2_map_bld' name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info) diag = a%get_diag(info)
if(info /= psb_success_) then if(info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
call a%cp_to(muij) call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info) if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do end do
end do end do
!$omp end parallel do
! !
! Compute the 1-neighbour; mark strong links with +1, weak links with -1 ! Compute the 1-neighbour; mark strong links with +1, weak links with -1
! !
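!
! A minimal, standalone sketch (illustrative only; demo_soc2_mark is an assumed
! name, and plain double precision is used instead of the library kinds) of the
! scaled measure computed above and of the +1/-1 marking stored in s_neigh:
! mu_ij = |a_ij| / sqrt(|a_ii * a_jj|), strong when mu_ij >= theta.
!
program demo_soc2_mark
  implicit none
  real(kind(1.d0)) :: aij, aii, ajj, theta, mu, mark
  aij = -0.5d0; aii = 4.0d0; ajj = 4.0d0; theta = 0.1d0
  mu = abs(aij)/sqrt(abs(aii*ajj))   ! 0.5/4 = 0.125
  if (mu >= theta) then
    mark =  1.0d0                    ! strong link
  else
    mark = -1.0d0                    ! weak link
  end if
  write(*,*) 'mu =', mu, ' mark =', mark
end program demo_soc2_mark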
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0 !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = sone s_neigh_coo%val(k) = sone
else else
s_neigh_coo%val(ip) = -sone s_neigh_coo%val(k) = -sone
end if end if
else
s_neigh_coo%val(k) = -sone
end if end if
end do end do
end do end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip !write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip) call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info) call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i) ideg(i) = muij%irp(i+1) - muij%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighbourhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain non-reproducibility, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-reproducibility is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0 icnt = 0
step1: do ii=1, nr step1: do ii=1, nr
i = idxs(ii) i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',count(ilaggr == -(nr+1))
end if end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -294,6 +477,8 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr do i=1, nr
if (ilaggr(i) <= 0) then if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i)) nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then if (naggr > ncol) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
@ -140,6 +140,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_spk_) :: anorm, omega, tmp, dg, theta real(psb_spk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false. logical, parameter :: debug_new=.false.
character(len=80) :: filename character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld' name='amg_aggrmat_smth_bld'
info=psb_success_ info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows() nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates ! naggr: number of local aggregates
! nrow: local rows. ! nrow: local rows.
! !
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D ! Get the diagonal D
adiag = a%get_diag(info) adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! !
! Build the filtered matrix Af from A ! Build the filtered matrix Af from A
! !
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow do i=1, nrow
tmp = czero tmp = czero
jd = -1 jd = -1
@ -214,11 +235,13 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp acsrf%val(jd)=acsrf%val(jd)-tmp
end if end if
enddo enddo
!$OMP end parallel do
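!
! A standalone sketch (illustrative only; the 0.05 drop threshold and the name
! demo_filter_row are assumptions, not the criterion used by this routine) of the
! diagonal correction visible above: the entries dropped from row I are summed in
! TMP and then subtracted from the diagonal entry of the filtered matrix Af.
!
program demo_filter_row
  implicit none
  integer, parameter :: nz = 4
  complex(kind(1.d0)) :: val(nz), tmp
  integer :: ja(nz), jd, j, i
  i   = 2                               ! row index
  ja  = (/ 1, 2, 3, 4 /)
  val = (/ (0.01d0,0.d0), (4.d0,0.d0), (-1.d0,0.d0), (0.02d0,0.d0) /)
  tmp = (0.d0,0.d0)
  jd  = -1
  do j=1, nz
    if (ja(j) == i) jd = j              ! remember the diagonal position
    if (abs(val(j)) < 0.05d0) then      ! assumed drop criterion, for illustration
      tmp    = tmp + val(j)
      val(j) = (0.d0,0.d0)
    end if
  end do
  if (jd > 0) val(jd) = val(jd) - tmp   ! move the dropped mass onto the diagonal
  write(*,*) val
end program demo_filter_row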
! Take out zeroed terms ! Take out zeroed terms
call acsrf%clean_zeros(info) call acsrf%clean_zeros(info)
end if end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag) do i=1,size(adiag)
if (adiag(i) /= czero) then if (adiag(i) /= czero) then
adiag(i) = cone / adiag(i) adiag(i) = cone / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = cone adiag(i) = cone
end if end if
end do end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info) call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999 if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info) call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info) call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
! !
! Build the smoothed prolongator using either A or Af ! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol
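!
! A dense, standalone sketch (illustrative only; demo_smooth_prol and the tiny
! matrices are assumptions) of the smoothing step named above: the tentative
! prolongator Prol is multiplied by (I - omega*D^{-1}*Af), with D = diag(A).
!
program demo_smooth_prol
  implicit none
  integer, parameter :: n = 3, nagg = 2
  real(kind(1.d0)) :: af(n,n), dinv(n), prol(n,nagg), smoothed(n,nagg), omega
  integer :: i
  af   = reshape( (/ 4.d0,-1.d0, 0.d0,  -1.d0, 4.d0,-1.d0,  0.d0,-1.d0, 4.d0 /), (/ n, n /) )
  prol = reshape( (/ 1.d0, 1.d0, 0.d0,   0.d0, 0.d0, 1.d0 /), (/ n, nagg /) )
  do i=1,n
    dinv(i) = 1.d0/af(i,i)
  end do
  omega = 2.d0/3.d0
  do i=1,n          ! smoothed = (I - omega*D^{-1}*Af) * Prol, one row at a time
    smoothed(i,:) = prol(i,:) - omega*dinv(i)*matmul(af(i,:),prol)
  end do
  write(*,*) smoothed
end program demo_smooth_prol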
@ -279,8 +305,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1' & 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol) call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr) call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate ' & 'Done smooth_aggregate '
@ -97,6 +97,8 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_d_dec_aggregator_tprol' name='amg_d_dec_aggregator_tprol'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',& call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle) & amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores ! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
! !
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then if (info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')
@ -76,7 +76,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1 integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld' name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) & if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1) naggr = nlaggr(me+1)
ntaggr = sum(nlaggr) ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done. ! Ok first product done.
if (present(desc_ax)) then if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block block
call coo_prol%cp_to_coo(coo_restr,info) call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols()) call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols()) call coo_restr%set_ncols(desc_ax%get_local_cols())
end block end block
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999
@ -168,26 +173,27 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp() call coo_restr%transp()
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows() nrl = desc_ac%get_local_rows()
i=0 call coo_restr%fix(info)
i=coo_restr%get_nzeros()
! !
! Only keep local rows ! Only keep local rows
! !
do k=1, nzl search: do k=i,1,-1
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then if (coo_restr%ia(k) <= nrl) then
i = i+1 call coo_restr%set_nzeros(k)
coo_restr%val(i) = coo_restr%val(k) exit search
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
end if end if
end do end do search
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols()) call coo_restr%set_ncols(desc_a%get_local_cols())
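!
! A standalone sketch (illustrative only; demo_keep_local_rows is an assumed name)
! of the truncation above: after coo_restr%fix() the entries are in row-major
! order, so every row beyond NRL forms a trailing block, and scanning backwards
! for the last entry with ia(k) <= nrl and truncating there keeps exactly the
! local rows, without the element-by-element compaction of the old code.
!
program demo_keep_local_rows
  implicit none
  integer, parameter :: nz = 6, nrl = 3
  integer :: ia(nz) = (/ 1, 1, 2, 3, 4, 5 /)   ! row indices, already sorted
  integer :: k, new_nz
  new_nz = 0
  search: do k = nz, 1, -1
    if (ia(k) <= nrl) then
      new_nz = k                               ! analogue of coo_restr%set_nzeros(k)
      exit search
    end if
  end do search
  write(*,*) 'kept nonzeros:', new_nz          ! 4: entries in rows 4 and 5 dropped
end program demo_keep_local_rows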
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999
@ -72,7 +72,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_d_inner_mod use amg_d_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! Arguments ! Arguments
@ -85,7 +87,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info integer(psb_ipk_), intent(out) :: info
! Local variables ! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:) & ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:) integer(psb_lpk_), allocatable :: tmpaggr(:)
real(psb_dpk_), allocatable :: val(:), diag(:) real(psb_dpk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc1_map_bld' name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr) call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info) if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i) ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_tic(idx_soc1_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
icnt = 0 #if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighbourhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain non-reproducibility, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-reproducibility is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii) i = idxs(ii)
if ((i<1).or.(i>nr)) then if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
if (ilaggr(i) == -(nr+1)) then if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
do k=1, ip do k=1, ip
ilaggr(icol(k)) = naggr ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -274,7 +461,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
enddo enddo
if (ip > 0) then if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
ilaggr(i) = naggr ilaggr(i) = naggr
do k=1, ip do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i)) nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999 goto 9999
@ -336,9 +528,13 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0 nlaggr(:) = 0
nlaggr(me+1) = naggr nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np)) call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free() call acsr%free()
call psb_erractionrestore(err_act) call psb_erractionrestore(err_act)
return return
@ -71,6 +71,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_d_inner_mod use amg_d_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
@ -99,6 +102,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc2_map_bld' name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info) diag = a%get_diag(info)
if(info /= psb_success_) then if(info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
call a%cp_to(muij) call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info) if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do end do
end do end do
!$omp end parallel do
! !
! Compute the 1-neighbour; mark strong links with +1, weak links with -1 ! Compute the 1-neighbour; mark strong links with +1, weak links with -1
! !
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0 !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = done s_neigh_coo%val(k) = done
else else
s_neigh_coo%val(ip) = -done s_neigh_coo%val(k) = -done
end if end if
else
s_neigh_coo%val(k) = -done
end if end if
end do end do
end do end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip !write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip) call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info) call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i) ideg(i) = muij%irp(i+1) - muij%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighbourhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain non-reproducibility, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-reproducibility is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0 icnt = 0
step1: do ii=1, nr step1: do ii=1, nr
i = idxs(ii) i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',count(ilaggr == -(nr+1))
end if end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -294,6 +477,8 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr do i=1, nr
if (ilaggr(i) <= 0) then if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i)) nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then if (naggr > ncol) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
@ -140,6 +140,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_dpk_) :: anorm, omega, tmp, dg, theta real(psb_dpk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false. logical, parameter :: debug_new=.false.
character(len=80) :: filename character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld' name='amg_aggrmat_smth_bld'
info=psb_success_ info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows() nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates ! naggr: number of local aggregates
! nrow: local rows. ! nrow: local rows.
! !
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D ! Get the diagonal D
adiag = a%get_diag(info) adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! !
! Build the filtered matrix Af from A ! Build the filtered matrix Af from A
! !
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow do i=1, nrow
tmp = dzero tmp = dzero
jd = -1 jd = -1
@ -214,11 +235,13 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp acsrf%val(jd)=acsrf%val(jd)-tmp
end if end if
enddo enddo
!$OMP end parallel do
! Take out zeroed terms ! Take out zeroed terms
call acsrf%clean_zeros(info) call acsrf%clean_zeros(info)
end if end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag) do i=1,size(adiag)
if (adiag(i) /= dzero) then if (adiag(i) /= dzero) then
adiag(i) = done / adiag(i) adiag(i) = done / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = done adiag(i) = done
end if end if
end do end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info) call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999 if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info) call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info) call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
! !
! Build the smoothed prolongator using either A or Af ! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol
@ -279,8 +305,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1' & 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol) call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr) call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate ' & 'Done smooth_aggregate '

@ -97,6 +97,8 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_s_dec_aggregator_tprol' name='amg_s_dec_aggregator_tprol'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',& call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle) & amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores ! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
! !
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then if (info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1 integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld' name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) & if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1) naggr = nlaggr(me+1)
ntaggr = sum(nlaggr) ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done. ! Ok first product done.
if (present(desc_ax)) then if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block block
call coo_prol%cp_to_coo(coo_restr,info) call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols()) call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols()) call coo_restr%set_ncols(desc_ax%get_local_cols())
end block end block
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999
@ -168,26 +173,27 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp() call coo_restr%transp()
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows() nrl = desc_ac%get_local_rows()
i=0 call coo_restr%fix(info)
i=coo_restr%get_nzeros()
! !
! Only keep local rows ! Only keep local rows
! !
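! (Hedged editorial note: this backward search relies on the preceding fix()
!  call having left the COO entries sorted by row index, so that all entries
!  with row greater than NRL sit at the tail of the arrays and a single
!  cutoff via set_nzeros is enough to keep only the local rows.)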
do k=1, nzl search: do k=i,1,-1
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then if (coo_restr%ia(k) <= nrl) then
i = i+1 call coo_restr%set_nzeros(k)
coo_restr%val(i) = coo_restr%val(k) exit search
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
end if end if
end do end do search
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols()) call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999

@ -72,7 +72,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_s_inner_mod use amg_s_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! Arguments ! Arguments
@ -85,7 +87,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info integer(psb_ipk_), intent(out) :: info
! Local variables ! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:) & ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:) integer(psb_lpk_), allocatable :: tmpaggr(:)
real(psb_spk_), allocatable :: val(:), diag(:) real(psb_spk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc1_map_bld' name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr) call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info) if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i) ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_tic(idx_soc1_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
icnt = 0 #if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
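! Hypothetical worked example of the encoding used below (numbers are
! illustrative only, not taken from the patch): with NTHS=4 threads,
! suppose thread KK=2 owns rows BNDS(2)=11 .. BNDS(3)-1 and creates its
! third local aggregate, so LOCNAGGR(2)=3. The provisional label is
!   ITMP = (BNDS(2)-1+LOCNAGGR(2))*NTHS+KK = (10+3)*4+2 = 54.
! The renumbering loop after the prefix sum recovers the owning thread as
! KP = MOD(54,NTHS) = 2 and the local index as 54/NTHS - (BNDS(2)-1) = 3,
! then shifts it by LOCNAGGR(KP), the number of aggregates created by
! threads 0..KP-1, to obtain the final global aggregate number.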
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain un-repeatability, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
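! ITMP has wrapped around: the encoded label no longer fits the
! integer kind, so record the sentinel error value that is checked
! after the parallel region ('Overflow in encoding ILAGGR').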
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii) i = idxs(ii)
if ((i<1).or.(i>nr)) then if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
if (ilaggr(i) == -(nr+1)) then if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
do k=1, ip do k=1, ip
ilaggr(icol(k)) = naggr ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -274,7 +461,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
enddo enddo
if (ip > 0) then if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
ilaggr(i) = naggr ilaggr(i) = naggr
do k=1, ip do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i)) nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999 goto 9999
@ -336,9 +528,13 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0 nlaggr(:) = 0
nlaggr(me+1) = naggr nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np)) call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free() call acsr%free()
call psb_erractionrestore(err_act) call psb_erractionrestore(err_act)
return return

@ -71,6 +71,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_s_inner_mod use amg_s_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
@ -99,6 +102,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc2_map_bld' name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info) diag = a%get_diag(info)
if(info /= psb_success_) then if(info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
call a%cp_to(muij) call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info) if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do end do
end do end do
!$omp end parallel do
! !
! Compute the 1-neighbourhood; mark strong links with +1, weak links with -1 ! Compute the 1-neighbourhood; mark strong links with +1, weak links with -1
! !
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0 !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = sone s_neigh_coo%val(k) = sone
else else
s_neigh_coo%val(ip) = -sone s_neigh_coo%val(k) = -sone
end if end if
else
s_neigh_coo%val(k) = -sone
end if end if
end do end do
end do end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip !write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip) call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info) call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i) ideg(i) = muij%irp(i+1) - muij%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
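! Hypothetical worked example of the encoding used below (numbers are
! illustrative only, not taken from the patch): with NTHS=4 threads,
! suppose thread KK=2 owns rows BNDS(2)=11 .. BNDS(3)-1 and creates its
! third local aggregate, so LOCNAGGR(2)=3. The provisional label is
!   ITMP = (BNDS(2)-1+LOCNAGGR(2))*NTHS+KK = (10+3)*4+2 = 54.
! The renumbering loop after the prefix sum recovers the owning thread as
! KP = MOD(54,NTHS) = 2 and the local index as 54/NTHS - (BNDS(2)-1) = 3,
! then shifts it by LOCNAGGR(KP), the number of aggregates created by
! threads 0..KP-1, to obtain the final global aggregate number.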
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain un-repeatability, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
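! ITMP has wrapped around: the encoded label no longer fits the
! integer kind, so record the sentinel error value that is checked
! after the parallel region ('Overflow in encoding ILAGGR').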
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0 icnt = 0
step1: do ii=1, nr step1: do ii=1, nr
i = idxs(ii) i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',count(ilaggr == -(nr+1))
end if end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -294,6 +477,8 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr do i=1, nr
if (ilaggr(i) <= 0) then if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i)) nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then if (naggr > ncol) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

@ -140,6 +140,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_spk_) :: anorm, omega, tmp, dg, theta real(psb_spk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false. logical, parameter :: debug_new=.false.
character(len=80) :: filename character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld' name='amg_aggrmat_smth_bld'
info=psb_success_ info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows() nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates ! naggr: number of local aggregates
! nrow: local rows. ! nrow: local rows.
! !
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D ! Get the diagonal D
adiag = a%get_diag(info) adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! !
! Build the filtered matrix Af from A ! Build the filtered matrix Af from A
! !
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow do i=1, nrow
tmp = szero tmp = szero
jd = -1 jd = -1
@ -214,11 +235,13 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp acsrf%val(jd)=acsrf%val(jd)-tmp
end if end if
enddo enddo
!$OMP end parallel do
! Take out zeroed terms ! Take out zeroed terms
call acsrf%clean_zeros(info) call acsrf%clean_zeros(info)
end if end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag) do i=1,size(adiag)
if (adiag(i) /= szero) then if (adiag(i) /= szero) then
adiag(i) = sone / adiag(i) adiag(i) = sone / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = sone adiag(i) = sone
end if end if
end do end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info) call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999 if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info) call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info) call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
! !
! Build the smoothed prolongator using either A or Af ! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol
@ -279,8 +305,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1' & 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol) call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr) call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate ' & 'Done smooth_aggregate '

@ -97,6 +97,8 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_z_dec_aggregator_tprol' name='amg_z_dec_aggregator_tprol'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',& call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle) & amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores ! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer. ! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
! !
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info) call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info) if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then if (info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol') call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false. logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1 integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld' name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) & if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm") & idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1) naggr = nlaggr(me+1)
ntaggr = sum(nlaggr) ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done. ! Ok first product done.
if (present(desc_ax)) then if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block block
call coo_prol%cp_to_coo(coo_restr,info) call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols()) call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols()) call coo_restr%set_ncols(desc_ax%get_local_cols())
end block end block
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999
@ -168,26 +173,27 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp() call coo_restr%transp()
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows() nrl = desc_ac%get_local_rows()
i=0 call coo_restr%fix(info)
i=coo_restr%get_nzeros()
! !
! Only keep local rows ! Only keep local rows
! !
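! (Hedged editorial note: this backward search relies on the preceding fix()
!  call having left the COO entries sorted by row index, so that all entries
!  with row greater than NRL sit at the tail of the arrays and a single
!  cutoff via set_nzeros is enough to keep only the local rows.)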
do k=1, nzl search: do k=i,1,-1
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then if (coo_restr%ia(k) <= nrl) then
i = i+1 call coo_restr%set_nzeros(k)
coo_restr%val(i) = coo_restr%val(k) exit search
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
end if end if
end do end do search
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
nzl = coo_restr%get_nzeros() nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows()) call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols()) call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr) if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info) call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr') call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999 goto 9999

@ -72,7 +72,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_z_inner_mod use amg_z_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! Arguments ! Arguments
@ -85,7 +87,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info integer(psb_ipk_), intent(out) :: info
! Local variables ! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),& integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:) & ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:) integer(psb_lpk_), allocatable :: tmpaggr(:)
complex(psb_dpk_), allocatable :: val(:), diag(:) complex(psb_dpk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc1_map_bld' name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr) call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info) if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i) ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_tic(idx_soc1_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
icnt = 0 #if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
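! Hypothetical worked example of the encoding used below (numbers are
! illustrative only, not taken from the patch): with NTHS=4 threads,
! suppose thread KK=2 owns rows BNDS(2)=11 .. BNDS(3)-1 and creates its
! third local aggregate, so LOCNAGGR(2)=3. The provisional label is
!   ITMP = (BNDS(2)-1+LOCNAGGR(2))*NTHS+KK = (10+3)*4+2 = 54.
! The renumbering loop after the prefix sum recovers the owning thread as
! KP = MOD(54,NTHS) = 2 and the local index as 54/NTHS - (BNDS(2)-1) = 3,
! then shifts it by LOCNAGGR(KP), the number of aggregates created by
! threads 0..KP-1, to obtain the final global aggregate number.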
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! generates a certain un-repeatability, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
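! ITMP has wrapped around: the encoded label no longer fits the
! integer kind, so record the sentinel error value that is checked
! after the parallel region ('Overflow in encoding ILAGGR').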
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii) i = idxs(ii)
if ((i<1).or.(i>nr)) then if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
if (ilaggr(i) == -(nr+1)) then if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name) call psb_errpush(info,name)
goto 9999 cycle step1
!goto 9999
end if end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1) icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0) disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
do k=1, ip do k=1, ip
ilaggr(icol(k)) = naggr ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -274,7 +461,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
enddo enddo
if (ip > 0) then if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1 naggr = naggr + 1
ilaggr(i) = naggr ilaggr(i) = naggr
do k=1, ip do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i)) nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999 goto 9999
@ -336,9 +528,13 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0 nlaggr(:) = 0
nlaggr(me+1) = naggr nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np)) call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free() call acsr%free()
call psb_erractionrestore(err_act) call psb_erractionrestore(err_act)
return return

@ -71,6 +71,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod use psb_base_mod
use amg_base_prec_type use amg_base_prec_type
use amg_z_inner_mod use amg_z_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
@ -99,6 +102,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
name = 'amg_soc2_map_bld' name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols() ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows() nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows() nr = a%get_nrows()
nc = a%get_ncols() nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999 goto 9999
end if end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info) diag = a%get_diag(info)
if(info /= psb_success_) then if(info /= psb_success_) then
info=psb_err_from_subroutine_ info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! !
call a%cp_to(muij) call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info) if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j))) if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do end do
end do end do
!$omp end parallel do
! !
! Compute the 1-neighbourhood; mark strong links with +1, weak links with -1 ! Compute the 1-neighbourhood; mark strong links with +1, weak links with -1
! !
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros()) call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0 !$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1 do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k) j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = done s_neigh_coo%val(k) = done
else else
s_neigh_coo%val(ip) = -done s_neigh_coo%val(k) = -done
end if end if
else
s_neigh_coo%val(k) = -done
end if end if
end do end do
end do end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip !write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip) call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info) call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
idxs(i) = i idxs(i) = i
end do end do
!$omp end parallel do
else else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr do i=1, nr
ilaggr(i) = -(nr+1) ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i) ideg(i) = muij%irp(i+1) - muij%irp(i)
end do end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_) call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
! !
! Phase one: Start with disjoint groups. ! Phase one: Start with disjoint groups.
! !
naggr = 0 naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is
! applied to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
! (A standalone sketch of this numbering scheme follows the OpenMP block below.)
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighbourhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if nzcnt==0 (in which case the neighbourhood only
! contains I even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! introduces a certain non-reproducibility, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-reproducibility is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and they generate a correct aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0 icnt = 0
step1: do ii=1, nr step1: do ii=1, nr
i = idxs(ii) i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
endif endif
enddo step1 enddo step1
#endif
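The per-thread counting and renumbering described in the comments at the top of the OpenMP block above can be illustrated with a small standalone C++/OpenMP sketch. It uses made-up data (20 items, every even item starts an aggregate), omits the BNDS offsets, and is only a simplified illustration, not the library code.

// Simplified standalone sketch of the per-thread aggregate numbering: each
// thread counts its own aggregates, labels them with a temporary
// (position, owning thread) encoding, and an exclusive prefix sum over the
// per-thread counts turns that encoding into one global numbering.
#include <omp.h>
#include <cstdio>
#include <vector>

int main()
{
  const int n = 20;
  std::vector<int> label(n, -1);            // -1 = not aggregated yet
  std::vector<long long> locnaggr;          // per-thread aggregate counters
  int nths = 0;

  #pragma omp parallel
  {
    #pragma omp master
    {
      nths = omp_get_num_threads();
      locnaggr.assign(nths + 1, 0);
    }
    #pragma omp barrier
    int myth = omp_get_thread_num();

    #pragma omp for schedule(static)
    for (int i = 0; i < n; i++) {
      if (i % 2 == 0) {                     // pretend this item starts a new aggregate
        long long pos = ++locnaggr[myth];   // 1-based position within this thread
        label[i] = (int)(pos * nths + myth);// temporary encoding: (position, owning thread)
      }
    }                                       // implicit barrier of the omp for

    #pragma omp master
    {
      // Exclusive prefix sum: locnaggr[t] becomes the shift for thread t
      for (int t = nths; t >= 1; t--) locnaggr[t] = locnaggr[t - 1];
      locnaggr[0] = 0;
      for (int t = 1; t <= nths; t++) locnaggr[t] += locnaggr[t - 1];
    }
    #pragma omp barrier

    #pragma omp for schedule(static)
    for (int i = 0; i < n; i++) {
      if (label[i] > 0) {                   // decode: global index = position + shift of owner
        int kp = label[i] % nths;
        label[i] = (int)(label[i] / nths + locnaggr[kp]);
      }
    }
  }
  for (int i = 0; i < n; i++) printf("%d ", label[i]);
  printf("\n");
  return 0;
}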
if (debug_level >= psb_debug_outer_) then if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),& write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1)) & ' Check 1:',count(ilaggr == -(nr+1))
end if end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
! !
! Phase two: join the neighbours ! Phase two: join the neighbours
! !
!$omp workshare
tmpaggr = ilaggr tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr step2: do ii=1,nr
i = idxs(ii) i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if end if
end if end if
end do step2 end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
! !
! Phase three: sweep over leftovers, if any ! Phase three: sweep over leftovers, if any
! !
@ -294,6 +477,8 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3 end do step3
! Any leftovers? ! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr do i=1, nr
if (ilaggr(i) <= 0) then if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i)) nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes. ! other processes.
ilaggr(i) = -(nrglob+nr) ilaggr(i) = -(nrglob+nr)
else else
!$omp atomic write
info=psb_err_internal_error_ info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers') call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999 cycle
endif endif
end if end if
end do end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then if (naggr > ncol) then
info=psb_err_internal_error_ info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol') call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

@ -140,6 +140,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_dpk_) :: anorm, omega, tmp, dg, theta real(psb_dpk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false. logical, parameter :: debug_new=.false.
character(len=80) :: filename character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld' name='amg_aggrmat_smth_bld'
info=psb_success_ info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows() nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows() nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates ! naggr: number of local aggregates
! nrow: local rows. ! nrow: local rows.
! !
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D ! Get the diagonal D
adiag = a%get_diag(info) adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! !
! Build the filtered matrix Af from A ! Build the filtered matrix Af from A
! !
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow do i=1, nrow
tmp = zzero tmp = zzero
jd = -1 jd = -1
@ -214,11 +235,13 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp acsrf%val(jd)=acsrf%val(jd)-tmp
end if end if
enddo enddo
!$OMP end parallel do
! Take out zeroed terms ! Take out zeroed terms
call acsrf%clean_zeros(info) call acsrf%clean_zeros(info)
end if end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag) do i=1,size(adiag)
if (adiag(i) /= zzero) then if (adiag(i) /= zzero) then
adiag(i) = zone / adiag(i) adiag(i) = zone / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = zone adiag(i) = zone
end if end if
end do end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_') call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info) call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999 if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info) call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info) call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
! !
! Build the smoothed prolongator using either A or Af ! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol ! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol
@ -279,8 +305,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1') call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999 goto 9999
end if end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1' & 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol) call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr) call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),& & write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate ' & 'Done smooth_aggregate '

@ -0,0 +1,91 @@
#include "MatchBoxPC.h"
// TODO comment
void clean(MilanLongInt NLVer,
MilanInt myRank,
MilanLongInt MessageIndex,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus,
MilanInt BufferSize,
MilanLongInt *Buffer,
MilanLongInt msgActual,
MilanLongInt *msgActualSent,
MilanLongInt msgInd,
MilanLongInt *msgIndSent,
MilanLongInt NumMessagesBundled,
MilanReal *msgPercent)
{
// Cleanup Phase
#pragma omp parallel
{
#pragma omp master
{
#pragma omp task
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Waitall= " << endl;
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
cout << "\n(" << myRank << ") Waitall " << endl;
fflush(stdout);
#endif
//return;
MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]);
// MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer
if (BufferSize > 0)
{
MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer
free(Buffer); // Free the memory that was allocated
}
}
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")End of function to compute matching: " << endl;
fflush(stdout);
cout << "\n(" << myRank << ")myCardinality: " << myCard << endl;
fflush(stdout);
cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl;
fflush(stdout);
cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges;
cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2;
cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled;
cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd;
if (msgInd > 0)
{
cout << "\n(" << myRank << ") Percentage of messages bundled = " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n";
}
fflush(stdout);
#endif
#pragma omp task
{
*msgActualSent = msgActual;
*msgIndSent = msgInd;
if (msgInd > 0)
{
*msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0;
}
else
{
*msgPercent = 0;
}
}
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Done" << endl;
fflush(stdout);
#endif
}
}
}
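clean() finishes the buffered-send protocol set up elsewhere in the matching code: a buffer is attached before any MPI_Bsend and detached (which waits for pending sends) once the computation is over. A minimal hedged sketch of that lifecycle follows; the tag value and message contents are hypothetical, and MPI is assumed to be initialized elsewhere.

#include <mpi.h>
#include <cstdlib>

// Minimal sketch of the attach / Bsend / detach lifecycle (not the library code).
void bsend_lifecycle_sketch(MPI_Comm comm, int peer)
{
  int bufferSize = 0;
  MPI_Pack_size(3, MPI_LONG_LONG, comm, &bufferSize);
  bufferSize += MPI_BSEND_OVERHEAD;
  void *buffer = malloc(bufferSize);
  MPI_Buffer_attach(buffer, bufferSize);        // attach once, before any MPI_Bsend

  long long message[3] = {0, 1, 2};             // e.g. {local vertex, ghost vertex, type}
  MPI_Bsend(message, 3, MPI_LONG_LONG, peer, /*tag=*/7, comm);

  // Detaching waits until buffered messages have been delivered; only then
  // is it safe to free the buffer, exactly as clean() does above.
  MPI_Buffer_detach(&buffer, &bufferSize);
  free(buffer);
}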

@ -0,0 +1,73 @@
#include "MatchBoxPC.h"
/**
* Search for the candidate mate without checking whether the vertices are already matched.
* Returns the index of the edge with the heaviest weight.
* @param adj1
* @param adj2
* @param verLocInd
* @param edgeLocWeight
* @return
*/
MilanLongInt firstComputeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanLongInt *verLocInd,
MilanReal *edgeLocWeight)
{
MilanInt w = -1;
MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN
int finalK = -1; // -1 if the adjacency range is empty
for (int k = adj1; k < adj2; k++) {
if ((edgeLocWeight[k] > heaviestEdgeWt) ||
((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) {
heaviestEdgeWt = edgeLocWeight[k];
w = verLocInd[k];
finalK = k;
}
} // End of for loop
return finalK;
}
/**
* //TODO documentation
* @param adj1
* @param adj2
* @param edgeLocWeight
* @param k
* @param verLocInd
* @param StartIndex
* @param EndIndex
* @param GMate
* @param Mate
* @param Ghost2LocalMap
* @return
*/
MilanLongInt computeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanReal *edgeLocWeight,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap)
{
// Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
MilanInt w = -1;
MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN
for (k = adj1; k < adj2; k++) {
if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap))
continue;
if ((edgeLocWeight[k] > heaviestEdgeWt) ||
((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) {
heaviestEdgeWt = edgeLocWeight[k];
w = verLocInd[k];
}
} // End of for loop
// End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
return w;
}
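Both routines apply the same selection rule: among the neighbours (restricted to unmatched ones in computeCandidateMate), take the edge with the largest weight and break ties in favour of the neighbour with the larger index. A tiny standalone illustration of the rule on made-up CSR-style data (names are hypothetical):

#include <cstdio>

// Pick the heaviest neighbour; ties go to the larger vertex index.
int heaviestNeighbour(int adj1, int adj2, const int *nbrInd, const double *nbrWt)
{
  int w = -1;
  double best = -1.0e300;
  for (int k = adj1; k < adj2; k++) {
    if ((nbrWt[k] > best) || ((nbrWt[k] == best) && (w < nbrInd[k]))) {
      best = nbrWt[k];
      w = nbrInd[k];
    }
  }
  return w;                                   // -1 if the adjacency range is empty
}

int main()
{
  // One vertex with neighbours {3, 7, 5} and edge weights {2.0, 2.0, 1.5}
  int nbrInd[] = {3, 7, 5};
  double nbrWt[] = {2.0, 2.0, 1.5};
  printf("candidate mate = %d\n", heaviestNeighbour(0, 3, nbrInd, nbrWt)); // prints 7
  return 0;
}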

@ -80,9 +80,11 @@ class staticQueue
MilanLongInt squeueTail; MilanLongInt squeueTail;
MilanLongInt NumNodes; MilanLongInt NumNodes;
//FIXME I had to comment this piece of code in order to make everything work.
// why?
//Prevent Assignment and Pass by Value: //Prevent Assignment and Pass by Value:
staticQueue(const staticQueue& src); //staticQueue(const staticQueue& src);
staticQueue& operator=(const staticQueue& rhs); //staticQueue& operator=(const staticQueue& rhs);
public: public:
//Constructors and Destructors //Constructors and Destructors

@ -0,0 +1,31 @@
#include "MatchBoxPC.h"
void extractUChunk(
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU)
{
UChunkBeingProcessed.clear();
#pragma omp critical(U)
{
if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U
{
while (!privateU.empty())
{
UChunkBeingProcessed.push_back(privateU.back());
privateU.pop_back();
}
}
else
{
for (int i = 0; i < UCHUNK; i++)
{ // Pop the new nodes
if (U.empty())
break;
UChunkBeingProcessed.push_back(U.back());
U.pop_back();
}
}
} // End of critical(U)
}
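A hedged usage sketch of this chunked draining pattern, simplified to a sequential toy with made-up names (the real code calls extractUChunk from inside the parallel matching loop, with privateU refilled by queuesTransfer):

#include <cstdio>
#include <vector>

static const int CHUNK = 4;                   // stand-in for UCHUNK

void extractChunk(std::vector<long> &chunk, std::vector<long> &U, std::vector<long> &priv)
{
  chunk.clear();
  if (U.empty() && !priv.empty()) {           // prefer leftovers in the private list
    while (!priv.empty()) { chunk.push_back(priv.back()); priv.pop_back(); }
  } else {
    for (int i = 0; i < CHUNK && !U.empty(); i++) { chunk.push_back(U.back()); U.pop_back(); }
  }
}

int main()
{
  std::vector<long> U = {1, 2, 3, 4, 5, 6}, priv = {7, 8}, chunk;
  while (!U.empty() || !priv.empty()) {
    extractChunk(chunk, U, priv);
    for (long u : chunk) printf("%ld ", u);   // process the chunk
  }
  printf("\n");                               // prints: 6 5 4 3 2 1 8 7
  return 0;
}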

@ -0,0 +1,29 @@
#include "MatchBoxPC.h"
/// Find the owner of a ghost node:
MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance,
MilanInt myRank, MilanInt numProcs)
{
MilanLongInt mStartInd = mVerDistance[myRank];
MilanInt Start = 0;
MilanInt End = numProcs;
MilanInt Current = 0;
while (Start <= End)
{
Current = (End + Start) / 2;
// CASE-1:
if (mVerDistance[Current] == vtxIndex) return Current;
else // CASE 2:
if (mVerDistance[Current] > vtxIndex)
End = Current - 1;
else // CASE 3:
Start = Current + 1;
} // End of While()
if (mVerDistance[Current] > vtxIndex)
return (Current - 1);
return Current;
} // End of findOwnerOfGhost()
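mVerDistance[r] holds the first global vertex index owned by rank r, so the owner of a ghost vertex is the last rank whose start index does not exceed it, found here by binary search. A standalone worked example on hypothetical data (0-based indices, 3 ranks with 4 vertices each):

#include <cstdio>

int ownerOf(long v, const long *verDistance, int numProcs)
{
  int lo = 0, hi = numProcs, cur = 0;
  while (lo <= hi) {
    cur = (lo + hi) / 2;
    if (verDistance[cur] == v) return cur;
    if (verDistance[cur] > v) hi = cur - 1; else lo = cur + 1;
  }
  return (verDistance[cur] > v) ? cur - 1 : cur;
}

int main()
{
  long verDistance[] = {0, 4, 8, 12};         // 3 ranks, each owning 4 vertices
  printf("%d %d %d\n", ownerOf(0, verDistance, 3),
                       ownerOf(5, verDistance, 3),
                       ownerOf(11, verDistance, 3));  // prints: 0 1 2
  return 0;
}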

@ -0,0 +1,304 @@
#include "MatchBoxPC.h"
void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt StartIndex, MilanLongInt EndIndex,
MilanLongInt *numGhostEdges,
MilanLongInt *numGhostVertices,
MilanLongInt *S,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &Counter,
vector<MilanLongInt> &verGhostPtr,
vector<MilanLongInt> &verGhostInd,
vector<MilanLongInt> &tempCounter,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Message,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
MilanLongInt *&candidateMate,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt insertMe = 0;
MilanLongInt adj1, adj2;
int i, v, k, w;
// index that starts with zero to |Vg| - 1
map<MilanLongInt, MilanLongInt>::iterator storedAlready;
#pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(NUM_THREAD)
{
#pragma omp single
{
#ifdef TIME_TRACKER
double Ghost2LocalInitialization = MPI_Wtime();
#endif
/*
* OMP Ghost2LocalInitialization
* This loop analyzes all the edges and, when it finds a ghost edge,
* puts it in Ghost2LocalMap.
* A critical region is needed when inserting data in the map.
*
* Despite the critical region it is still worthwhile to
* parallelize this loop, because the critical region is executed
* only when a ghost edge is found, and ghost edges are a minority,
* circa 3.5% during the tests.
*/
#pragma omp task depend(out \
: *numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, *numGhostVertices)
{
#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \
: numGhostEdges[:1])
for (i = 0; i < NLEdge; i++)
{ // O(m) - Each edge stored twice
insertMe = verLocInd[i];
if ((insertMe < StartIndex) || (insertMe > EndIndex))
{ // Find a ghost
(*numGhostEdges)++;
#pragma omp critical
{
storedAlready = Ghost2LocalMap.find(insertMe);
if (storedAlready != Ghost2LocalMap.end())
{ // Has already been added
Counter[storedAlready->second]++; // Increment the counter
}
else
{ // Insert an entry for the ghost:
Ghost2LocalMap[insertMe] = *numGhostVertices; // Add a map entry
Counter.push_back(1); // Initialize the counter
(*numGhostVertices)++; // Increment the number of ghost vertices
} // End of else()
}
} // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) )
} // End of for(ghost vertices)
} // end of task depend
// *numGhostEdges = atomicNumGhostEdges;
#ifdef TIME_TRACKER
Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization;
fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")NGhosts:" << *numGhostVertices << " GhostEdges: " << *numGhostEdges;
if (!Ghost2LocalMap.empty())
{
cout << "\n(" << myRank << ")Final Map : on process ";
cout << "\n(" << myRank << ")Key \t Value \t Counter \n";
fflush(stdout);
storedAlready = Ghost2LocalMap.begin();
do
{
cout << storedAlready->second << " - " << storedAlready->first << " : " << Counter[storedAlready->second] << endl;
fflush(stdout);
storedAlready++;
} while (storedAlready != Ghost2LocalMap.end());
}
#endif
#pragma omp task depend(out \
: verGhostPtr, tempCounter, verGhostInd, GMate) depend(in \
: *numGhostVertices, *numGhostEdges)
{
// Initialize adjacency Lists for Ghost Vertices:
try
{
verGhostPtr.reserve(*numGhostVertices + 1); // Pointer Vector
tempCounter.reserve(*numGhostVertices); // Pointer Vector
verGhostInd.reserve(*numGhostEdges); // Index Vector
GMate.reserve(*numGhostVertices); // Ghost Mate Vector
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
// Initialize the Vectors:
verGhostPtr.resize(*numGhostVertices + 1, 0); // Pointer Vector
tempCounter.resize(*numGhostVertices, 0); // Temporary Counter
verGhostInd.resize(*numGhostEdges, -1); // Index Vector
GMate.resize(*numGhostVertices, -1); // Temporary Counter
verGhostPtr[0] = 0; // The first value
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Ghost Vertex Pointer: ";
fflush(stdout);
#endif
} // End of task
#pragma omp task depend(out \
: verGhostPtr) depend(in \
: Counter, *numGhostVertices)
{
#ifdef TIME_TRACKER
double verGhostPtrInitialization = MPI_Wtime();
#endif
for (i = 0; i < *numGhostVertices; i++)
{ // O(|Ghost Vertices|)
verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i];
#ifdef PRINT_DEBUG_INFO_
cout << verGhostPtr[i] << "\t";
fflush(stdout);
#endif
}
#ifdef TIME_TRACKER
verGhostPtrInitialization = MPI_Wtime() - verGhostPtrInitialization;
fprintf(stderr, "verGhostPtrInitialization time: %f\n", verGhostPtrInitialization);
#endif
} // End of task
#ifdef PRINT_DEBUG_INFO_
if (*numGhostVertices > 0)
cout << verGhostPtr[*numGhostVertices] << "\n";
fflush(stdout);
#endif
#ifdef TIME_TRACKER
double verGhostIndInitialization = MPI_Wtime();
#endif
/*
* OMP verGhostIndInitialization
*
* In this loop verGhostInd is initialized
* with the data related to ghost edges.
* The check to see whether a node is a ghost node is
* executed in parallel, and when a ghost node
* is found a critical region is entered.
*
* Despite the critical region it is still useful to
* parallelize the loop, because ghost nodes
* are a minority, hence the critical region is executed
* only a few times, circa 3.5% of the iterations in the tests.
*/
#pragma omp task depend(in \
: insertMe, Ghost2LocalMap, tempCounter, verGhostPtr) depend(out \
: verGhostInd)
{
#pragma omp taskloop num_tasks(NUM_THREAD)
for (v = 0; v < NLVer; v++)
{
adj1 = verLocPtr[v]; // Vertex Pointer
adj2 = verLocPtr[v + 1];
for (k = adj1; k < adj2; k++)
{
w = verLocInd[k]; // Get the adjacent vertex
if ((w < StartIndex) || (w > EndIndex))
{ // Find a ghost
#pragma omp critical
{
insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; // Where to insert
tempCounter[Ghost2LocalMap[w]]++; // Increment the counter
}
verGhostInd[insertMe] = v + StartIndex; // Add the adjacency
} // End of if((w < StartIndex) || (w > EndIndex))
} // End of for(k)
} // End of for (v)
} // end of taskloop
#ifdef TIME_TRACKER
verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization;
fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Ghost Vertex Index: ";
for (v = 0; v < *numGhostEdges; v++)
cout << verGhostInd[v] << "\t";
cout << endl;
fflush(stdout);
#endif
#pragma omp task depend(in \
: *numGhostEdges) depend(out \
: QLocalVtx, QGhostVtx, QMsgType, QOwner)
{
try
{
QLocalVtx.reserve(*numGhostEdges); // Local Vertex
QGhostVtx.reserve(*numGhostEdges); // Ghost Vertex
QMsgType.reserve(*numGhostEdges); // Message Type (Request/Failure)
QOwner.reserve(*numGhostEdges); // Owner of the ghost: compute once and use later
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
} // end of task
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Allocating CandidateMate.. ";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl;
fflush(stdout);
#endif
#pragma omp task depend(in \
: *numGhostVertices) depend(out \
: candidateMate, S, U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner)
{
// Allocate Data Structures:
/*
* candidateMate was a vector and has been replaced with an array;
* there is no clear benefit in using a vector here,
* so it was replaced with an array, which is slightly faster.
*/
candidateMate = new MilanLongInt[NLVer + (*numGhostVertices)];
*S = (*numGhostVertices); // Initialize S with number of Ghost Vertices
/*
* Create the Queue Data Structure for the Dominating Set
*
* I had to declare the staticQueue U before the parallel region
* to have it in the correct scope. Since we can't change the dimension
* of a staticQueue I had to destroy the previous object and instantiate
* a new one of the correct size.
*/
//new (&U) staticQueue(NLVer + (*numGhostVertices));
U.reserve(NLVer + (*numGhostVertices));
// Initialize the private vectors
privateQLocalVtx.reserve(*numGhostVertices);
privateQGhostVtx.reserve(*numGhostVertices);
privateQMsgType.reserve(*numGhostVertices);
privateQOwner.reserve(*numGhostVertices);
privateU.reserve(*numGhostVertices);
} // end of task
} // End of single region
} // End of parallel region
}
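The ghost adjacency built above (verGhostPtr/verGhostInd) follows the usual counting-sort construction: count the cross edges per ghost, prefix-sum the counts into pointers, then scatter the local endpoints using a temporary counter. A standalone sketch on made-up data (names are hypothetical):

#include <cstdio>
#include <utility>
#include <vector>

int main()
{
  // Each entry is (localVertex, ghostId) for one cross edge; 3 ghosts with ids 0..2.
  std::vector<std::pair<int,int>> crossEdges = {{10,1},{11,0},{12,1},{13,2},{14,1}};
  int numGhosts = 3;
  std::vector<int> counter(numGhosts, 0), ptr(numGhosts + 1, 0), ind(crossEdges.size(), -1);
  for (auto &e : crossEdges) counter[e.second]++;                       // count per ghost
  for (int g = 0; g < numGhosts; g++) ptr[g + 1] = ptr[g] + counter[g]; // prefix sum
  std::vector<int> tmp(numGhosts, 0);
  for (auto &e : crossEdges)                                            // scatter phase
    ind[ptr[e.second] + tmp[e.second]++] = e.first;
  for (int g = 0; g < numGhosts; g++) {
    printf("ghost %d:", g);
    for (int k = ptr[g]; k < ptr[g + 1]; k++) printf(" %d", ind[k]);
    printf("\n");                             // ghost 0: 11 / ghost 1: 10 12 14 / ghost 2: 13
  }
  return 0;
}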

@ -0,0 +1,46 @@
#include "MatchBoxPC.h"
/**
* //TODO documentation
* @param node
* @param StartIndex
* @param EndIndex
* @param GMate
* @param Mate
* @param Ghost2LocalMap
* @return
*/
bool isAlreadyMatched(MilanLongInt node,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap)
{
/*
#pragma omp critical(Mate)
{
if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex?
result = GMate[Ghost2LocalMap[node]] >= 0;// Already matched
} else { //A local vertex
result = (Mate[node - StartIndex] >= 0); // Already matched
}
}
*/
MilanLongInt val;
if ((node < StartIndex) || (node > EndIndex)) // if ghost vertex
{
#pragma omp atomic read
val = GMate[Ghost2LocalMap[node]];
return val >= 0; // Already matched
}
// If not ghost vertex
#pragma omp atomic read
val = Mate[node - StartIndex];
return val >= 0; // Already matched
}

@ -0,0 +1,27 @@
#include "MatchBoxPC.h"
void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanInt myRank,
MilanReal *edgeLocWeight,
MilanLongInt *candidateMate)
{
MilanLongInt v = -1;
#pragma omp parallel private(v) default(shared) num_threads(NUM_THREAD)
{
#pragma omp for schedule(static)
for (v = 0; v < NLVer; v++) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl;
fflush(stdout);
#endif
// Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
candidateMate[v] = firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight);
// End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
}
}
}

@ -0,0 +1,24 @@
#include "MatchBoxPC.h"
void PROCESS_CROSS_EDGE(MilanLongInt *edge,
MilanLongInt *S)
{
// Start: PARALLEL_PROCESS_CROSS_EDGE_B
MilanLongInt captureCounter;
#pragma omp atomic capture
captureCounter = --(*edge); // Decrement
//assert(captureCounter >= 0);
if (captureCounter == 0)
#pragma omp atomic
(*S)--; // Decrement S
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages";
fflush(stdout);
#endif
// End: PARALLEL_PROCESS_CROSS_EDGE_B
}
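The atomic capture above lets many threads decrement the per-ghost counter while guaranteeing that exactly one of them observes the transition to zero and decrements S. A minimal standalone sketch of the pattern (hypothetical counts, not the library code):

#include <omp.h>
#include <cstdio>

int main()
{
  long counter = 8;                  // e.g. messages still expected for one ghost vertex
  long S = 3;                        // e.g. number of ghost vertices still pending
  #pragma omp parallel for
  for (int i = 0; i < 8; i++) {
    long captured;
    #pragma omp atomic capture
    captured = --counter;            // decrement and read the new value atomically
    if (captured == 0) {             // only one thread observes exactly zero
      #pragma omp atomic
      S--;
    }
  }
  printf("counter=%ld S=%ld\n", counter, S);   // prints: counter=0 S=2
  return 0;
}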

@ -0,0 +1,195 @@
#include "MatchBoxPC.h"
void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
MilanLongInt *candidateMate,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *Mate,
vector<MilanLongInt> &GMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *S,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0;
MilanInt ghostOwner = 0, option, igw;
#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \
firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) \
default(shared) num_threads(NUM_THREAD)
{
#pragma omp for reduction(+ \
: PCounter[:numProcs], myCard \
[:1], msgInd \
[:1], NumMessagesBundled \
[:1]) \
schedule(static)
for (v = 0; v < NLVer; v++) {
option = -1;
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
k = candidateMate[v];
candidateMate[v] = verLocInd[k];
w = candidateMate[v];
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v + StartIndex << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0)
{
#pragma omp critical(processExposed)
{
if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) {
w = computeCandidateMate(verLocPtr[v],
verLocPtr[v + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v] = w;
}
if (w >= 0) {
(*myCard)++;
if ((w < StartIndex) || (w > EndIndex)) { // w is a ghost vertex
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) {
option = 1;
Mate[v] = w;
GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == (v + StartIndex)) {
option = 3;
Mate[v] = w; // v is local
Mate[w - StartIndex] = v + StartIndex; // w is local
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") ";
fflush(stdout);
#endif
} // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) )
} // End of Else
} // End of second if
} // End critical processExposed
} // End of if(w >=0)
else {
// This piece of code is executed only a very small number of times
adj11 = verLocPtr[v];
adj12 = verLocPtr[v + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
(*msgInd)++;
(*NumMessagesBundled)++;
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
privateQLocalVtx.push_back(v + StartIndex);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
}
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
switch (option)
{
case -1:
break;
case 1:
privateU.push_back(v + StartIndex);
privateU.push_back(w);
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S);
case 2:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message (291):";
cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
(*msgInd)++;
(*NumMessagesBundled)++;
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
privateQLocalVtx.push_back(v + StartIndex);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
default:
privateU.push_back(v + StartIndex);
privateU.push_back(w);
break;
}
} // End of for ( v=0; v < NLVer; v++ )
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
} // End of parallel region
}

@ -0,0 +1,294 @@
#include "MatchBoxPC.h"
void processMatchedVertices(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner;
int option;
MilanLongInt mateVal;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef COUNT_LOCAL_VERTEX
MilanLongInt localVertices = 0;
#endif
//#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, \
privateQMsgType, privateQOwner, UChunkBeingProcessed) \
default(shared) num_threads(NUM_THREAD) \
reduction(+ \
: msgInd[:1], PCounter \
[:numProcs], myCard \
[:1], NumMessagesBundled \
[:1])
{
while (!U.empty()) {
extractUChunk(UChunkBeingProcessed, U, privateU);
for (MilanLongInt u : UChunkBeingProcessed) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")u: " << u;
fflush(stdout);
#endif
if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices
#ifdef COUNT_LOCAL_VERTEX
localVertices++;
#endif
// Get the Adjacency list for u
adj1 = verLocPtr[u - StartIndex]; // Pointer
adj2 = verLocPtr[u - StartIndex + 1];
for (k = adj1; k < adj2; k++) {
option = -1;
v = verLocInd[k];
if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v];
fflush(stdout);
#endif
#pragma omp atomic read
mateVal = Mate[v - StartIndex];
// If the current vertex is pointing to a matched vertex and is not matched
if (mateVal < 0) {
#pragma omp critical
{
if (candidateMate[v - StartIndex] == u) {
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex],
verLocPtr[v - StartIndex + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0) {
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message:";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
#endif
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
option = 1;
Mate[v - StartIndex] = w; // v is a local vertex
GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == v) {
option = 3;
Mate[v - StartIndex] = w; // v is a local vertex
Mate[w - StartIndex] = v; // w is a local vertex
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else
option = 4; // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of If (candidateMate[v-StartIndex] == u
} // End of task
} // mateval < 0
} // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex:
else { // Neighbor is a ghost vertex
#pragma omp critical
{
if (candidateMate[NLVer + Ghost2LocalMap[v]] == u)
candidateMate[NLVer + Ghost2LocalMap[v]] = -1;
if (v != Mate[u - StartIndex])
option = 5; // u is local
} // End of critical
} // End of Else //A Ghost Vertex
switch (option)
{
case -1:
// No things to do
break;
case 1:
// Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr);
case 2:
// Found a dominating edge, it is a ghost
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
(*NumMessagesBundled)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
break;
case 4:
// Could not find a dominating vertex
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
(*NumMessagesBundled)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
break;
case 5:
default:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a success message: ";
cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n";
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
(*NumMessagesBundled)++;
PCounter[ghostOwner]++;
(*msgInd)++;
privateQLocalVtx.push_back(u);
privateQGhostVtx.push_back(v);
privateQMsgType.push_back(SUCCESS);
privateQOwner.push_back(ghostOwner);
break;
} // End of switch
} // End of inner for
}
} // End of outer for
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
#pragma omp critical(U)
{
U.insert(U.end(), privateU.begin(), privateU.end());
}
privateU.clear();
#pragma omp critical(sendMessageTransfer)
{
QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end());
QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end());
QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end());
QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end());
}
privateQLocalVtx.clear();
privateQGhostVtx.clear();
privateQMsgType.clear();
privateQOwner.clear();
} // End of while ( !U.empty() )
#ifdef COUNT_LOCAL_VERTEX
printf("Count local vertexes: %ld for thread %d of processor %d\n",
localVertices,
omp_get_thread_num(),
myRank);
#endif
} // End of parallel region
}
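The private queues (privateU, privateQLocalVtx, ...) and the merges under named critical sections follow a standard idiom: stage pushes in thread-private vectors and pay for synchronization only once per thread (or per chunk) when merging into the shared vectors. A toy sketch of the idiom with made-up names:

#include <omp.h>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> shared;
  #pragma omp parallel
  {
    std::vector<int> priv;                        // thread-private staging buffer
    #pragma omp for
    for (int i = 0; i < 100; i++)
      if (i % 7 == 0) priv.push_back(i);          // cheap, contention-free pushes
    #pragma omp critical(sendMessageTransfer)     // one merge per thread
    shared.insert(shared.end(), priv.begin(), priv.end());
  }
  printf("collected %zu items\n", shared.size()); // 15 items, order not deterministic
  return 0;
}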

@ -0,0 +1,308 @@
#include "MatchBoxPC.h"
//#define DEBUG_HANG_
void processMatchedVerticesAndSendMessages(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner,
MPI_Comm comm,
MilanLongInt *msgActual,
vector<MilanLongInt> &Message)
{
MilanLongInt initialSize = QLocalVtx.size();
MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner;
int option;
MilanLongInt mateVal;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef COUNT_LOCAL_VERTEX
MilanLongInt localVertices = 0;
#endif
//#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx,\
privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \
num_threads(NUM_THREAD) \
reduction(+ \
: msgInd[:1], PCounter \
[:numProcs], myCard \
[:1], NumMessagesBundled \
[:1], msgActual \
[:1])
{
while (!U.empty()) {
extractUChunk(UChunkBeingProcessed, U, privateU);
for (MilanLongInt u : UChunkBeingProcessed) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")u: " << u;
fflush(stdout);
#endif
if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices
#ifdef COUNT_LOCAL_VERTEX
localVertices++;
#endif
// Get the Adjacency list for u
adj1 = verLocPtr[u - StartIndex]; // Pointer
adj2 = verLocPtr[u - StartIndex + 1];
for (k = adj1; k < adj2; k++) {
option = -1;
v = verLocInd[k];
if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v];
fflush(stdout);
#endif
#pragma omp atomic read
mateVal = Mate[v - StartIndex];
// If the current vertex is pointing to a matched vertex and is not matched
if (mateVal < 0) {
#pragma omp critical
{
if (candidateMate[v - StartIndex] == u) {
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex],
verLocPtr[v - StartIndex + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0) {
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message:";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
#endif
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
option = 1;
Mate[v - StartIndex] = w; // v is a local vertex
GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == v) {
option = 3;
Mate[v - StartIndex] = w; // v is a local vertex
Mate[w - StartIndex] = v; // w is a local vertex
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else
option = 4; // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of If (candidateMate[v-StartIndex] == u
} // End of task
} // mateval < 0
} // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex:
else { // Neighbor is a ghost vertex
#pragma omp critical
{
if (candidateMate[NLVer + Ghost2LocalMap[v]] == u)
candidateMate[NLVer + Ghost2LocalMap[v]] = -1;
if (v != Mate[u - StartIndex])
option = 5; // u is local
} // End of critical
} // End of Else //A Ghost Vertex
switch (option)
{
case -1:
// No things to do
break;
case 1:
// Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr);
case 2:
// Found a dominating edge, it is a ghost
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = v; // LOCAL
// Message[1] = w; // GHOST
// Message[2] = REQUEST; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
break;
case 4:
// Could not find a dominating vertex
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = v; // LOCAL
// Message[1] = w; // GHOST
// Message[2] = FAILURE; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
break;
case 5:
default:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a success message: ";
cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n";
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = u; // LOCAL
// Message[1] = v; // GHOST
// Message[2] = SUCCESS; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(u);
privateQGhostVtx.push_back(v);
privateQMsgType.push_back(SUCCESS);
privateQOwner.push_back(ghostOwner);
break;
} // End of switch
} // End of inner for
}
} // End of outer for
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
} // End of while ( !U.empty() )
#ifdef COUNT_LOCAL_VERTEX
printf("Count local vertexes: %ld for thread %d of processor %d\n",
localVertices,
omp_get_thread_num(),
myRank);
#endif
} // End of parallel region
// Send the messages
#ifdef DEBUG_HANG_
cout << myRank<<" Sending: "<<QOwner.size()-initialSize<<" messages" <<endl;
#endif
for (int i = initialSize; i < QOwner.size(); i++) {
Message[0] = QLocalVtx[i];
Message[1] = QGhostVtx[i];
Message[2] = QMsgType[i];
ghostOwner = QOwner[i];
//MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
//cout << myRank<<" Sending to "<<ghostOwner<<endl;
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
}
#ifdef DEBUG_HANG_
cout << myRank<<" Done sending messages"<<endl;
#endif
}

@ -0,0 +1,315 @@
#include "MatchBoxPC.h"
//#define DEBUG_HANG_
void processMessages(
MilanLongInt NLVer,
MilanLongInt *Mate,
MilanLongInt *candidateMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Counter,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *msgActual,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *verLocPtr,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &Message,
MilanLongInt numGhostEdges,
MilanLongInt u,
MilanLongInt v,
MilanLongInt *S,
vector<MilanLongInt> &U)
{
//#define PRINT_DEBUG_INFO_
MilanInt Sender;
MPI_Status computeStatus;
MilanLongInt bundleSize, w;
MilanLongInt adj11, adj12, k1;
MilanLongInt ghostOwner;
int error_codeC;
error_codeC = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
char error_message[MPI_MAX_ERROR_STRING];
int message_length;
MilanLongInt message_type = 0;
// Buffer to receive bundled messages
// Maximum messages that can be received from any processor is
// twice the edge cut: REQUEST; REQUEST+(FAILURE/SUCCESS)
vector<MilanLongInt> ReceiveBuffer;
try
{
ReceiveBuffer.reserve(numGhostEdges * 2 * 3); // Three integers per cross edge
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
#ifdef PRINT_DEBUG_INFO_
cout
<< "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")About to begin Message processing phase ... *S=" << *S << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
// BLOCKING RECEIVE:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << " Waiting for blocking receive..." << endl;
fflush(stdout);
fflush(stdout);
#endif
//cout << myRank<<" Receiving ...";
error_codeC = MPI_Recv(&Message[0], 3, TypeMap<MilanLongInt>(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus);
if (error_codeC != MPI_SUCCESS)
{
MPI_Error_string(error_codeC, error_message, &message_length);
cout << "\n*Error in call to MPI_Receive on Slave: " << error_message << "\n";
fflush(stdout);
}
Sender = computeStatus.MPI_SOURCE;
//cout << " ...from "<<Sender << endl;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received message from Process " << Sender << " Type= " << Message[2] << endl;
fflush(stdout);
#endif
if (Message[2] == SIZEINFO) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl;
fflush(stdout);
#endif
bundleSize = Message[0]; //#of integers in the message
// Build the Message Buffer:
if (!ReceiveBuffer.empty())
ReceiveBuffer.clear(); // Empty it out first
ReceiveBuffer.resize(bundleSize, -1); // Initialize
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message Bundle Before: " << endl;
for (int i = 0; i < bundleSize; i++)
cout << ReceiveBuffer[i] << ",";
cout << endl;
fflush(stdout);
#endif
// Receive the message
//cout << myRank<<" Receiving from "<<Sender<<endl;
error_codeC = MPI_Recv(&ReceiveBuffer[0], bundleSize, TypeMap<MilanLongInt>(), Sender, BundleTag, comm, &computeStatus);
if (error_codeC != MPI_SUCCESS) {
MPI_Error_string(error_codeC, error_message, &message_length);
cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n";
fflush(stdout);
}
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message Bundle After: " << endl;
for (int i = 0; i < bundleSize; i++)
cout << ReceiveBuffer[i] << ",";
cout << endl;
fflush(stdout);
#endif
} else { // Just a single message:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl;
fflush(stdout);
#endif
// Add the current message to Queue:
bundleSize = 3; //#of integers in the message
// Build the Message Buffer:
if (!ReceiveBuffer.empty())
ReceiveBuffer.clear(); // Empty it out first
ReceiveBuffer.resize(bundleSize, -1); // Initialize
ReceiveBuffer[0] = Message[0]; // u
ReceiveBuffer[1] = Message[1]; // v
ReceiveBuffer[2] = Message[2]; // message_type
}
#ifdef DEBUG_GHOST_
if ((v < StartIndex) || (v > EndIndex)) {
cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl;
fflush(stdout);
}
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing message: u= " << u << " v= " << v << " Type= " << message_type << endl;
fflush(stdout);
#endif
// Most of the time bundleSize == 3, so it is not worth parallelizing this loop
for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) {
u = ReceiveBuffer[bundleCounter - 3]; // GHOST
v = ReceiveBuffer[bundleCounter - 2]; // LOCAL
message_type = ReceiveBuffer[bundleCounter - 1]; // TYPE
// CASE I: REQUEST
if (message_type == REQUEST) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is REQUEST" << endl;
fflush(stdout);
#endif
#ifdef DEBUG_GHOST_
if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) {
cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl;
fflush(stdout);
}
#endif
if (Mate[v - StartIndex] == -1) {
// Process only if not already matched (v is local)
candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost
if (candidateMate[v - StartIndex] == u) {
GMate[Ghost2LocalMap[u]] = v; // u is ghost
Mate[v - StartIndex] = u; // v is local
U.push_back(v);
U.push_back(u);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl;
fflush(stdout);
#endif
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S);
} // End of if ( candidateMate[v-StartIndex] == u )
} // End of if ( Mate[v] == -1 )
} // End of REQUEST
else { // CASE II: SUCCESS
if (message_type == SUCCESS) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is SUCCESS" << endl;
fflush(stdout);
#endif
GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a dummy Mate so that we do not process u (a ghost) again
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S);
#ifdef DEBUG_GHOST_
if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) {
cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl;
fflush(stdout);
}
#endif
if (Mate[v - StartIndex] == -1) {
// Process only if not already matched ( v is local)
if (candidateMate[v - StartIndex] == u) {
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k,
verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0) {
if ((w < StartIndex) || (w > EndIndex)) {
// w is a ghost
// Build the Message Packet:
Message[0] = v; // LOCAL
Message[1] = w; // GHOST
Message[2] = REQUEST; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
//assert(ghostOwner != -1);
//assert(ghostOwner != myRank);
//cout << myRank<<" Sending to "<<ghostOwner<<endl;
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgInd)++;
(*msgActual)++;
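// If the ghost w already has v recorded as its candidate (set while handling an
// earlier message), the match (v,w) can be committed locally right away instead
// of waiting for a reply from w's owner.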
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
Mate[v - StartIndex] = w; // v is local
GMate[Ghost2LocalMap[w]] = v; // w is ghost
U.push_back(v);
U.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl;
fflush(stdout);
#endif
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S);
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == v) {
Mate[v - StartIndex] = w; // v is local
Mate[w - StartIndex] = v; // w is local
// Q.push_back(u);
U.push_back(v);
U.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl;
fflush(stdout);
#endif
} // End of if (candidateMate[w] == v)
} // End of Else
} // End of if(w >=0)
else { // No dominant edge found
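// v found no dominating edge: send FAILURE to every ghost neighbour so its
// owner can stop waiting on this cross edge (handled as CASE III on the receiving side).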
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) {
// A ghost
// Build the Message Packet:
Message[0] = v; // LOCAL
Message[1] = w; // GHOST
Message[2] = FAILURE; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
//assert(ghostOwner != -1);
//assert(ghostOwner != myRank);
//cout << myRank<<" Sending to "<<ghostOwner<<endl;
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgInd)++;
(*msgActual)++;
} // End of if(GHOST)
} // End of for loop
} // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of if ( candidateMate[v-StartIndex] == u )
} // End of if ( Mate[v] == -1 )
} // End of if ( message_type == SUCCESS )
else {
// CASE III: FAILURE
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is FAILURE" << endl;
fflush(stdout);
#endif
GMate[Ghost2LocalMap[u]] = EndIndex + 1; // u is a ghost: set a dummy mate so that we do not process it anymore
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter
} // End of else: CASE III
} // End of else: CASE I
}
return;
}
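The owner lookups above go through the vertex distribution array verDistance; the usual convention is that process p owns the contiguous block of global vertices [verDistance[p], verDistance[p+1]). The real findOwnerOfGhost is defined elsewhere in MatchBoxPC; the following is only an illustrative sketch of such a lookup (the linear scan and the function name are assumptions, not the library's implementation):

    // Hypothetical sketch: find the rank owning global vertex w, given a
    // non-decreasing distribution array with numProcs+1 entries.
    MilanInt findOwnerSketch(MilanLongInt w, const MilanLongInt *verDistance, MilanInt numProcs) {
        for (MilanInt p = 0; p < numProcs; p++) {
            if ((verDistance[p] <= w) && (w < verDistance[p + 1]))
                return p; // w falls in p's contiguous block
        }
        return -1; // out of range: no owner found
    }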

@ -0,0 +1,36 @@
#include "MatchBoxPC.h"
void queuesTransfer(vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
#pragma omp critical(U)
{
U.insert(U.end(), privateU.begin(), privateU.end());
}
privateU.clear();
#pragma omp critical(sendMessageTransfer)
{
QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end());
QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end());
QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end());
QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end());
}
privateQLocalVtx.clear();
privateQGhostVtx.clear();
privateQMsgType.clear();
privateQOwner.clear();
}
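A typical use of this helper (a sketch, not taken from the repository) is at the end of a thread's chunk of work: each thread accumulates vertices and outgoing messages in its private vectors and then flushes them into the shared queues in one shot, so each critical section is entered once per thread rather than once per pushed element. The shared/private vector names below mirror the signature above; nWork and the loop body are placeholders:

    #include "MatchBoxPC.h"
    // Illustrative only: shared queues plus per-thread private buffers,
    // flushed once per thread via queuesTransfer.
    vector<MilanLongInt> U, QLocalVtx, QGhostVtx, QMsgType;
    vector<MilanInt> QOwner;
    long nWork = 1000; // example work count
    #pragma omp parallel
    {
        vector<MilanLongInt> privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType;
        vector<MilanInt> privateQOwner;
        #pragma omp for nowait
        for (long i = 0; i < nWork; i++) {
            // ... append to the private vectors while processing item i ...
        }
        // One critical section per thread instead of one per element.
        queuesTransfer(U, privateU, QLocalVtx, QGhostVtx, QMsgType, QOwner,
                       privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner);
    }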

@ -0,0 +1,209 @@
#include "MatchBoxPC.h"
void sendBundledMessages(MilanLongInt *numGhostEdges,
MilanInt *BufferSize,
MilanLongInt *Buffer,
vector<MilanLongInt> &PCumulative,
vector<MilanLongInt> &PMessageBundle,
vector<MilanLongInt> &PSizeInfoMessages,
MilanLongInt *PCounter,
MilanLongInt NumMessagesBundled,
MilanLongInt *msgActual,
MilanLongInt *msgInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus)
{
MilanLongInt myIndex = 0, numMessagesToSend;
MilanInt i = 0, OneMessageSize = 0;
#ifdef DEBUG_HANG_
if (myRank == 0) {
cout << "\n(" << myRank << ") Send Bundles" << endl;
fflush(stdout);
}
#endif
#pragma omp parallel private(i) default(shared) num_threads(NUM_THREAD)
{
#pragma omp master
{
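// The setup below is split into OpenMP tasks with depend clauses: allocations,
// the prefix sum over PCounter, the bundle packing and the ISends are separate
// tasks, so independent pieces may overlap while the declared dependencies
// keep the packing/sending order intact.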
// Data structures for Bundled Messages:
#pragma omp task depend(inout \
: PCumulative, PMessageBundle, PSizeInfoMessages) depend(in \
: NumMessagesBundled, numProcs)
{
try {
PMessageBundle.reserve(NumMessagesBundled * 3); // Three integers per message
PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure
PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize
PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable
PSizeInfoMessages.resize(numProcs * 3, 0);
}
#pragma omp task depend(inout \
: PCumulative) depend(in \
: PCounter)
{
for (i = 0; i < numProcs; i++)
PCumulative[i + 1] = PCumulative[i] + PCounter[i];
}
#pragma omp task depend(inout \
: PCounter)
{
// Reuse PCounter to keep track of how many messages were inserted:
for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses need to be integers!
PCounter[i] = 0;
}
// Build the Message Bundle packet:
#pragma omp task depend(in \
: PCounter, QLocalVtx, QGhostVtx, QMsgType, QOwner, PMessageBundle, PCumulative) depend(out \
: myIndex, PMessageBundle, PCounter)
{
for (i = 0; i < NumMessagesBundled; i++) {
myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3;
PMessageBundle[myIndex + 0] = QLocalVtx[i];
PMessageBundle[myIndex + 1] = QGhostVtx[i];
PMessageBundle[myIndex + 2] = QMsgType[i];
PCounter[QOwner[i]]++;
}
}
// Send the Bundled Messages: Use ISend
#pragma omp task depend(out \
: SRequest, SStatus)
{
try
{
SRequest.reserve(numProcs * 2); // At most two messages per processor
SStatus.reserve(numProcs * 2); // At most two messages per processor
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
}
// Send the Messages
#pragma omp task depend(inout \
: SRequest, PSizeInfoMessages, PCumulative) depend(out \
: *msgActual, *msgInd)
{
for (i = 0; i < numProcs; i++) { // Changed by Fabio to be an integer, addresses need to be integers!
if (i == myRank) // Do not send anything to yourself
continue;
// Send the Message with information about the size of next message:
// Build the Message Packet:
PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message
PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet
PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE
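// The fixed-size SIZEINFO packet lets the receiver learn how large the
// following BundleTag message is before posting the matching receive.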
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl;
fflush(stdout);
#endif
if (PSizeInfoMessages[i * 3 + 0] > 0)
{ // Send only if it is a nonempty packet
MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap<MilanLongInt>(), i, ComputeTag, comm,
&SRequest[(*msgInd)]);
(*msgActual)++;
(*msgInd)++;
// Now Send the message with the data packet:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl;
for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++)
cout << PMessageBundle[k] << ",";
cout << endl;
fflush(stdout);
#endif
MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0],
TypeMap<MilanLongInt>(), i, BundleTag, comm, &SRequest[(*msgInd)]);
(*msgInd)++;
} // End of if size > 0
}
}
#pragma omp task depend(inout \
: PCumulative, QLocalVtx, QGhostVtx, QMsgType, QOwner)
{
// Free up temporary memory:
PCumulative.clear();
QLocalVtx.clear();
QGhostVtx.clear();
QMsgType.clear();
QOwner.clear();
}
#pragma omp task depend(inout : OneMessageSize, *BufferSize) depend(out : numMessagesToSend) depend(in : *numGhostEdges)
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges;
cout << "\n(" << myRank << ")Total number of potential message X 2 = " << *numGhostEdges * 2;
cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled;
if (*numGhostEdges > 0)
{
cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(*numGhostEdges * 2)) * 100.0 << "% \n";
}
fflush(stdout);
#endif
// Allocate memory for MPI Send messages:
/* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! */
OneMessageSize = 0;
MPI_Pack_size(3, TypeMap<MilanLongInt>(), comm, &OneMessageSize); // Size of one message packet
// How many messages to send?
// Potentially three kinds of messages will be sent/received:
// Request, Success, Failure.
// But only two will be sent from a given processor.
// Subtract the number of messages that have already been sent as bundled messages:
numMessagesToSend = (*numGhostEdges) * 2 - NumMessagesBundled;
*BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend;
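// Note: MPI_Bsend needs the attached buffer to provide, for each pending
// buffered send, room for the packed message plus MPI_BSEND_OVERHEAD; hence
// the (OneMessageSize + MPI_BSEND_OVERHEAD) term per remaining message.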
}
#pragma omp task depend(out : Buffer) depend(in : *BufferSize)
{
Buffer = 0;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize;
cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD;
cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges;
cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend;
cout << "\n(" << myRank << ")BufferSize = " << (*BufferSize);
cout << "\n(" << myRank << ")Attaching Buffer on.. ";
fflush(stdout);
#endif
if ((*BufferSize) > 0)
{
Buffer = (MilanLongInt *)malloc((*BufferSize)); // Allocate memory
if (Buffer == 0)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n";
cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n";
exit(1);
}
MPI_Buffer_attach(Buffer, *BufferSize); // Attach the Buffer
}
}
}
}
}
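The bundle packing above is a counting-sort placement: PCumulative is the exclusive prefix sum of the per-destination counters (a CSR-style row pointer over ranks), and PCounter is reused as a running offset inside each destination's slot. A small self-contained sketch of the same pattern, with plain standard-library types and illustrative names:

    #include <vector>
    // Illustrative sketch of the CSR-style placement used for PMessageBundle.
    std::vector<long> packByOwner(const std::vector<int> &owner,     // destination of each item
                                  const std::vector<long> &payload,  // one value per item
                                  int nDest) {
        std::vector<long> start(nDest + 1, 0), offset(nDest, 0);
        for (int d : owner) start[d + 1]++;                       // count items per destination
        for (int p = 0; p < nDest; p++) start[p + 1] += start[p]; // exclusive prefix sum
        std::vector<long> packed(owner.size());
        for (size_t i = 0; i < owner.size(); i++) {
            int d = owner[i];
            packed[start[d] + offset[d]++] = payload[i];          // place item in its slot
        }
        return packed;                                            // contiguous per destination
    }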

@ -109,6 +109,8 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_cspmat_type) :: ac, op_restr, op_prol type(psb_cspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_c_onelev_mat_asb' name='amg_c_onelev_mat_asb'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',& call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol) & amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation ! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! algorithm specified by lv%iprcparm(amg_aggr_prol_)
! !
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999 goto 9999
@ -151,14 +160,17 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for ! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol ! ac, op_restr and op_prol
! !
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) & if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,& & call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info) & lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999 goto 9999

@ -109,6 +109,8 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_dspmat_type) :: ac, op_restr, op_prol type(psb_dspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_d_onelev_mat_asb' name='amg_d_onelev_mat_asb'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',& call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol) & amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation ! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! algorithm specified by lv%iprcparm(amg_aggr_prol_)
! !
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999 goto 9999
@ -151,14 +160,17 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for ! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol ! ac, op_restr and op_prol
! !
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) & if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,& & call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info) & lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999 goto 9999

@ -109,6 +109,8 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_sspmat_type) :: ac, op_restr, op_prol type(psb_sspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_s_onelev_mat_asb' name='amg_s_onelev_mat_asb'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',& call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol) & amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation ! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! algorithm specified by lv%iprcparm(amg_aggr_prol_)
! !
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999 goto 9999
@ -151,14 +160,17 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for ! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol ! ac, op_restr and op_prol
! !
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) & if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,& & call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info) & lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999 goto 9999

@ -109,6 +109,8 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_zspmat_type) :: ac, op_restr, op_prol type(psb_zspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_z_onelev_mat_asb' name='amg_z_onelev_mat_asb'
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_ info = psb_success_
ctxt = desc_a%get_context() ctxt = desc_a%get_context()
call psb_info(ctxt,me,np) call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',& call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol) & amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation ! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_) ! algorithm specified by lv%iprcparm(amg_aggr_prol_)
! !
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,& call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info) & lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb') call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999 goto 9999
@ -151,14 +160,17 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for ! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol ! ac, op_restr and op_prol
! !
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) & if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,& & call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info) & lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_) if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,& if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info) & ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld') call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999 goto 9999

@ -56,6 +56,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else else

@ -56,6 +56,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='c_gs_solver_bld', ch_err character(len=20) :: name='c_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else else
info = psb_err_missing_override_method_ info = psb_err_missing_override_method_

@ -56,6 +56,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else else

@ -56,6 +56,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_gs_solver_bld', ch_err character(len=20) :: name='d_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else else
info = psb_err_missing_override_method_ info = psb_err_missing_override_method_

@ -56,6 +56,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else else

@ -56,6 +56,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='s_gs_solver_bld', ch_err character(len=20) :: name='s_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else else
info = psb_err_missing_override_method_ info = psb_err_missing_override_method_

@ -56,6 +56,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else else

@ -56,6 +56,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='z_gs_solver_bld', ch_err character(len=20) :: name='z_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_ info=psb_success_
call psb_erractionsave(err_act) call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np) call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) & if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start' & write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows() n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to ! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother. ! be handled by the outer Jacobi smoother.
! !
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u) call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else else
info = psb_err_missing_override_method_ info = psb_err_missing_override_method_

@ -0,0 +1,25 @@
cd amgprec/impl/aggregator/
rm MatchBoxPC.o
rm sendBundledMessages.o
rm initialize.o
rm extractUChunk.o
rm isAlreadyMatched.o
rm findOwnerOfGhost.o
rm computeCandidateMate.o
rm parallelComputeCandidateMateB.o
rm processMatchedVertices.o
rm processCrossEdge.o
rm queueTransfer.o
rm processMessages.o
rm processExposedVertex.o
rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o
rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o
cd ../../../
make all
cd samples/advanced/pdegen
make amg_d_pde3d
cd runs
mpirun -np 4 amg_d_pde3d amg_pde3d.inp

@ -93,6 +93,9 @@ contains
& a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv) & a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod use psb_base_mod
use psb_util_mod use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
! !
! Discretizes the partial differential equation ! Discretizes the partial differential equation
! !
@ -128,7 +131,6 @@ contains
type(psb_d_csc_sparse_mat) :: acsc type(psb_d_csc_sparse_mat) :: acsc
type(psb_d_coo_sparse_mat) :: acoo type(psb_d_coo_sparse_mat) :: acoo
type(psb_d_csr_sparse_mat) :: acsr type(psb_d_csr_sparse_mat) :: acsr
real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -141,8 +143,7 @@ contains
! Process grid ! Process grid
integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) integer(psb_lpk_), allocatable :: myidx(:)
real(psb_dpk_), allocatable :: val(:)
! deltah dimension of each grid cell ! deltah dimension of each grid cell
! deltat discretization time ! deltat discretization time
real(psb_dpk_) :: deltah, sqdeltah, deltah2 real(psb_dpk_) :: deltah, sqdeltah, deltah2
@ -368,119 +369,128 @@ contains
call psb_barrier(ctxt) call psb_barrier(ctxt)
talc = psb_wtime()-t0 talc = psb_wtime()-t0
if (info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='allocation rout.'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
! we build an auxiliary matrix consisting of one row at a
! time; just a small matrix. might be extended to generate
! a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! loop over rows belonging to current process in a block
! distribution.
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
do ii=1, nlr,nb !$omp parallel shared(deltah,myidx,a,desc_a)
ib = min(nb,nlr-ii+1) !
icoeff = 1 block
do k=1,ib integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
i=ii+k-1 integer(psb_lpk_) :: glob_row
! local matrix pointer integer(psb_lpk_), allocatable :: irow(:),icol(:)
glob_row=myidx(i) real(psb_dpk_), allocatable :: val(:)
! compute gridpoint coordinates real(psb_dpk_) :: x,y,z, zt(nb)
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) #if defined(OPENMP)
! x, y, z coordinates nth = omp_get_num_threads()
x = (ix-1)*deltah ith = omp_get_thread_num()
y = (iy-1)*deltah #else
z = (iz-1)*deltah nth = 1
zt(k) = f_(x,y,z) ith = 0
! internal point: build discretization #endif
! allocate(val(20*nb),irow(20*nb),&
! term depending on (x-1,y,z) &icol(20*nb),stat=info)
! if (info /= psb_success_ ) then
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 info=psb_err_alloc_dealloc_
if (ix == 1) then call psb_errpush(info,name)
zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k) !goto 9999
else endif
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row !$omp do schedule(dynamic)
icoeff = icoeff+1 !
endif do ii=1, nlr, nb
! term depending on (x,y-1,z) if (info /= psb_success_) cycle
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 ib = min(nb,nlr-ii+1)
if (iy == 1) then icoeff = 1
zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k) do k=1,ib
else i=ii+k-1
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) ! local matrix pointer
irow(icoeff) = glob_row glob_row=myidx(i)
icoeff = icoeff+1 ! compute gridpoint coordinates
endif call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! term depending on (x,y,z-1) ! x, y, z coordinates
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 x = (ix-1)*deltah
if (iz == 1) then y = (iy-1)*deltah
zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k) z = (iz-1)*deltah
else zt(k) = f_(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) ! internal point: build discretization
irow(icoeff) = glob_row !
icoeff = icoeff+1 ! term depending on (x-1,y,z)
endif !
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
! term depending on (x,y,z) if (ix == 1) then
val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k)
& + c(x,y,z) else
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row irow(icoeff) = glob_row
icoeff = icoeff+1 icoeff = icoeff+1
! term depending on (x,y,z+1) endif
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 ! term depending on (x,y-1,z)
if (iz == idim) then val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k) if (iy == 1) then
else zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k)
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) else
irow(icoeff) = glob_row call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
icoeff = icoeff+1 irow(icoeff) = glob_row
endif icoeff = icoeff+1
! term depending on (x,y+1,z) endif
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 ! term depending on (x,y,z-1)
if (iy == idim) then val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k) if (iz == 1) then
else zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k)
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) else
irow(icoeff) = glob_row call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
icoeff = icoeff+1 irow(icoeff) = glob_row
endif icoeff = icoeff+1
! term depending on (x+1,y,z) endif
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then ! term depending on (x,y,z)
zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k) val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
else & + c(x,y,z)
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row irow(icoeff) = glob_row
icoeff = icoeff+1 icoeff = icoeff+1
endif ! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
!write(0,*) ' Outer in_parallel ',omp_in_parallel()
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) !$omp end do
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) deallocate(val,irow,icol)
if(info /= psb_success_) exit end block
zt(:)=dzero !$omp end parallel
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
tgen = psb_wtime()-t1 tgen = psb_wtime()-t1
if(info /= psb_success_) then if(info /= psb_success_) then
@ -490,7 +500,6 @@ contains
goto 9999 goto 9999
end if end if
deallocate(val,irow,icol)
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
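The rewritten generation loop above moves the val/irow/icol scratch arrays inside an OpenMP block so that every thread owns its own coefficient buffers, and distributes the row blocks with a dynamic schedule before calling psb_spins/psb_geins on its block. A compact sketch of the same shape in C++/OpenMP (buildRows and insertBlock are hypothetical stand-ins for the row generation and for the library's insertion routines, which the code above relies on being callable concurrently):

    #include <algorithm>
    #include <functional>
    #include <vector>
    // Sketch only: block-wise assembly with per-thread scratch buffers.
    void assembleSketch(long nlr, long nb,
                        const std::function<long(long, long, std::vector<long> &,
                                                 std::vector<long> &, std::vector<double> &)> &buildRows,
                        const std::function<void(long, const std::vector<long> &,
                                                 const std::vector<long> &, const std::vector<double> &)> &insertBlock) {
        #pragma omp parallel
        {
            std::vector<long> irow(20 * nb), icol(20 * nb);  // per-thread index buffers
            std::vector<double> val(20 * nb);                // per-thread coefficient buffer
            #pragma omp for schedule(dynamic)
            for (long ii = 0; ii < nlr; ii += nb) {
                long ib = std::min(nb, nlr - ii);            // rows in this block
                long ncoeff = buildRows(ii, ib, irow, icol, val); // fill this block's triplets
                insertBlock(ncoeff, irow, icol, val);        // assumed concurrency-safe insertion
            }
        }
    }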
@ -557,6 +566,9 @@ contains
& a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv) & a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod use psb_base_mod
use psb_util_mod use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
! !
! Discretizes the partial differential equation ! Discretizes the partial differential equation
! !
@ -591,7 +603,6 @@ contains
type(psb_d_csc_sparse_mat) :: acsc type(psb_d_csc_sparse_mat) :: acsc
type(psb_d_coo_sparse_mat) :: acoo type(psb_d_coo_sparse_mat) :: acoo
type(psb_d_csr_sparse_mat) :: acsr type(psb_d_csr_sparse_mat) :: acsr
real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -604,8 +615,7 @@ contains
! Process grid ! Process grid
integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) integer(psb_lpk_), allocatable :: myidx(:)
real(psb_dpk_), allocatable :: val(:)
! deltah dimension of each grid cell ! deltah dimension of each grid cell
! deltat discretization time ! deltat discretization time
real(psb_dpk_) :: deltah, sqdeltah, deltah2, dd real(psb_dpk_) :: deltah, sqdeltah, deltah2, dd
@ -816,93 +826,109 @@ contains
goto 9999 goto 9999
end if end if
! we build an auxiliary matrix consisting of one row at a
! time; just a small matrix. might be extended to generate
! a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! loop over rows belonging to current process in a block
! distribution.
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
do ii=1, nlr,nb !$omp parallel shared(deltah,myidx,a,desc_a)
ib = min(nb,nlr-ii+1) !
icoeff = 1 block
do k=1,ib integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
i=ii+k-1 integer(psb_lpk_) :: glob_row
! local matrix pointer integer(psb_lpk_), allocatable :: irow(:),icol(:)
glob_row=myidx(i) real(psb_dpk_), allocatable :: val(:)
! compute gridpoint coordinates real(psb_dpk_) :: x,y,z, zt(nb)
call idx2ijk(ix,iy,glob_row,idim,idim) #if defined(OPENMP)
! x, y coordinates nth = omp_get_num_threads()
x = (ix-1)*deltah ith = omp_get_thread_num()
y = (iy-1)*deltah #else
nth = 1
zt(k) = f_(x,y) ith = 0
! internal point: build discretization #endif
! allocate(val(20*nb),irow(20*nb),&
! term depending on (x-1,y) &icol(20*nb),stat=info)
! if (info /= psb_success_ ) then
val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 info=psb_err_alloc_dealloc_
if (ix == 1) then call psb_errpush(info,name)
zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k) !goto 9999
else endif
call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
irow(icoeff) = glob_row ! loop over rows belonging to current process in a block
icoeff = icoeff+1 ! distribution.
endif !$omp do schedule(dynamic)
! term depending on (x,y-1) !
val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 do ii=1, nlr,nb
if (iy == 1) then ib = min(nb,nlr-ii+1)
zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k) icoeff = 1
else do k=1,ib
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) i=ii+k-1
irow(icoeff) = glob_row ! local matrix pointer
icoeff = icoeff+1 glob_row=myidx(i)
endif ! compute gridpoint coordinates
call idx2ijk(ix,iy,glob_row,idim,idim)
! term depending on (x,y) ! x, y coordinates
val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) x = (ix-1)*deltah
call ijk2idx(icol(icoeff),ix,iy,idim,idim) y = (iy-1)*deltah
irow(icoeff) = glob_row
icoeff = icoeff+1 zt(k) = f_(x,y)
! term depending on (x,y+1) ! internal point: build discretization
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 !
if (iy == idim) then ! term depending on (x-1,y)
zt(k) = g(x,done)*(-val(icoeff)) + zt(k) !
else val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) if (ix == 1) then
irow(icoeff) = glob_row zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k)
icoeff = icoeff+1 else
endif call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
! term depending on (x+1,y) irow(icoeff) = glob_row
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 icoeff = icoeff+1
if (ix==idim) then endif
zt(k) = g(done,y)*(-val(icoeff)) + zt(k) ! term depending on (x,y-1)
else val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) if (iy == 1) then
zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y)
val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y)
call ijk2idx(icol(icoeff),ix,iy,idim,idim)
irow(icoeff) = glob_row irow(icoeff) = glob_row
icoeff = icoeff+1 icoeff = icoeff+1
endif ! term depending on (x,y+1)
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2
if (iy == idim) then
zt(k) = g(x,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y)
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2
if (ix==idim) then
zt(k) = g(done,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) !$omp end do
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) deallocate(val,irow,icol)
if(info /= psb_success_) exit end block
zt(:)=dzero !$omp end parallel
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
tgen = psb_wtime()-t1 tgen = psb_wtime()-t1
if(info /= psb_success_) then if(info /= psb_success_) then
@ -912,8 +938,6 @@ contains
goto 9999 goto 9999
end if end if
deallocate(val,irow,icol)
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
call psb_cdasb(desc_a,info) call psb_cdasb(desc_a,info)

@ -73,6 +73,9 @@ program amg_d_pde2d
use amg_d_pde2d_exp_mod use amg_d_pde2d_exp_mod
use amg_d_pde2d_box_mod use amg_d_pde2d_box_mod
use amg_d_genpde_mod use amg_d_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! input parameters ! input parameters
@ -93,7 +96,7 @@ program amg_d_pde2d
type(psb_d_vect_type) :: x,b,r type(psb_d_vect_type) :: x,b,r
! parallel environment ! parallel environment
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np integer(psb_ipk_) :: iam, np, nth
! solver parameters ! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -197,6 +200,15 @@ program amg_d_pde2d
call psb_init(ctxt) call psb_init(ctxt)
call psb_info(ctxt,iam,np) call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then if (iam < 0) then
! This should not happen, but just in case ! This should not happen, but just in case
@ -451,12 +463,14 @@ program amg_d_pde2d
call psb_sum(ctxt,precsize) call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit) call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr) write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
write(psb_out_unit,'("Iterations to convergence : ",i12)') iter write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err
write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs() write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs()
write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier

@ -74,6 +74,9 @@ program amg_d_pde3d
use amg_d_pde3d_exp_mod use amg_d_pde3d_exp_mod
use amg_d_pde3d_gauss_mod use amg_d_pde3d_gauss_mod
use amg_d_genpde_mod use amg_d_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! input parameters ! input parameters
@ -94,7 +97,7 @@ program amg_d_pde3d
type(psb_d_vect_type) :: x,b,r type(psb_d_vect_type) :: x,b,r
! parallel environment ! parallel environment
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np integer(psb_ipk_) :: iam, np, nth
! solver parameters ! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -192,12 +195,21 @@ program amg_d_pde3d
! other variables ! other variables
integer(psb_ipk_) :: info, i, k integer(psb_ipk_) :: info, i, k
character(len=20) :: name,ch_err character(len=20) :: name,ch_err
type(psb_d_csr_sparse_mat) :: amold
info=psb_success_ info=psb_success_
call psb_init(ctxt) call psb_init(ctxt)
call psb_info(ctxt,iam,np) call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then if (iam < 0) then
! This should not happen, but just in case ! This should not happen, but just in case
@ -390,7 +402,7 @@ program amg_d_pde3d
end if end if
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
call prec%smoothers_build(a,desc_a,info) call prec%smoothers_build(a,desc_a,info,amold=amold)
tprec = psb_wtime()-t1 tprec = psb_wtime()-t1
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld') call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld')
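In the hunk above, smoothers_build now receives amold=amold, where amold is the psb_d_csr_sparse_mat variable declared earlier in this patch. The argument acts as a mold: the matrices assembled inside the smoothers take the storage format of the actual argument, CSR in this case. A minimal hedged sketch of the pattern (the surrounding build sequence is the usual one, variable names are illustrative):

      type(psb_d_csr_sparse_mat) :: amold   ! mold variable: its dynamic type selects the format

      call prec%hierarchy_build(a, desc_a, info)
      call prec%smoothers_build(a, desc_a, info, amold=amold)   ! smoother matrices stored as CSR
      ! Passing a mold of a different extension of the base sparse-matrix class
      ! (e.g. a COO type) would select that format instead.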
@ -455,7 +467,9 @@ program amg_d_pde3d
call psb_sum(ctxt,precsize) call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit) call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
@ -478,7 +492,7 @@ program amg_d_pde3d
write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt() write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt()
end if end if
call psb_print_timers(ctxt)
! !
! cleanup storage and exit ! cleanup storage and exit
! !
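The added call psb_print_timers(ctxt) prints, per process, the timers accumulated through the PSBLAS timer utilities. A hedged sketch of the usage pattern follows; the routine names are as I recall them from psb_base_mod and should be checked against the PSBLAS release in use:

      integer(psb_ipk_), save :: idx_gen = -1

      if (idx_gen == -1) idx_gen = psb_get_timer_idx("matrix generation")
      call psb_tic(idx_gen)
      ! ... timed section ...
      call psb_toc(idx_gen)
      ! at the end of the run, one call prints every registered timer
      call psb_print_timers(ctxt)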

@ -93,6 +93,9 @@ contains
& a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv) & a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod use psb_base_mod
use psb_util_mod use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
! !
! Discretizes the partial differential equation ! Discretizes the partial differential equation
! !
@ -128,7 +131,6 @@ contains
type(psb_s_csc_sparse_mat) :: acsc type(psb_s_csc_sparse_mat) :: acsc
type(psb_s_coo_sparse_mat) :: acoo type(psb_s_coo_sparse_mat) :: acoo
type(psb_s_csr_sparse_mat) :: acsr type(psb_s_csr_sparse_mat) :: acsr
real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -141,8 +143,7 @@ contains
! Process grid ! Process grid
integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) integer(psb_lpk_), allocatable :: myidx(:)
real(psb_spk_), allocatable :: val(:)
! deltah dimension of each grid cell ! deltah dimension of each grid cell
! deltat discretization time ! deltat discretization time
real(psb_spk_) :: deltah, sqdeltah, deltah2 real(psb_spk_) :: deltah, sqdeltah, deltah2
@ -368,119 +369,128 @@ contains
call psb_barrier(ctxt) call psb_barrier(ctxt)
talc = psb_wtime()-t0 talc = psb_wtime()-t0
if (info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='allocation rout.'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
! we build an auxiliary matrix consisting of one row at a
! time; just a small matrix. might be extended to generate
! a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! loop over rows belonging to current process in a block
! distribution.
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
do ii=1, nlr,nb !$omp parallel shared(deltah,myidx,a,desc_a)
ib = min(nb,nlr-ii+1) !
icoeff = 1 block
do k=1,ib integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
i=ii+k-1 integer(psb_lpk_) :: glob_row
! local matrix pointer integer(psb_lpk_), allocatable :: irow(:),icol(:)
glob_row=myidx(i) real(psb_spk_), allocatable :: val(:)
! compute gridpoint coordinates real(psb_spk_) :: x,y,z, zt(nb)
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim) #if defined(OPENMP)
! x, y, z coordinates nth = omp_get_num_threads()
x = (ix-1)*deltah ith = omp_get_thread_num()
y = (iy-1)*deltah #else
z = (iz-1)*deltah nth = 1
zt(k) = f_(x,y,z) ith = 0
! internal point: build discretization #endif
! allocate(val(20*nb),irow(20*nb),&
! term depending on (x-1,y,z) &icol(20*nb),stat=info)
! if (info /= psb_success_ ) then
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2 info=psb_err_alloc_dealloc_
if (ix == 1) then call psb_errpush(info,name)
zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k) !goto 9999
else endif
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row !$omp do schedule(dynamic)
icoeff = icoeff+1 !
endif do ii=1, nlr, nb
! term depending on (x,y-1,z) if (info /= psb_success_) cycle
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2 ib = min(nb,nlr-ii+1)
if (iy == 1) then icoeff = 1
zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k) do k=1,ib
else i=ii+k-1
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim) ! local matrix pointer
irow(icoeff) = glob_row glob_row=myidx(i)
icoeff = icoeff+1 ! compute gridpoint coordinates
endif call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! term depending on (x,y,z-1) ! x, y, z coordinates
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2 x = (ix-1)*deltah
if (iz == 1) then y = (iy-1)*deltah
zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k) z = (iz-1)*deltah
else zt(k) = f_(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim) ! internal point: build discretization
irow(icoeff) = glob_row !
icoeff = icoeff+1 ! term depending on (x-1,y,z)
endif !
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
! term depending on (x,y,z) if (ix == 1) then
val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah & zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k)
& + c(x,y,z) else
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim) call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row irow(icoeff) = glob_row
icoeff = icoeff+1 icoeff = icoeff+1
! term depending on (x,y,z+1) endif
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2 ! term depending on (x,y-1,z)
if (iz == idim) then val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k) if (iy == 1) then
else zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k)
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim) else
irow(icoeff) = glob_row call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
icoeff = icoeff+1 irow(icoeff) = glob_row
endif icoeff = icoeff+1
! term depending on (x,y+1,z) endif
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2 ! term depending on (x,y,z-1)
if (iy == idim) then val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k) if (iz == 1) then
else zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k)
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim) else
irow(icoeff) = glob_row call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
icoeff = icoeff+1 irow(icoeff) = glob_row
endif icoeff = icoeff+1
! term depending on (x+1,y,z) endif
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then ! term depending on (x,y,z)
zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k) val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
else & + c(x,y,z)
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim) call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row irow(icoeff) = glob_row
icoeff = icoeff+1 icoeff = icoeff+1
endif ! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
!write(0,*) ' Outer in_parallel ',omp_in_parallel()
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=szero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) !$omp end do
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) deallocate(val,irow,icol)
if(info /= psb_success_) exit end block
zt(:)=szero !$omp end parallel
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
tgen = psb_wtime()-t1 tgen = psb_wtime()-t1
if(info /= psb_success_) then if(info /= psb_success_) then
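The hunk above wraps the serial block-by-block assembly loop of the 3D generator in an OpenMP parallel region: a Fortran block construct gives each thread its own irow/icol/val work arrays and zt buffer, the loop over row blocks is shared out with !$omp do schedule(dynamic), rows are inserted into the shared matrix with psb_spins (called concurrently by the threads, so thread safety of the insertion path is assumed by the patch), and errors are recorded in the shared info instead of branching out of the construct. The stripped-down sketch below shows only the structure; the stencil computation and the right-hand-side insertion are omitted and the comments are mine, not part of the patch:

      !$omp parallel shared(a, desc_a, myidx, nlr, info)
      block
        integer(psb_ipk_)              :: ii, ib, k, i, icoeff
        integer(psb_lpk_), allocatable :: irow(:), icol(:)
        real(psb_spk_), allocatable    :: val(:)

        ! per-thread work arrays: every thread executing the block gets its own copies
        allocate(val(20*nb), irow(20*nb), icol(20*nb), stat=info)
        ! no "goto 9999" here: branching out of an OpenMP construct is not allowed,
        ! so an allocation failure is left in the shared info and the loop body skips

        !$omp do schedule(dynamic)
        do ii = 1, nlr, nb
          if (info /= psb_success_) cycle        ! a previous block failed: do nothing more
          ib     = min(nb, nlr-ii+1)
          icoeff = 1
          do k = 1, ib
            i = ii + k - 1
            ! ... fill val/irow/icol for global row myidx(i) (stencil code omitted) ...
          end do
          call psb_spins(icoeff-1, irow, icol, val, a, desc_a, info)
          if (info /= psb_success_) cycle
        end do
        !$omp end do

        deallocate(val, irow, icol)
      end block
      !$omp end parallel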
@ -490,7 +500,6 @@ contains
goto 9999 goto 9999
end if end if
deallocate(val,irow,icol)
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
@ -557,6 +566,9 @@ contains
& a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv) & a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod use psb_base_mod
use psb_util_mod use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
! !
! Discretizes the partial differential equation ! Discretizes the partial differential equation
! !
@ -591,7 +603,6 @@ contains
type(psb_s_csc_sparse_mat) :: acsc type(psb_s_csc_sparse_mat) :: acsc
type(psb_s_coo_sparse_mat) :: acoo type(psb_s_coo_sparse_mat) :: acoo
type(psb_s_csr_sparse_mat) :: acsr type(psb_s_csr_sparse_mat) :: acsr
real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_ integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -604,8 +615,7 @@ contains
! Process grid ! Process grid
integer(psb_ipk_) :: np, iam integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:) integer(psb_lpk_), allocatable :: myidx(:)
real(psb_spk_), allocatable :: val(:)
! deltah dimension of each grid cell ! deltah dimension of each grid cell
! deltat discretization time ! deltat discretization time
real(psb_spk_) :: deltah, sqdeltah, deltah2, dd real(psb_spk_) :: deltah, sqdeltah, deltah2, dd
@ -816,93 +826,109 @@ contains
goto 9999 goto 9999
end if end if
! we build an auxiliary matrix consisting of one row at a
! time; just a small matrix. might be extended to generate
! a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! loop over rows belonging to current process in a block
! distribution.
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
do ii=1, nlr,nb !$omp parallel shared(deltah,myidx,a,desc_a)
ib = min(nb,nlr-ii+1) !
icoeff = 1 block
do k=1,ib integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
i=ii+k-1 integer(psb_lpk_) :: glob_row
! local matrix pointer integer(psb_lpk_), allocatable :: irow(:),icol(:)
glob_row=myidx(i) real(psb_spk_), allocatable :: val(:)
! compute gridpoint coordinates real(psb_spk_) :: x,y,z, zt(nb)
call idx2ijk(ix,iy,glob_row,idim,idim) #if defined(OPENMP)
! x, y coordinates nth = omp_get_num_threads()
x = (ix-1)*deltah ith = omp_get_thread_num()
y = (iy-1)*deltah #else
nth = 1
zt(k) = f_(x,y) ith = 0
! internal point: build discretization #endif
! allocate(val(20*nb),irow(20*nb),&
! term depending on (x-1,y) &icol(20*nb),stat=info)
! if (info /= psb_success_ ) then
val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2 info=psb_err_alloc_dealloc_
if (ix == 1) then call psb_errpush(info,name)
zt(k) = g(szero,y)*(-val(icoeff)) + zt(k) !goto 9999
else endif
call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
irow(icoeff) = glob_row ! loop over rows belonging to current process in a block
icoeff = icoeff+1 ! distribution.
endif !$omp do schedule(dynamic)
! term depending on (x,y-1) !
val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2 do ii=1, nlr,nb
if (iy == 1) then ib = min(nb,nlr-ii+1)
zt(k) = g(x,szero)*(-val(icoeff)) + zt(k) icoeff = 1
else do k=1,ib
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim) i=ii+k-1
irow(icoeff) = glob_row ! local matrix pointer
icoeff = icoeff+1 glob_row=myidx(i)
endif ! compute gridpoint coordinates
call idx2ijk(ix,iy,glob_row,idim,idim)
! term depending on (x,y) ! x, y coordinates
val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y) x = (ix-1)*deltah
call ijk2idx(icol(icoeff),ix,iy,idim,idim) y = (iy-1)*deltah
irow(icoeff) = glob_row
icoeff = icoeff+1 zt(k) = f_(x,y)
! term depending on (x,y+1) ! internal point: build discretization
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2 !
if (iy == idim) then ! term depending on (x-1,y)
zt(k) = g(x,sone)*(-val(icoeff)) + zt(k) !
else val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim) if (ix == 1) then
irow(icoeff) = glob_row zt(k) = g(szero,y)*(-val(icoeff)) + zt(k)
icoeff = icoeff+1 else
endif call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
! term depending on (x+1,y) irow(icoeff) = glob_row
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2 icoeff = icoeff+1
if (ix==idim) then endif
zt(k) = g(sone,y)*(-val(icoeff)) + zt(k) ! term depending on (x,y-1)
else val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim) if (iy == 1) then
zt(k) = g(x,szero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y)
val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y)
call ijk2idx(icol(icoeff),ix,iy,idim,idim)
irow(icoeff) = glob_row irow(icoeff) = glob_row
icoeff = icoeff+1 icoeff = icoeff+1
endif ! term depending on (x,y+1)
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2
if (iy == idim) then
zt(k) = g(x,sone)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y)
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2
if (ix==idim) then
zt(k) = g(sone,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=szero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info) !$omp end do
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info) deallocate(val,irow,icol)
if(info /= psb_success_) exit end block
zt(:)=szero !$omp end parallel
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
tgen = psb_wtime()-t1 tgen = psb_wtime()-t1
if(info /= psb_success_) then if(info /= psb_success_) then
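The 2D generator above gets the same treatment as the 3D one. The error handling is the part worth noting separately: the serial loop used exit to abandon assembly on failure, but an !$omp do must run all of its iterations, so the parallel version records the failure in info and lets the remaining blocks fall through with cycle. A minimal illustration of the pattern (insert_block is a placeholder standing in for the psb_spins/psb_geins calls, not a routine from the patch):

      !$omp do schedule(dynamic)
      do ii = 1, nlr, nb
        if (info /= psb_success_) cycle   ! an earlier block failed: skip the rest
        call insert_block(ii, info)       ! placeholder for the actual insertion calls
      end do
      !$omp end do
      ! after the parallel region, info is tested once and the usual error path
      ! (psb_errpush / goto 9999) is taken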
@ -912,8 +938,6 @@ contains
goto 9999 goto 9999
end if end if
deallocate(val,irow,icol)
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
call psb_cdasb(desc_a,info) call psb_cdasb(desc_a,info)

@ -73,6 +73,9 @@ program amg_s_pde2d
use amg_s_pde2d_exp_mod use amg_s_pde2d_exp_mod
use amg_s_pde2d_box_mod use amg_s_pde2d_box_mod
use amg_s_genpde_mod use amg_s_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! input parameters ! input parameters
@ -93,7 +96,7 @@ program amg_s_pde2d
type(psb_s_vect_type) :: x,b,r type(psb_s_vect_type) :: x,b,r
! parallel environment ! parallel environment
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np integer(psb_ipk_) :: iam, np, nth
! solver parameters ! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -197,6 +200,15 @@ program amg_s_pde2d
call psb_init(ctxt) call psb_init(ctxt)
call psb_info(ctxt,iam,np) call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then if (iam < 0) then
! This should not happen, but just in case ! This should not happen, but just in case
@ -451,12 +463,14 @@ program amg_s_pde2d
call psb_sum(ctxt,precsize) call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit) call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr) write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
write(psb_out_unit,'("Iterations to convergence : ",i12)') iter write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err
write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs() write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs()
write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier

@ -74,6 +74,9 @@ program amg_s_pde3d
use amg_s_pde3d_exp_mod use amg_s_pde3d_exp_mod
use amg_s_pde3d_gauss_mod use amg_s_pde3d_gauss_mod
use amg_s_genpde_mod use amg_s_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none implicit none
! input parameters ! input parameters
@ -94,7 +97,7 @@ program amg_s_pde3d
type(psb_s_vect_type) :: x,b,r type(psb_s_vect_type) :: x,b,r
! parallel environment ! parallel environment
type(psb_ctxt_type) :: ctxt type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np integer(psb_ipk_) :: iam, np, nth
! solver parameters ! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -192,12 +195,21 @@ program amg_s_pde3d
! other variables ! other variables
integer(psb_ipk_) :: info, i, k integer(psb_ipk_) :: info, i, k
character(len=20) :: name,ch_err character(len=20) :: name,ch_err
type(psb_s_csr_sparse_mat) :: amold
info=psb_success_ info=psb_success_
call psb_init(ctxt) call psb_init(ctxt)
call psb_info(ctxt,iam,np) call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then if (iam < 0) then
! This should not happen, but just in case ! This should not happen, but just in case
@ -390,7 +402,7 @@ program amg_s_pde3d
end if end if
call psb_barrier(ctxt) call psb_barrier(ctxt)
t1 = psb_wtime() t1 = psb_wtime()
call prec%smoothers_build(a,desc_a,info) call prec%smoothers_build(a,desc_a,info,amold=amold)
tprec = psb_wtime()-t1 tprec = psb_wtime()-t1
if (info /= psb_success_) then if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld') call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld')
@ -455,7 +467,9 @@ program amg_s_pde3d
call psb_sum(ctxt,precsize) call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit) call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff) write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd) write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
@ -478,7 +492,7 @@ program amg_s_pde3d
write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt() write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt()
end if end if
call psb_print_timers(ctxt)
! !
! cleanup storage and exit ! cleanup storage and exit
! !

@ -1,6 +1,6 @@
%%%%%%%%%%% General arguments % Lines starting with % are ignored. %%%%%%%%%%% General arguments % Lines starting with % are ignored.
CSR ! Storage format CSR COO JAD CSR ! Storage format CSR COO JAD
0080 ! IDIM; domain size. Linear system size is IDIM**3 0200 ! IDIM; domain size. Linear system size is IDIM**3
CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE
BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES
2 ! ISTOPC 2 ! ISTOPC
@ -9,7 +9,7 @@ BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS F
30 ! IRST (restart for RGMRES and BiCGSTABL) 30 ! IRST (restart for RGMRES and BiCGSTABL)
1.d-6 ! EPS 1.d-6 ! EPS
%%%%%%%%%%% Main preconditioner choices %%%%%%%%%%%%%%%% %%%%%%%%%%% Main preconditioner choices %%%%%%%%%%%%%%%%
ML-VCYCLE-BJAC-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars) ML-VBM-VCYCLE-FBGS-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars)
ML ! Preconditioner type: NONE JACOBI GS FBGS BJAC AS ML ML ! Preconditioner type: NONE JACOBI GS FBGS BJAC AS ML
%%%%%%%%%%% First smoother (for all levels but coarsest) %%%%%%%%%%%%%%%% %%%%%%%%%%% First smoother (for all levels but coarsest) %%%%%%%%%%%%%%%%
FBGS ! Smoother type JACOBI FBGS GS BWGS BJAC AS. For 1-level, repeats previous. FBGS ! Smoother type JACOBI FBGS GS BWGS BJAC AS. For 1-level, repeats previous.
@ -39,8 +39,8 @@ VCYCLE ! Type of multilevel CYCLE: VCYCLE WCYCLE KCYCLE MUL
-3 ! Max Number of levels in a multilevel preconditioner; if <0, lib default -3 ! Max Number of levels in a multilevel preconditioner; if <0, lib default
-3 ! Target coarse matrix size per process; if <0, lib default -3 ! Target coarse matrix size per process; if <0, lib default
SMOOTHED ! Type of aggregation: SMOOTHED UNSMOOTHED SMOOTHED ! Type of aggregation: SMOOTHED UNSMOOTHED
COUPLED ! Parallel aggregation: DEC, SYMDEC, COUPLED DEC ! Parallel aggregation: DEC, SYMDEC, COUPLED
MATCHBOXP ! aggregation measure SOC1, MATCHBOXP SOC1 ! aggregation measure SOC1, MATCHBOXP
8 ! Requested size of the aggregates for MATCHBOXP 8 ! Requested size of the aggregates for MATCHBOXP
NATURAL ! Ordering of aggregation NATURAL DEGREE NATURAL ! Ordering of aggregation NATURAL DEGREE
-1.5 ! Coarsening ratio, if < 0 use library default -1.5 ! Coarsening ratio, if < 0 use library default
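The test input above switches the default run from coupled, matching-based aggregation (COUPLED / MATCHBOXP) back to decoupled aggregation with the classical strength-of-connection measure (DEC / SOC1), and enlarges the test problem from IDIM=80 to IDIM=200. For reference, a hedged sketch of how the same choices are typically set on the preconditioner object in user code; the parameter names are those documented in the AMG4PSBLAS user guide and should be verified against the release in use:

      call prec%init(ctxt, 'ML', info)
      call prec%set('PAR_AGGR_ALG', 'DEC',      info)   ! decoupled aggregation
      call prec%set('AGGR_TYPE',    'SOC1',     info)   ! strength-of-connection measure
      call prec%set('AGGR_PROL',    'SMOOTHED', info)   ! smoothed aggregation prolongator
      call prec%set('ML_CYCLE',     'VCYCLE',   info)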
