Merge branch 'dev-openmp' into development

sfilippone 1 year ago
commit 0bcc9d7b55

@ -1,4 +1,3 @@
AMG4PSBLAS
Algebraic Multigrid Package based on PSBLAS (Parallel Sparse BLAS version 3.8)

@ -143,9 +143,10 @@ contains
type(psb_ld_coo_sparse_mat) :: tmpcoo
logical :: display_out_, print_out_, reproducible_
logical, parameter :: dump=.false., debug=.false., dump_mate=.false., &
& debug_ilaggr=.false., debug_sync=.false.
& debug_ilaggr=.false., debug_sync=.false., debug_mate=.false.
integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1
logical, parameter :: do_timings=.true.
integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2
ictxt = desc_a%get_ctxt()
call psb_info(ictxt,iam,np)
@ -187,7 +188,7 @@ contains
call desc_a%l2gip(ilv,info,owned=.false.)
call psb_geall(ilaggr,desc_a,info)
ilaggr = -1
ilaggr = ilaggr_neginit
call psb_geasb(ilaggr,desc_a,info)
nr = a%get_nrows()
nc = a%get_ncols()
@ -213,6 +214,19 @@ contains
call psb_barrier(ictxt)
if (iam == 0) write(0,*)' out from buildmatching:', info
end if
if (debug_mate) then
block
integer(psb_lpk_), allocatable :: ckmate(:)
allocate(ckmate(nr))
ckmate(1:nr) = mate(1:nr)
call psb_msort(ckmate(1:nr))
do i=1,nr-1
if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then
write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i)
end if
end do
end block
end if
if (info == 0) then
if (do_timings) call psb_tic(idx_phase2)
@ -259,7 +273,7 @@ contains
cycle
else
if (ilaggr(k) == -1) then
if (ilaggr(k) == ilaggr_neginit) then
wk = w(k)
widx = w(idx)
@ -267,7 +281,7 @@ contains
nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2)
if (nrmagg > epsilon(nrmagg)) then
if (idx <= nr) then
if (ilaggr(idx) == -1) then
if (ilaggr(idx) == ilaggr_neginit) then
! Now, if both vertices are local, the aggregate is local
! (kinda obvious).
nlaggr(iam) = nlaggr(iam) + 1
@ -275,6 +289,9 @@ contains
ilaggr(idx) = nlaggr(iam)
wtemp(k) = w(k)/nrmagg
wtemp(idx) = w(idx)/nrmagg
else
write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,&
&mate(idx),ilaggr(idx)
end if
nlpairs = nlpairs+1
else if (idx <= nc) then
@ -294,7 +311,7 @@ contains
ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1
else
ilaggr(k) = -2
ilaggr(k) = ilaggr_nonlocal
end if
else
! Use a statistically unbiased tie-breaking rule,
@ -309,7 +326,7 @@ contains
ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1
else
ilaggr(k) = -2
ilaggr(k) = ilaggr_nonlocal
end if
end if
end if
@ -325,6 +342,12 @@ contains
nlsingl = nlsingl + 1
end if
end if
if (ilaggr(k) == ilaggr_neginit) then
write(0,*) iam,' Error: no update to ',k,mate(k),&
& abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k)
end if
else
if (ilaggr(k)<0) write(0,*) 'Strange? ',k,ilaggr(k)
end if
end if
end do
@ -332,7 +355,7 @@ contains
if (do_timings) call psb_tic(idx_phase3)
! Ok, now compute offsets, gather halo and fix non-local
! aggregates (those where ilaggr == -2)
! aggregates (those where ilaggr == ilaggr_nonlocal)
call psb_sum(ictxt,nlaggr)
ntaggr = sum(nlaggr(0:np-1))
naggrm1 = sum(nlaggr(0:iam-1))
@ -347,7 +370,7 @@ contains
call psb_halo(wtemp,desc_a,info)
! Cleanup as yet unmarked entries
do k=1,nr
if (ilaggr(k) == -2) then
if (ilaggr(k) == ilaggr_nonlocal) then
idx = mate(k)
if (idx > nr) then
i = ilaggr(idx)
@ -359,9 +382,14 @@ contains
else
write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx)
end if
end if
if (ilaggr(k) <0) then
write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k)
else if (ilaggr(k) <0) then
write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k)
write(0,*) iam,' : : ',nr,nc,mate(k)
if (mate(k) <= nr) then
write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),&
& ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k)))
end if
flush(0)
end if
end do
if (debug_sync) then
@ -414,7 +442,7 @@ contains
end block
if (iam == 0) then
write(0,*) 'Matching statistics: Unmatched nodes ',&
write(0,*) iam,'Matching statistics: Unmatched nodes ',&
& nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs
end if

@ -143,9 +143,10 @@ contains
type(psb_ls_coo_sparse_mat) :: tmpcoo
logical :: display_out_, print_out_, reproducible_
logical, parameter :: dump=.false., debug=.false., dump_mate=.false., &
& debug_ilaggr=.false., debug_sync=.false.
& debug_ilaggr=.false., debug_sync=.false., debug_mate=.false.
integer(psb_ipk_), save :: idx_bldmtc=-1, idx_phase1=-1, idx_phase2=-1, idx_phase3=-1
logical, parameter :: do_timings=.true.
integer, parameter :: ilaggr_neginit=-1, ilaggr_nonlocal=-2
ictxt = desc_a%get_ctxt()
call psb_info(ictxt,iam,np)
@ -187,7 +188,7 @@ contains
call desc_a%l2gip(ilv,info,owned=.false.)
call psb_geall(ilaggr,desc_a,info)
ilaggr = -1
ilaggr = ilaggr_neginit
call psb_geasb(ilaggr,desc_a,info)
nr = a%get_nrows()
nc = a%get_ncols()
@ -213,6 +214,19 @@ contains
call psb_barrier(ictxt)
if (iam == 0) write(0,*)' out from buildmatching:', info
end if
if (debug_mate) then
block
integer(psb_lpk_), allocatable :: ckmate(:)
allocate(ckmate(nr))
ckmate(1:nr) = mate(1:nr)
call psb_msort(ckmate(1:nr))
do i=1,nr-1
if ((ckmate(i)>0) .and. (ckmate(i) == ckmate(i+1))) then
write(0,*) iam,' Duplicate mate entry at',i,' :',ckmate(i)
end if
end do
end block
end if
if (info == 0) then
if (do_timings) call psb_tic(idx_phase2)
@ -259,7 +273,7 @@ contains
cycle
else
if (ilaggr(k) == -1) then
if (ilaggr(k) == ilaggr_neginit) then
wk = w(k)
widx = w(idx)
@ -267,7 +281,7 @@ contains
nrmagg = wmax*sqrt((wk/wmax)**2+(widx/wmax)**2)
if (nrmagg > epsilon(nrmagg)) then
if (idx <= nr) then
if (ilaggr(idx) == -1) then
if (ilaggr(idx) == ilaggr_neginit) then
! Now, if both vertices are local, the aggregate is local
! (kinda obvious).
nlaggr(iam) = nlaggr(iam) + 1
@ -275,6 +289,9 @@ contains
ilaggr(idx) = nlaggr(iam)
wtemp(k) = w(k)/nrmagg
wtemp(idx) = w(idx)/nrmagg
else
write(0,*) iam,' Inconsistent mate? ',k,mate(k),idx,&
&mate(idx),ilaggr(idx)
end if
nlpairs = nlpairs+1
else if (idx <= nc) then
@ -294,7 +311,7 @@ contains
ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1
else
ilaggr(k) = -2
ilaggr(k) = ilaggr_nonlocal
end if
else
! Use a statistically unbiased tie-breaking rule,
@ -309,7 +326,7 @@ contains
ilaggr(k) = nlaggr(iam)
nlpairs = nlpairs+1
else
ilaggr(k) = -2
ilaggr(k) = ilaggr_nonlocal
end if
end if
end if
@ -325,6 +342,12 @@ contains
nlsingl = nlsingl + 1
end if
end if
if (ilaggr(k) == ilaggr_neginit) then
write(0,*) iam,' Error: no update to ',k,mate(k),&
& abs(w(k)),nrmagg,epsilon(nrmagg),wtemp(k)
end if
else
if (ilaggr(k)<0) write(0,*) 'Strange? ',k,ilaggr(k)
end if
end if
end do
@ -332,7 +355,7 @@ contains
if (do_timings) call psb_tic(idx_phase3)
! Ok, now compute offsets, gather halo and fix non-local
! aggregates (those where ilaggr == -2)
! aggregates (those where ilaggr == ilaggr_nonlocal)
call psb_sum(ictxt,nlaggr)
ntaggr = sum(nlaggr(0:np-1))
naggrm1 = sum(nlaggr(0:iam-1))
@ -347,7 +370,7 @@ contains
call psb_halo(wtemp,desc_a,info)
! Cleanup as yet unmarked entries
do k=1,nr
if (ilaggr(k) == -2) then
if (ilaggr(k) == ilaggr_nonlocal) then
idx = mate(k)
if (idx > nr) then
i = ilaggr(idx)
@ -359,9 +382,14 @@ contains
else
write(0,*) 'Error : unresolved (paired) index ',k,idx,i,nr,nc, ilv(k),ilv(idx)
end if
end if
if (ilaggr(k) <0) then
write(0,*) 'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k)
else if (ilaggr(k) <0) then
write(0,*) iam,'Matchboxp: Funny number: ',k,ilv(k),ilaggr(k),wtemp(k)
write(0,*) iam,' : : ',nr,nc,mate(k)
if (mate(k) <= nr) then
write(0,*) iam,' : : ',ilaggr(mate(k)),mate(mate(k)),&
& ilv(k),ilv(mate(k)), ilv(mate(mate(k))),ilaggr(mate(mate(k)))
end if
flush(0)
end if
end do
if (debug_sync) then
@ -414,7 +442,7 @@ contains
end block
if (iam == 0) then
write(0,*) 'Matching statistics: Unmatched nodes ',&
write(0,*) iam,'Matching statistics: Unmatched nodes ',&
& nunmatched,' Singletons:',nlsingl,' Pairs:',nlpairs
end if

@ -62,7 +62,22 @@ amg_s_parmatch_smth_bld.o \
amg_s_parmatch_spmm_bld_inner.o
MPCOBJS=MatchBoxPC.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o
sendBundledMessages.o \
initialize.o \
extractUChunk.o \
isAlreadyMatched.o \
findOwnerOfGhost.o \
clean.o \
computeCandidateMate.o \
parallelComputeCandidateMateB.o \
processMatchedVertices.o \
processMatchedVerticesAndSendMessages.o \
processCrossEdge.o \
queueTransfer.o \
processMessages.o \
processExposedVertex.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o
OBJS = $(FOBJS) $(MPCOBJS)

@ -60,17 +60,43 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ) {
#if !defined(SERIAL_MPI)
MPI_Comm C_comm=MPI_Comm_f2c(icomm);
#ifdef DEBUG
fprintf(stderr,"MatchBoxPC: rank %d nlver %ld nledge %ld [ %ld %ld ]\n",
myRank,NLVer, NLEdge,verDistance[0],verDistance[1]);
#endif
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge,
#define TIME_TRACKER
#ifdef TIME_TRACKER
double tmr = MPI_Wtime();
#endif
#define OMP
#ifdef OMP
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(NLVer, NLEdge,
verLocPtr, verLocInd, edgeLocWeight,
verDistance, Mate,
myRank, numProcs, C_comm,
msgIndSent, msgActualSent, msgPercent,
ph0_time, ph1_time, ph2_time,
ph1_card, ph2_card );
#else
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge,
verLocPtr, verLocInd, edgeLocWeight,
verDistance, Mate,
myRank, numProcs, C_comm,
msgIndSent, msgActualSent, msgPercent,
ph0_time, ph1_time, ph2_time,
ph1_card, ph2_card );
#endif
#ifdef TIME_TRACKER
tmr = MPI_Wtime() - tmr;
fprintf(stderr, "Elaboration time: %f for %ld nodes\n", tmr, NLVer);
#endif
#endif
}

@ -52,145 +52,412 @@
#ifndef _matchboxpC_H_
#define _matchboxpC_H_
//Turn on a lot of debugging information with this switch:
// Turn on a lot of debugging information with this switch:
//#define PRINT_DEBUG_INFO_
#include <stdio.h>
#include <iostream>
#include <assert.h>
#include <map>
#include <vector>
// #include "matchboxp.h"
#include "omp.h"
#include "primitiveDataTypeDefinitions.h"
#include "dataStrStaticQueue.h"
using namespace std;
const int NUM_THREAD = 4;
const int UCHUNK = 10;
const MilanLongInt REQUEST = 1;
const MilanLongInt SUCCESS = 2;
const MilanLongInt FAILURE = 3;
const MilanLongInt SIZEINFO = 4;
const int ComputeTag = 7; // Predefined tag
const int BundleTag = 9; // Predefined tag
static vector<MilanLongInt> DEFAULT_VECTOR;
// MPI type map
template <typename T>
MPI_Datatype TypeMap();
template <>
inline MPI_Datatype TypeMap<int64_t>() { return MPI_LONG_LONG; }
template <>
inline MPI_Datatype TypeMap<int>() { return MPI_INT; }
template <>
inline MPI_Datatype TypeMap<double>() { return MPI_DOUBLE; }
template <>
inline MPI_Datatype TypeMap<float>() { return MPI_FLOAT; }
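// Illustrative (hypothetical) use of the map: pick the MPI datatype matching a
// buffer's element type at compile time, e.g.
//   MPI_Bcast(buf, count, TypeMap<MilanLongInt>(), 0, comm);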
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#if !defined(SERIAL_MPI)
#define MilanMpiLongInt MPI_LONG_LONG
#define MilanMpiLongInt MPI_LONG_LONG
#ifndef _primitiveDataType_Definition_
#define _primitiveDataType_Definition_
//Regular integer:
#ifndef INTEGER_H
#define INTEGER_H
typedef int32_t MilanInt;
#endif
//Regular long integer:
#ifndef LONG_INT_H
#define LONG_INT_H
#ifdef BIT64
typedef int64_t MilanLongInt;
typedef MPI_LONG MilanMpiLongInt;
#else
typedef int32_t MilanLongInt;
typedef MPI_INT MilanMpiLongInt;
#endif
#endif
//Regular boolean
#ifndef BOOL_H
#define BOOL_H
typedef bool MilanBool;
#endif
//Regular double and absolute value computation:
#ifndef REAL_H
#define REAL_H
typedef double MilanReal;
typedef MPI_DOUBLE MilanMpiReal;
inline MilanReal MilanAbs(MilanReal value)
{
return fabs(value);
}
#endif
//Regular float and absolute value computation:
#ifndef FLOAT_H
#define FLOAT_H
typedef float MilanFloat;
typedef MPI_FLOAT MilanMpiFloat;
inline MilanFloat MilanAbsFloat(MilanFloat value)
{
return fabs(value);
}
#endif
//// Define the limits:
#ifndef LIMITS_H
#define LIMITS_H
//Integer Maximum and Minimum:
// #define MilanIntMax INT_MAX
// #define MilanIntMin INT_MIN
#define MilanIntMax INT32_MAX
#define MilanIntMin INT32_MIN
#ifdef BIT64
#define MilanLongIntMax INT64_MAX
#define MilanLongIntMin -INT64_MAX
#else
#define MilanLongIntMax INT32_MAX
#define MilanLongIntMin -INT32_MAX
#endif
#endif
// Regular integer:
#ifndef INTEGER_H
#define INTEGER_H
typedef int32_t MilanInt;
#endif
// Regular long integer:
#ifndef LONG_INT_H
#define LONG_INT_H
#ifdef BIT64
typedef int64_t MilanLongInt;
typedef MPI_LONG MilanMpiLongInt;
#else
typedef int32_t MilanLongInt;
typedef MPI_INT MilanMpiLongInt;
#endif
#endif
// Regular boolean
#ifndef BOOL_H
#define BOOL_H
typedef bool MilanBool;
#endif
// Regular double and absolute value computation:
#ifndef REAL_H
#define REAL_H
typedef double MilanReal;
typedef MPI_DOUBLE MilanMpiReal;
inline MilanReal MilanAbs(MilanReal value)
{
return fabs(value);
}
#endif
// Regular float and absolute value computation:
#ifndef FLOAT_H
#define FLOAT_H
typedef float MilanFloat;
typedef MPI_FLOAT MilanMpiFloat;
inline MilanFloat MilanAbsFloat(MilanFloat value)
{
return fabs(value);
}
#endif
//// Define the limits:
#ifndef LIMITS_H
#define LIMITS_H
// Integer Maximum and Minimum:
// #define MilanIntMax INT_MAX
// #define MilanIntMin INT_MIN
#define MilanIntMax INT32_MAX
#define MilanIntMin INT32_MIN
#ifdef BIT64
#define MilanLongIntMax INT64_MAX
#define MilanLongIntMin -INT64_MAX
#else
#define MilanLongIntMax INT32_MAX
#define MilanLongIntMin -INT32_MAX
#endif
#endif
// +INFINITY
const double PLUS_INFINITY = numeric_limits<double>::infinity();
const double MINUS_INFINITY = -PLUS_INFINITY;
//#define MilanRealMax LDBL_MAX
#define MilanRealMax PLUS_INFINITY
#define MilanRealMin MINUS_INFINITY
//#define MilanRealMax LDBL_MAX
#define MilanRealMax PLUS_INFINITY
#define MilanRealMin MINUS_INFINITY
#endif
//Function to find the owner of a ghost vertex using binary search:
inline MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance,
MilanInt myRank, MilanInt numProcs);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC
(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight,
MilanLongInt* verDistance,
MilanLongInt* Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time,
MilanLongInt* ph1_card, MilanLongInt* ph2_card );
void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC
(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight,
MilanLongInt* verDistance,
MilanLongInt* Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time,
MilanLongInt* ph1_card, MilanLongInt* ph2_card );
void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanReal* edgeLocWeight,
MilanLongInt* verDistance,
MilanLongInt* Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time,
MilanLongInt* ph1_card, MilanLongInt* ph2_card );
void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* verLocPtr, MilanLongInt* verLocInd, MilanFloat* edgeLocWeight,
MilanLongInt* verDistance,
MilanLongInt* Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt* msgIndSent, MilanLongInt* msgActualSent, MilanReal* msgPercent,
MilanReal* ph0_time, MilanReal* ph1_time, MilanReal* ph2_time,
MilanLongInt* ph1_card, MilanLongInt* ph2_card );
// Function to find the owner of a ghost vertex using binary search:
MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance,
MilanInt myRank, MilanInt numProcs);
MilanLongInt firstComputeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanLongInt *verLocInd,
MilanReal *edgeLocWeight);
void queuesTransfer(vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
bool isAlreadyMatched(MilanLongInt node,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap);
MilanLongInt computeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanReal *edgeLocWeight,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap);
void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt StartIndex, MilanLongInt EndIndex,
MilanLongInt *numGhostEdgesPtr,
MilanLongInt *numGhostVerticesPtr,
MilanLongInt *S,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &Counter,
vector<MilanLongInt> &verGhostPtr,
vector<MilanLongInt> &verGhostInd,
vector<MilanLongInt> &tempCounter,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Message,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
MilanLongInt *&candidateMate,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void clean(MilanLongInt NLVer,
MilanInt myRank,
MilanLongInt MessageIndex,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus,
MilanInt BufferSize,
MilanLongInt *Buffer,
MilanLongInt msgActual,
MilanLongInt *msgActualSent,
MilanLongInt msgInd,
MilanLongInt *msgIndSent,
MilanLongInt NumMessagesBundled,
MilanReal *msgPercent);
void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanInt myRank,
MilanReal *edgeLocWeight,
MilanLongInt *candidateMate);
void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
MilanLongInt *candidateMate,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *Mate,
vector<MilanLongInt> &GMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void PROCESS_CROSS_EDGE(MilanLongInt *edge,
MilanLongInt *SPtr);
void processMatchedVertices(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void processMatchedVerticesAndSendMessages(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner,
MPI_Comm comm,
MilanLongInt *msgActual,
vector<MilanLongInt> &Message);
void sendBundledMessages(MilanLongInt *numGhostEdgesPtr,
MilanInt *BufferSizePtr,
MilanLongInt *Buffer,
vector<MilanLongInt> &PCumulative,
vector<MilanLongInt> &PMessageBundle,
vector<MilanLongInt> &PSizeInfoMessages,
MilanLongInt *PCounter,
MilanLongInt NumMessagesBundled,
MilanLongInt *msgActualPtr,
MilanLongInt *MessageIndexPtr,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus);
void processMessages(
MilanLongInt NLVer,
MilanLongInt *Mate,
MilanLongInt *candidateMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Counter,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *msgActualPtr,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *verLocPtr,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &Message,
MilanLongInt numGhostEdges,
MilanLongInt u,
MilanLongInt v,
MilanLongInt *SPtr,
vector<MilanLongInt> &U);
void extractUChunk(
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
#endif
#ifdef __cplusplus

@ -72,12 +72,6 @@
#ifdef SERIAL_MPI
#else
//MPI type map
template<typename T> MPI_Datatype TypeMap();
template<> inline MPI_Datatype TypeMap<int64_t>() { return MPI_LONG_LONG; }
template<> inline MPI_Datatype TypeMap<int>() { return MPI_INT; }
template<> inline MPI_Datatype TypeMap<double>() { return MPI_DOUBLE; }
template<> inline MPI_Datatype TypeMap<float>() { return MPI_FLOAT; }
// DOUBLE PRECISION VERSION
//WARNING: The vertex block on a given rank is contiguous

@ -0,0 +1,554 @@
#include "MatchBoxPC.h"
// ***********************************************************************
//
// MatchboxP: A C++ library for approximate weighted matching
// Mahantesh Halappanavar (hala@pnnl.gov)
// Pacific Northwest National Laboratory
//
// ***********************************************************************
//
// Copyright (2021) Battelle Memorial Institute
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// ************************************************************************
//////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// DOMINATING EDGES MODEL ///////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
/* Function : algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate()
*
* Date : New update: Feb 17, 2019, Richland, Washington.
* Date : Original development: May 17, 2009, E&CS Bldg.
*
* Purpose : Compute Approximate Maximum Weight Matching in Linear Time
*
* Args : inputMatrix - instance of Compressed-Col format of Matrix
* Mate - The Mate array
*
* Returns : By Value: (void)
* By Reference: Mate
*
* Comments : 1/2 Approx Algorithm. Picks the locally available heaviest edge.
* Assumption: The Mate Array is empty.
*/
/*
NLVer = #of vertices, NLEdge = #of edges
CSR/CSC/Compressed format: verLocPtr = Pointer, verLocInd = Index, edgeLocWeight = edge weights (positive real numbers)
verDistance = A vector of size |P|+1 containing the cumulative number of vertices per process
Mate = A vector of size |V_p| (local subgraph) to store the output (matching)
MPI: myRank, numProcs, comm,
Statistics: msgIndSent, msgActualSent, msgPercent : Size: |P| number of processes in the comm-world
Statistics: ph0_time, ph1_time, ph2_time: Runtimes
Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2)
*/
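/*
 Illustrative example (values are made up, assuming 0-based vertex numbering):
 with numProcs = 2 and 9 vertices overall, verDistance = { 0, 4, 9 } means that
 rank 0 owns vertices 0..3 and rank 1 owns vertices 4..8; each rank passes its own
 NLVer vertices and NLEdge incident edges, and gets back its local portion of Mate.
 The statistics arrays (msgIndSent, msgActualSent, msgPercent, ph1_card, ph2_card)
 have one entry per process.
*/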
//#define DEBUG_HANG_
#ifdef SERIAL_MPI
#else
// DOUBLE PRECISION VERSION
// WARNING: The vertex block on a given rank is contiguous
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent,
MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card)
{
/*
* verDistance: a vector of length numProcs + 1.
* verDistance[i] is the index of the first vertex owned by the i-th process;
* verDistance[i + 1] - 1 is the index of the last vertex owned by the i-th process.
* NLVer: number of local vertices (verLocPtr has NLVer + 1 entries)
* NLEdge: number of edges assigned to the current process
*
* verLocPtr/verLocInd/edgeLocWeight contain the portion of the matrix assigned
* to the process in Yale (CSR) notation:
* verLocInd: the column (neighbour) indices, row by row
* verLocPtr: the i-th value is the position of the first entry of the i-th row,
* the (i+1)-th value is the position of the first entry of the (i+1)-th row
*/
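/*
 * Purely illustrative sketch of the layout (values are made up, positions 0-based):
 * with NLVer = 3,
 *   verLocPtr     = { 0, 2, 5, 7 }
 *   verLocInd     = { 5, 9,  4, 5, 8,  4, 9 }
 *   edgeLocWeight = { .7, .2,  .9, .4, .1,  .3, .8 }
 * the entries of local row i occupy positions verLocPtr[i] .. verLocPtr[i+1]-1 of
 * verLocInd/edgeLocWeight, and NLEdge = verLocPtr[NLVer] = 7.
 */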
#if !defined(SERIAL_MPI)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Within algoEdgeApproxDominatingEdgesLinearSearchMessageBundling()";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") verDistance [" ;
for (int i = 0; i < numProcs; i++)
cout << verDistance[i] << "," << verDistance[i+1];
cout << "]\n";
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0) {
cout << "\n(" << myRank << ") verDistance [" ;
for (int i = 0; i < numProcs; i++)
cout << verDistance[i] << "," ;
cout << verDistance[numProcs]<< "]\n";
}
fflush(stdout);
#endif
MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank
MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank
MPI_Status computeStatus;
MilanLongInt msgActual = 0, msgInd = 0;
MilanReal heaviestEdgeWt = 0.0f; // Assumes positive weight
MilanReal startTime, finishTime;
startTime = MPI_Wtime();
// Data structures for sending and receiving messages:
vector<MilanLongInt> Message; // [ u, v, message_type ]
Message.resize(3, -1);
// Data structures for Message Bundling:
// Although up to two messages can be sent along any cross edge,
// only one message will be sent in the initialization phase -
// one of: REQUEST/FAILURE/SUCCESS
vector<MilanLongInt> QLocalVtx, QGhostVtx, QMsgType;
vector<MilanInt> QOwner; // Changed by Fabio to be an integer, addresses need to be integers!
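// Note (layout assumed from the declarations above): bundled messages are stored
// componentwise in these parallel vectors, i.e. the k-th queued message is the
// triple ( QLocalVtx[k], QGhostVtx[k], QMsgType[k] ), with QMsgType[k] one of
// REQUEST/SUCCESS/FAILURE and QOwner[k] the destination rank owning the ghost vertex.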
MilanLongInt *PCounter = new MilanLongInt[numProcs];
for (int i = 0; i < numProcs; i++)
PCounter[i] = 0;
MilanLongInt NumMessagesBundled = 0;
// TODO: this could be eliminated once the last computational section is refactored
MilanInt ghostOwner = 0; // Changed by Fabio to be an integer, addresses need to be integers!
MilanLongInt *candidateMate = nullptr;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")NV: " << NLVer << " Edges: " << NLEdge;
fflush(stdout);
cout << "\n(" << myRank << ")StartIndex: " << StartIndex << " EndIndex: " << EndIndex;
fflush(stdout);
#endif
// Other Variables:
MilanLongInt u = -1, v = -1, w = -1, i = 0;
MilanLongInt k = -1, adj1 = -1, adj2 = -1;
MilanLongInt k1 = -1, adj11 = -1, adj12 = -1;
MilanLongInt myCard = 0;
// Build the Ghost Vertex Set: Vg
map<MilanLongInt, MilanLongInt> Ghost2LocalMap; // Map each ghost vertex to a local vertex
vector<MilanLongInt> Counter; // Store the edge count for each ghost vertex
MilanLongInt numGhostVertices = 0, numGhostEdges = 0; // Number of Ghost vertices
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")About to compute Ghost Vertices...";
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ")About to compute Ghost Vertices...";
fflush(stdout);
#endif
// Define Adjacency Lists for Ghost Vertices:
// cout<<"Building Ghost data structures ... \n\n";
vector<MilanLongInt> verGhostPtr, verGhostInd, tempCounter;
// Mate array for ghost vertices:
vector<MilanLongInt> GMate; // Proportional to the number of ghost vertices
MilanLongInt S;
MilanLongInt privateMyCard = 0;
vector<MilanLongInt> PCumulative, PMessageBundle, PSizeInfoMessages;
vector<MPI_Request> SRequest; // Requests that are used for each send message
vector<MPI_Status> SStatus; // Status of sent messages, used in MPI_Wait
MilanLongInt MessageIndex = 0; // Pointer for current message
MilanInt BufferSize;
MilanLongInt *Buffer;
vector<MilanLongInt> privateQLocalVtx, privateQGhostVtx, privateQMsgType;
vector<MilanInt> privateQOwner;
vector<MilanLongInt> U, privateU;
initialize(NLVer, NLEdge, StartIndex,
EndIndex, &numGhostEdges,
&numGhostVertices, &S,
verLocInd, verLocPtr,
Ghost2LocalMap, Counter,
verGhostPtr, verGhostInd,
tempCounter, GMate,
Message, QLocalVtx,
QGhostVtx, QMsgType, QOwner,
candidateMate, U,
privateU,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
finishTime = MPI_Wtime();
*ph0_time = finishTime - startTime; // Time taken for Phase-0: Initialization
#ifdef DEBUG_HANG_
cout << myRank << " Finished initialization" << endl;
fflush(stdout);
#endif
startTime = MPI_Wtime();
/////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// INITIALIZATION /////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
// Compute the Initial Matching Set:
/*
* OMP: PARALLEL_COMPUTE_CANDIDATE_MATE_B has been split off from
* PARALLEL_PROCESS_EXPOSED_VERTEX_B so that the two can be
* parallelized more effectively.
* PARALLEL_COMPUTE_CANDIDATE_MATE_B is now fully parallel.
*/
PARALLEL_COMPUTE_CANDIDATE_MATE_B(NLVer,
verLocPtr,
verLocInd,
myRank,
edgeLocWeight,
candidateMate);
#ifdef DEBUG_HANG_
cout << myRank << " Finished Exposed Vertex" << endl;
fflush(stdout);
#if 0
cout << myRank << " candidateMate after parallelCompute " <<endl;
for (int i=0; i<NLVer; i++) {
cout << candidateMate[i] << " " ;
}
cout << endl;
#endif
#endif
/*
* PARALLEL_PROCESS_EXPOSED_VERTEX_B
* TODO: write comment
*
* TODO: Test when it's actually more efficient to execute this code
* in parallel.
*/
PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer,
candidateMate,
verLocInd,
verLocPtr,
StartIndex,
EndIndex,
Mate,
GMate,
Ghost2LocalMap,
edgeLocWeight,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
U,
privateU,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
tempCounter.clear(); // Do not need this any more
#ifdef DEBUG_HANG_
cout << myRank << " Finished Exposed Vertex" << endl;
fflush(stdout);
#if 0
cout << myRank << " Mate after Exposed Vertices " <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MATCHED VERTICES //////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
// TODO what would be the optimal UCHUNK
vector<MilanLongInt> UChunkBeingProcessed;
UChunkBeingProcessed.reserve(UCHUNK);
processMatchedVertices(NLVer,
UChunkBeingProcessed,
U,
privateU,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verLocPtr,
verLocInd,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
candidateMate,
GMate,
Mate,
Ghost2LocalMap,
edgeLocWeight,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
#ifdef DEBUG_HANG_
cout << myRank << " Finished Process Vertices" << endl;
fflush(stdout);
#if 0
cout << myRank << " Mate after Matched Vertices " <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
/////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////// SEND BUNDLED MESSAGES /////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
sendBundledMessages(&numGhostEdges,
&BufferSize,
Buffer,
PCumulative,
PMessageBundle,
PSizeInfoMessages,
PCounter,
NumMessagesBundled,
&msgActual,
&MessageIndex,
numProcs,
myRank,
comm,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
SRequest,
SStatus);
///////////////////////// END OF SEND BUNDLED MESSAGES //////////////////////////////////
finishTime = MPI_Wtime();
*ph1_time = finishTime - startTime; // Time taken for Phase-1
#ifdef DEBUG_HANG_
cout << myRank << " Finished sendBundles" << endl;
fflush(stdout);
#endif
*ph1_card = myCard; // Cardinality at the end of Phase-1
startTime = MPI_Wtime();
/////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// MAIN LOOP //////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
// Main While Loop:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Entering While(true) loop..";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
while (true) {
#ifdef DEBUG_HANG_
//if (myRank == 0)
cout << "\n(" << myRank << ") Main loop" << endl;
fflush(stdout);
#endif
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MATCHED VERTICES //////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
processMatchedVerticesAndSendMessages(NLVer,
UChunkBeingProcessed,
U,
privateU,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verLocPtr,
verLocInd,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
candidateMate,
GMate,
Mate,
Ghost2LocalMap,
edgeLocWeight,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner,
comm,
&msgActual,
Message);
///////////////////////// END OF PROCESS MATCHED VERTICES /////////////////////////
//// BREAK IF NO MESSAGES EXPECTED /////////
#ifdef DEBUG_HANG_
#if 0
cout << myRank << " Mate after ProcessMatchedAndSend phase "<<S <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Deciding whether to break: S= " << S << endl;
#endif
if (S == 0) {
#ifdef DEBUG_HANG_
cout << "\n(" << myRank << ") Breaking out" << endl;
fflush(stdout);
#endif
break;
}
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MESSAGES //////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
processMessages(NLVer,
Mate,
candidateMate,
Ghost2LocalMap,
GMate,
Counter,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&msgActual,
edgeLocWeight,
verDistance,
verLocPtr,
k,
verLocInd,
numProcs,
myRank,
comm,
Message,
numGhostEdges,
u,
v,
&S,
U);
///////////////////////// END OF PROCESS MESSAGES /////////////////////////////////
#ifdef DEBUG_HANG_
#if 0
cout << myRank << " Mate after ProcessMessages phase "<<S <<endl;
for (int i=0; i<NLVer; i++) {
cout << Mate[i] << " " ;
}
cout << endl;
#endif
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S;
fflush(stdout);
cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual;
fflush(stdout);
cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl;
fflush(stdout);
#endif
} // End of while (true)
clean(NLVer,
myRank,
MessageIndex,
SRequest,
SStatus,
BufferSize,
Buffer,
msgActual,
msgActualSent,
msgInd,
msgIndSent,
NumMessagesBundled,
msgPercent);
finishTime = MPI_Wtime();
*ph2_time = finishTime - startTime; // Time taken for Phase-2
*ph2_card = myCard; // Cardinality at the end of Phase-2
}
// End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate
#endif
#endif

@ -97,6 +97,8 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_c_dec_aggregator_tprol'
call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_c_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
!
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then
info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1
integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm")
& idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1)
ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done.
if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block
call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols())
end block
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999
@ -168,26 +173,27 @@ subroutine amg_c_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp()
nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows()
i=0
call coo_restr%fix(info)
i=coo_restr%get_nzeros()
!
! Only keep local rows
!
do k=1, nzl
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then
i = i+1
coo_restr%val(i) = coo_restr%val(k)
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
search: do k=i,1,-1
if (coo_restr%ia(k) <= nrl) then
call coo_restr%set_nzeros(k)
exit search
end if
end do
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
end do search
nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999

@ -72,7 +72,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_c_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! Arguments
@ -85,7 +87,7 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info
! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),&
integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:)
complex(psb_spk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_tic(idx_soc1_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
icnt = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. the total number of aggregates NAGGR;
! 2. how much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
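! A worked example of the renumbering, with made-up numbers: for NR=8 rows and
! NTHS=2 threads, BNDS = (/1,5,9/); suppose thread 0 creates 2 aggregates and
! thread 1 creates 3, so LOCNAGGR(0:1) = (/2,3/) and NAGGR = 5. During the loop
! an entry is temporarily encoded as ITMP = (BNDS(KK)-1+LOCNAGGR(KK))*NTHS+KK,
! so that the defining thread is recovered as MOD(ITMP,NTHS) and the
! thread-local ordinal as ITMP/NTHS - (BNDS(KK)-1); after the exclusive prefix
! sum of LOCNAGGR, thread 0's aggregates end up numbered 1..2 and thread 1's 3..5.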
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case, neighborhood only
! contains I even if it does not look like it from matrix)
! The fact that DISJOINT is private and not under lock
! generates a certain un-repeatability, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if the entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii)
if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1
do k=1, ip
ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
& ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -274,7 +461,6 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
enddo
if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1
ilaggr(i) = naggr
do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999
@ -336,9 +528,13 @@ subroutine amg_c_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0
nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free()
call psb_erractionrestore(err_act)
return

@ -71,6 +71,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_c_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
@ -99,6 +102,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do
end do
!$omp end parallel do
!
! Compute the 1-neigbour; mark strong links with +1, weak links with -1
!
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = sone
s_neigh_coo%val(k) = sone
else
s_neigh_coo%val(ip) = -sone
s_neigh_coo%val(k) = -sone
end if
else
s_neigh_coo%val(k) = -sone
end if
end do
end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip)
call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info)
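!
! Illustrative only: a minimal, self-contained sketch (not library code, all
! values hypothetical) of the strength-of-connection measure computed above:
!   mu(i,j) = |a(i,j)| / sqrt(|a(i,i)*a(j,j)|)
! and a link i->j is marked strong when mu(i,j) >= theta (the SOC1 variant
! applies the equivalent test |a(i,j)| > theta*sqrt(|a(i,i)*a(j,j)|)).
program soc_measure_sketch
  implicit none
  integer, parameter :: nz = 3
  real, parameter :: theta = 0.4
  real :: dii, djj(nz), aij(nz), mu
  integer :: k
  ! hypothetical diagonal of row i, diagonals of its neighbours j, off-diagonals a(i,j)
  dii = 4.0
  djj = [4.0, 4.0, 10.0]
  aij = [-2.0, -0.5, -1.0]
  do k = 1, nz
    mu = abs(aij(k))/sqrt(abs(dii*djj(k)))
    if (mu >= theta) then
      write(*,*) k, ' strong link, mu =', mu
    else
      write(*,*) k, ' weak   link, mu =', mu
    end if
  end do
end program soc_measure_sketch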
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr and, when the loop ends, a prefix sum
! is applied to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! introduces a certain unrepeatability: between computing
! DISJOINT and assigning, another thread might alter the
! values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct
! aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0
step1: do ii=1, nr
i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
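!
! Illustrative only: a self-contained sketch (hypothetical data, not part of
! the library) of the encoding and decoding of thread-local aggregate numbers
! used in the OpenMP branch above.  Thread KK encodes its LOCNAGGR(KK)-th
! aggregate as ITMP = (BNDS(KK)-1+LOCNAGGR(KK))*NTHS + KK, so that
! MOD(ITMP,NTHS) recovers the owning thread and the quotient, shifted by the
! exclusive prefix sum of LOCNAGGR, gives the final aggregate number.
program aggr_encode_decode_sketch
  implicit none
  integer, parameter :: nths = 4
  integer :: bnds(0:nths), locnaggr(0:nths+1)
  integer :: kk, kp, itmp, i
  bnds = [1, 4, 7, 9, 11]            ! hypothetical split of 10 rows: sizes 3,3,2,2
  locnaggr = 0
  locnaggr(0:nths-1) = [2, 1, 2, 1]  ! hypothetical per-thread counts from phase one
  kk = 2                             ! thread 2 creates its 2nd local aggregate
  itmp = (bnds(kk)-1+2)*nths + kk    ! = (6+2)*4+2 = 34
  do i = 1, nths                     ! inclusive prefix sum ...
    locnaggr(i) = locnaggr(i) + locnaggr(i-1)
  end do
  do i = nths+1, 1, -1               ! ... shifted to an exclusive one
    locnaggr(i) = locnaggr(i-1)
  end do
  locnaggr(0) = 0                    ! locnaggr(0:nths) = [0,2,3,5,6]
  kp = mod(itmp, nths)               ! = 2: the thread that created this aggregate
  write(*,*) 'global aggregate number:', (itmp/nths) - (bnds(kp)-1) + locnaggr(kp)  ! = 8-6+3 = 5
end program aggr_encode_decode_sketch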
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -294,6 +477,8 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr
if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_c_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

@ -140,6 +140,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_spk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false.
character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld'
info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context()
call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates
! nrow: local rows.
!
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D
adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
!
! Build the filtered matrix Af from A
!
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow
tmp = czero
jd = -1
@ -214,11 +235,13 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp
end if
enddo
!$OMP end parallel do
! Take out zeroed terms
call acsrf%clean_zeros(info)
end if
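!
! Illustrative only: the exact dropping test is not visible in this hunk; the
! dense sketch below assumes the usual criterion that an off-diagonal entry is
! filtered out of Af when |a(i,j)| < theta*sqrt(|a(i,i)*a(j,j)|), with the
! dropped entries lumped onto the diagonal (af(i,i) = a(i,i) - sum of dropped
! a(i,j)), which is what the CSR loop above does via TMP and JD.
program filter_lump_sketch
  implicit none
  integer, parameter :: n = 3
  real, parameter :: theta = 0.2
  real :: a(n,n), af(n,n), tmp
  integer :: i, j
  a = reshape([ 4.,-1.,-0.1,  -1., 4.,-1.,  -0.1,-1., 4. ], [n,n])
  af = a
  do i = 1, n
    tmp = 0.0
    do j = 1, n
      if (j == i) cycle
      if (abs(a(i,j)) < theta*sqrt(abs(a(i,i)*a(j,j)))) then
        tmp = tmp + a(i,j)     ! accumulate the dropped (weak) entries
        af(i,j) = 0.0
      end if
    end do
    af(i,i) = af(i,i) - tmp    ! lump them onto the diagonal
  end do
  do i = 1, n
    write(*,*) af(i,:)
  end do
end program filter_lump_sketch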
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag)
if (adiag(i) /= czero) then
adiag(i) = cone / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = cone
end if
end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
!
! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol or acsr1 = (I-w*D*Af) Prol
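!
! Illustrative only (hypothetical 3x2 dense example, not library code): the
! smoothed prolongator is P_s = (I - omega*D*A)*P, where D holds the inverted
! diagonal entries of A (or of the filtered matrix Af, as set up in ADIAG
! above) and P is the tentative prolongator.
program smoothed_prol_sketch
  implicit none
  integer, parameter :: n = 3, nagg = 2
  real :: a(n,n), p(n,nagg), ps(n,nagg), dinv(n), omega
  integer :: i
  a = reshape([ 2.,-1., 0.,  -1., 2.,-1.,  0.,-1., 2. ], [n,n])
  p = reshape([ 1., 1., 0.,   0., 0., 1. ], [n,nagg])   ! tentative (piecewise constant) prolongator
  do i = 1, n
    dinv(i) = 1.0/a(i,i)
  end do
  omega = 2.0/3.0
  ! P_s = P - omega * D^{-1} * (A*P)
  ps = p - omega*spread(dinv,2,nagg)*matmul(a,p)
  do i = 1, n
    write(*,*) ps(i,:)
  end do
end program smoothed_prol_sketch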
@ -279,8 +305,8 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_caggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate '

@ -97,6 +97,8 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_d_dec_aggregator_tprol'
call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_d_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
!
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then
info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1
integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm")
& idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1)
ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done.
if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block
call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols())
end block
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999
@ -168,26 +173,27 @@ subroutine amg_d_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp()
nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows()
i=0
call coo_restr%fix(info)
i=coo_restr%get_nzeros()
!
! Only keep local rows
!
do k=1, nzl
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then
i = i+1
coo_restr%val(i) = coo_restr%val(k)
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
search: do k=i,1,-1
if (coo_restr%ia(k) <= nrl) then
call coo_restr%set_nzeros(k)
exit search
end if
end do
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
end do search
nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
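!
! Illustrative only: after coo_restr%fix() the COO triples are sorted by row
! index, so all non-local rows (> nrl) sit at the tail.  A backward search for
! the last entry with ia(k) <= nrl is then enough to truncate to local rows,
! which is what the SEARCH loop above does.  The sketch below works on plain
! arrays with hypothetical data and assumes at least one local entry exists.
program keep_local_rows_sketch
  implicit none
  integer, parameter :: nrl = 3
  integer :: ia(6), ja(6), k, nzl
  real    :: val(6)
  ! row-sorted COO triples; rows 4 and 5 are non-local
  ia  = [1, 1, 2, 3, 4, 5]
  ja  = [1, 2, 2, 3, 1, 2]
  val = [1., 2., 3., 4., 5., 6.]
  nzl = size(ia)
  search: do k = nzl, 1, -1
    if (ia(k) <= nrl) then
      nzl = k            ! keep entries 1..k only
      exit search
    end if
  end do search
  write(*,*) 'local nonzeros kept:', nzl   ! prints 4
end program keep_local_rows_sketch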
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999

@ -72,7 +72,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_d_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! Arguments
@ -85,7 +87,7 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info
! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),&
integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:)
real(psb_dpk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_tic(idx_soc1_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
icnt = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr and, when the loop ends, a prefix sum
! is applied to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! introduces a certain unrepeatability: between computing
! DISJOINT and assigning, another thread might alter the
! values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct
! aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii)
if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1
do k=1, ip
ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
& ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -274,7 +461,6 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
enddo
if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1
ilaggr(i) = naggr
do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999
@ -336,9 +528,13 @@ subroutine amg_d_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0
nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free()
call psb_erractionrestore(err_act)
return

@ -71,6 +71,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_d_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
@ -99,6 +102,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do
end do
!$omp end parallel do
!
! Compute the 1-neighbour matrix; mark strong links with +1, weak links with -1
!
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = done
s_neigh_coo%val(k) = done
else
s_neigh_coo%val(ip) = -done
s_neigh_coo%val(k) = -done
end if
else
s_neigh_coo%val(k) = -done
end if
end do
end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip)
call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr and, when the loop ends, a prefix sum
! is applied to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! introduces a certain unrepeatability: between computing
! DISJOINT and assigning, another thread might alter the
! values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct
! aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0
step1: do ii=1, nr
i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -294,6 +477,8 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr
if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_d_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

@ -140,6 +140,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_dpk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false.
character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld'
info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context()
call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates
! nrow: local rows.
!
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D
adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
!
! Build the filtered matrix Af from A
!
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow
tmp = dzero
jd = -1
@ -214,11 +235,13 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp
end if
enddo
!$OMP end parallel do
! Take out zeroed terms
call acsrf%clean_zeros(info)
end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag)
if (adiag(i) /= dzero) then
adiag(i) = done / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = done
end if
end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
!
! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol or acsr1 = (I-w*D*Af) Prol
@ -279,8 +305,8 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_daggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate '

@ -97,6 +97,8 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_s_dec_aggregator_tprol'
call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_s_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
!
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then
info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1
integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm")
& idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1)
ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done.
if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block
call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols())
end block
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999
@ -168,26 +173,27 @@ subroutine amg_s_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp()
nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows()
i=0
call coo_restr%fix(info)
i=coo_restr%get_nzeros()
!
! Only keep local rows
!
do k=1, nzl
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then
i = i+1
coo_restr%val(i) = coo_restr%val(k)
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
search: do k=i,1,-1
if (coo_restr%ia(k) <= nrl) then
call coo_restr%set_nzeros(k)
exit search
end if
end do
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
end do search
nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999

@ -72,7 +72,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_s_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! Arguments
@ -85,7 +87,7 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info
! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),&
integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:)
real(psb_spk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_tic(idx_soc1_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
icnt = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr and, when the loop ends, a prefix sum
! is applied to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! introduces a certain unrepeatability: between computing
! DISJOINT and assigning, another thread might alter the
! values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct
! aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii)
if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1
do k=1, ip
ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
& ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -274,7 +461,6 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
enddo
if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1
ilaggr(i) = naggr
do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999
@ -336,9 +528,13 @@ subroutine amg_s_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0
nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free()
call psb_erractionrestore(err_act)
return

@ -71,6 +71,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_s_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
@ -99,6 +102,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do
end do
!$omp end parallel do
!
! Compute the 1-neighbour matrix; mark strong links with +1, weak links with -1
!
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = sone
s_neigh_coo%val(k) = sone
else
s_neigh_coo%val(ip) = -sone
s_neigh_coo%val(k) = -sone
end if
else
s_neigh_coo%val(k) = -sone
end if
end do
end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip)
call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr and, when the loop ends, a prefix sum
! is applied to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH):BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I, even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under lock
! introduces a certain unrepeatability: between computing
! DISJOINT and assigning, another thread might alter the
! values of ILAGGR.
! However, a certain unrepeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even though entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct
! aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0
step1: do ii=1, nr
i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -294,6 +477,8 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr
if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_s_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

@ -140,6 +140,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_spk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false.
character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld'
info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context()
call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates
! nrow: local rows.
!
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D
adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
!
! Build the filtered matrix Af from A
!
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow
tmp = szero
jd = -1
@ -214,11 +235,13 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp
end if
enddo
!$OMP end parallel do
! Take out zeroed terms
call acsrf%clean_zeros(info)
end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag)
if (adiag(i) /= szero) then
adiag(i) = sone / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = sone
end if
end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
!
! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol
@ -279,8 +305,8 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_saggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate '

@ -97,6 +97,8 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,&
integer(psb_lpk_) :: ntaggr
integer(psb_ipk_) :: debug_level, debug_unit
logical :: clean_zeros
integer(psb_ipk_), save :: idx_map_bld=-1, idx_map_tprol=-1
logical, parameter :: do_timings=.false.
name='amg_z_dec_aggregator_tprol'
call psb_erractionsave(err_act)
@ -108,6 +110,10 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,&
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_map_bld==-1)) &
& idx_map_bld = psb_get_timer_idx("DEC_TPROL: map_bld")
if ((do_timings).and.(idx_map_tprol==-1)) &
& idx_map_tprol = psb_get_timer_idx("DEC_TPROL: map_tprol")
call amg_check_def(parms%ml_cycle,'Multilevel cycle',&
& amg_mult_ml_,is_legal_ml_cycle)
@ -121,10 +127,14 @@ subroutine amg_z_dec_aggregator_build_tprol(ag,parms,ag_data,&
! The decoupled aggregator based on SOC measures ignores
! ag_data except for clean_zeros; soc_map_bld is a procedure pointer.
!
if (do_timings) call psb_tic(idx_map_bld)
clean_zeros = ag%do_clean_zeros
call ag%soc_map_bld(parms%aggr_ord,parms%aggr_thresh,clean_zeros,a,desc_a,nlaggr,ilaggr,info)
if (do_timings) call psb_toc(idx_map_bld)
if (do_timings) call psb_tic(idx_map_tprol)
if (info==psb_success_) call amg_map_to_tprol(desc_a,ilaggr,nlaggr,t_prol,info)
if (do_timings) call psb_toc(idx_map_tprol)
if (info /= psb_success_) then
info=psb_err_from_subroutine_
call psb_errpush(info,name,a_err='soc_map_bld/map_to_tprol')

@ -76,7 +76,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
integer(psb_ipk_) :: nrow, ncol, nrl, nzl, ip, nzt, i, k
integer(psb_lpk_) :: nrsave, ncsave, nzsave, nza
logical, parameter :: do_timings=.false., oldstyle=.false., debug=.false.
integer(psb_ipk_), save :: idx_spspmm=-1
integer(psb_ipk_), save :: idx_spspmm=-1, idx_cpytrans1=-1, idx_cpytrans2=-1
name='amg_ptap_bld'
if(psb_get_errstatus().ne.0) return
@ -93,7 +93,11 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
ncol = desc_a%get_local_cols()
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("SPMM_BLD: par_spspmm")
& idx_spspmm = psb_get_timer_idx("PTAP_BLD: par_spspmm")
if ((do_timings).and.(idx_cpytrans1==-1)) &
& idx_cpytrans1 = psb_get_timer_idx("PTAP_BLD: cpy&trans1")
if ((do_timings).and.(idx_cpytrans2==-1)) &
& idx_cpytrans2 = psb_get_timer_idx("PTAP_BLD: cpy&trans2")
naggr = nlaggr(me+1)
ntaggr = sum(nlaggr)
@ -128,6 +132,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
! Ok first product done.
if (present(desc_ax)) then
if (do_timings) call psb_tic(idx_cpytrans1)
block
call coo_prol%cp_to_coo(coo_restr,info)
call coo_restr%set_ncols(desc_ac%get_local_cols())
@ -137,7 +142,7 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%set_ncols(desc_ax%get_local_cols())
end block
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans1)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999
@ -168,26 +173,27 @@ subroutine amg_z_ptap_bld(a_csr,desc_a,nlaggr,parms,ac,&
call coo_restr%transp()
nzl = coo_restr%get_nzeros()
nrl = desc_ac%get_local_rows()
i=0
call coo_restr%fix(info)
i=coo_restr%get_nzeros()
!
! Only keep local rows
!
do k=1, nzl
if ((1 <= coo_restr%ia(k)) .and.(coo_restr%ia(k) <= nrl)) then
i = i+1
coo_restr%val(i) = coo_restr%val(k)
coo_restr%ia(i) = coo_restr%ia(k)
coo_restr%ja(i) = coo_restr%ja(k)
search: do k=i,1,-1
if (coo_restr%ia(k) <= nrl) then
call coo_restr%set_nzeros(k)
exit search
end if
end do
call coo_restr%set_nzeros(i)
call coo_restr%fix(info)
end do search
nzl = coo_restr%get_nzeros()
call coo_restr%set_nrows(desc_ac%get_local_rows())
call coo_restr%set_ncols(desc_a%get_local_cols())
if (debug) call check_coo(me,trim(name)//' Check 2 on coo_restr:',coo_restr)
if (do_timings) call psb_tic(idx_cpytrans2)
call csr_restr%cp_from_coo(coo_restr,info)
if (do_timings) call psb_toc(idx_cpytrans2)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='spcnv coo_restr')
goto 9999

@ -72,7 +72,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_z_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! Arguments
@ -85,7 +87,7 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_), intent(out) :: info
! Local variables
integer(psb_ipk_), allocatable :: ils(:), neigh(:), irow(:), icol(:),&
integer(psb_ipk_), allocatable :: neigh(:), irow(:), icol(:),&
& ideg(:), idxs(:)
integer(psb_lpk_), allocatable :: tmpaggr(:)
complex(psb_dpk_), allocatable :: val(:), diag(:)
@ -99,6 +101,9 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: nrow, ncol, n_ne
integer(psb_lpk_) :: nrglob
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc1_p1=-1, idx_soc1_p2=-1, idx_soc1_p3=-1
integer(psb_ipk_), save :: idx_soc1_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc1_map_bld'
@ -114,6 +119,14 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc1_p0==-1)) &
& idx_soc1_p0 = psb_get_timer_idx("SOC1_MAP: phase0")
if ((do_timings).and.(idx_soc1_p1==-1)) &
& idx_soc1_p1 = psb_get_timer_idx("SOC1_MAP: phase1")
if ((do_timings).and.(idx_soc1_p2==-1)) &
& idx_soc1_p2 = psb_get_timer_idx("SOC1_MAP: phase2")
if ((do_timings).and.(idx_soc1_p3==-1)) &
& idx_soc1_p3 = psb_get_timer_idx("SOC1_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -133,33 +146,194 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc1_p0)
call a%cp_to(acsr)
if (do_timings) call psb_toc(idx_soc1_p0)
if (clean_zeros) call acsr%clean_zeros(info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = acsr%irp(i+1) - acsr%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_tic(idx_soc1_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
icnt = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz, nc, i,j,m, nz, ilg, ip, rsz
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) cycle step1
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if ((nz<0).or.(nz>size(icol))) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
val(1:nz) = acsr%val(acsr%irp(i):acsr%irp(i+1)-1)
!
! Build the set of all strongly coupled nodes
!
ip = 0
do k=1, nz
j = icol(k)
! If any of the neighbours is already assigned,
! we will not reset.
if (ilaggr(j) > 0) cycle step1
if (abs(val(k)) > theta*sqrt(abs(diag(i)*diag(j)))) then
ip = ip + 1
icol(ip) = icol(k)
end if
enddo
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under a lock
! introduces a certain non-repeatability, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-repeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, ip
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
step1: do ii=1, nr
if (info /= 0) cycle
i = idxs(ii)
if ((i<1).or.(i>nr)) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
@ -167,7 +341,8 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
if ((nz<0).or.(nz>size(icol))) then
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
cycle step1
!goto 9999
end if
icol(1:nz) = acsr%ja(acsr%irp(i):acsr%irp(i+1)-1)
@ -195,7 +370,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
disjoint = all(ilaggr(icol(1:ip)) == -(nr+1)).or.(ip==0)
if (disjoint) then
icnt = icnt + 1
naggr = naggr + 1
do k=1, ip
ilaggr(icol(k)) = naggr
@ -204,16 +378,22 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
& ' Check 1:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_toc(idx_soc1_p1)
if (do_timings) call psb_tic(idx_soc1_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,theta)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -244,8 +424,15 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc1_p2)
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1.5:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
if (do_timings) call psb_tic(idx_soc1_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -274,7 +461,6 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
enddo
if (ip > 0) then
icnt = icnt + 1
naggr = naggr + 1
ilaggr(i) = naggr
do k=1, ip
@ -292,7 +478,10 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,info)&
!$omp private(ii,i,j,k,nz,icol,val,ip)
do i=1, nr
if (info /= 0) cycle
if (ilaggr(i) < 0) then
nz = (acsr%irp(i+1)-acsr%irp(i))
if (nz == 1) then
@ -303,15 +492,18 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc1_p3)
if (naggr > ncol) then
!write(0,*) name,'Error : naggr > ncol',naggr,ncol
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')
goto 9999
@ -336,9 +528,13 @@ subroutine amg_z_soc1_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nlaggr(:) = 0
nlaggr(me+1) = naggr
call psb_sum(ctxt,nlaggr(1:np))
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 2:',naggr,count(ilaggr(1:nr) == -(nr+1)), count(ilaggr(1:nr)>0),&
& count(ilaggr(1:nr) == -(nr+1))+count(ilaggr(1:nr)>0),nr
end if
call acsr%free()
call psb_erractionrestore(err_act)
return

@ -71,6 +71,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
use psb_base_mod
use amg_base_prec_type
use amg_z_inner_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
@ -99,6 +102,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
integer(psb_ipk_) :: np, me
integer(psb_ipk_) :: nrow, ncol, n_ne
character(len=20) :: name, ch_err
integer(psb_ipk_), save :: idx_soc2_p1=-1, idx_soc2_p2=-1, idx_soc2_p3=-1
integer(psb_ipk_), save :: idx_soc2_p0=-1
logical, parameter :: do_timings=.true.
info=psb_success_
name = 'amg_soc2_map_bld'
@ -114,6 +120,14 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
nrow = desc_a%get_local_rows()
ncol = desc_a%get_local_cols()
nrglob = desc_a%get_global_rows()
if ((do_timings).and.(idx_soc2_p0==-1)) &
& idx_soc2_p0 = psb_get_timer_idx("SOC2_MAP: phase0")
if ((do_timings).and.(idx_soc2_p1==-1)) &
& idx_soc2_p1 = psb_get_timer_idx("SOC2_MAP: phase1")
if ((do_timings).and.(idx_soc2_p2==-1)) &
& idx_soc2_p2 = psb_get_timer_idx("SOC2_MAP: phase2")
if ((do_timings).and.(idx_soc2_p3==-1)) &
& idx_soc2_p3 = psb_get_timer_idx("SOC2_MAP: phase3")
nr = a%get_nrows()
nc = a%get_ncols()
@ -125,6 +139,7 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
goto 9999
end if
if (do_timings) call psb_tic(idx_soc2_p0)
diag = a%get_diag(info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
@ -137,55 +152,217 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
!
call a%cp_to(muij)
if (clean_zeros) call muij%clean_zeros(info)
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
if (j<= nr) muij%val(k) = abs(muij%val(k))/sqrt(abs(diag(i)*diag(j)))
end do
end do
!$omp end parallel do
!
! Compute the 1-neighbourhood; mark strong links with +1, weak links with -1
!
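! For instance (hypothetical values, just to illustrate the test below):
! if muij(i,j) = |a(i,j)|/sqrt(|a(i,i)*a(j,j)|) evaluates to 0.3 and
! theta = 0.25, the link (i,j) is marked strong (+1); with theta = 0.4
! the same link would be marked weak (-1).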
call s_neigh_coo%allocate(nr,nr,muij%get_nzeros())
ip = 0
!$omp parallel do private(i,j,k) shared(nr,diag,muij) schedule(static)
do i=1, nr
do k=muij%irp(i),muij%irp(i+1)-1
j = muij%ja(k)
s_neigh_coo%ia(k) = i
s_neigh_coo%ja(k) = j
if (j<=nr) then
ip = ip + 1
s_neigh_coo%ia(ip) = i
s_neigh_coo%ja(ip) = j
if (real(muij%val(k)) >= theta) then
s_neigh_coo%val(ip) = done
s_neigh_coo%val(k) = done
else
s_neigh_coo%val(ip) = -done
s_neigh_coo%val(k) = -done
end if
else
s_neigh_coo%val(k) = -done
end if
end do
end do
!$omp end parallel do
!write(*,*) 'S_NEIGH: ',nr,ip
call s_neigh_coo%set_nzeros(ip)
call s_neigh_coo%set_nzeros(muij%get_nzeros())
call s_neigh%mv_from_coo(s_neigh_coo,info)
if (iorder == amg_aggr_ord_nat_) then
!$omp parallel do private(i) shared(ilaggr,idxs) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
idxs(i) = i
end do
!$omp end parallel do
else
!$omp parallel do private(i) shared(ilaggr,idxs,muij) schedule(static)
do i=1, nr
ilaggr(i) = -(nr+1)
ideg(i) = muij%irp(i+1) - muij%irp(i)
end do
!$omp end parallel do
call psb_msort(ideg,ix=idxs,dir=psb_sort_down_)
end if
if (do_timings) call psb_toc(idx_soc2_p0)
if (do_timings) call psb_tic(idx_soc2_p1)
!
! Phase one: Start with disjoint groups.
!
naggr = 0
#if defined(OPENMP)
block
integer(psb_ipk_), allocatable :: bnds(:), locnaggr(:)
integer(psb_ipk_) :: myth,nths, kk
! The parallelization makes use of a locnaggr(:) array; each thread
! keeps its own version of naggr, and when the loop ends, a prefix sum is applied
! to locnaggr to determine:
! 1. The total number of aggregates NAGGR;
! 2. How much each thread should shift its own aggregates.
! Part 2 requires keeping track of which thread defined each entry
! of ilaggr(), so that each entry can be adjusted correctly: even
! if an entry I belongs to the range BNDS(TH) : BNDS(TH+1)-1, it may have
! been set because it is strongly connected to an entry J belonging to a
! different thread.
!$omp parallel shared(s_neigh,bnds,idxs,locnaggr,ilaggr,nr,naggr,diag,theta,nths,info) &
!$omp private(icol,val,myth,kk)
block
integer(psb_ipk_) :: ii,nlp,k,kp,n,ia,isz,nc,i,j,m,nz,ilg,ip,rsz,ip1,nzcnt
integer(psb_lpk_) :: itmp
!$omp master
nths = omp_get_num_threads()
allocate(bnds(0:nths),locnaggr(0:nths+1))
locnaggr(:) = 0
bnds(0) = 1
!$omp end master
!$omp barrier
myth = omp_get_thread_num()
rsz = nr/nths
if (myth < mod(nr,nths)) rsz = rsz + 1
bnds(myth+1) = rsz
!$omp barrier
!$omp master
do i=1,nths
bnds(i) = bnds(i) + bnds(i-1)
end do
info = 0
!$omp end master
!$omp barrier
!$omp do schedule(static) private(disjoint)
do kk=0, nths-1
step1: do ii=bnds(kk), bnds(kk+1)-1
i = idxs(ii)
if (info /= 0) then
write(0,*) ' Step1:',kk,ii,i,info
cycle step1
end if
if ((i<1).or.(i>nr)) then
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name)
cycle step1
!goto 9999
end if
if (ilaggr(i) == -(nr+1)) then
!
! Get the 1-neighbourhood of I
!
ip1 = s_neigh%irp(i)
nz = s_neigh%irp(i+1)-ip1
!
! If the neighbourhood only contains I, skip it
!
if (nz ==0) then
ilaggr(i) = 0
cycle step1
end if
if ((nz==1).and.(s_neigh%ja(ip1)==i)) then
ilaggr(i) = 0
cycle step1
end if
nzcnt = count(real(s_neigh%val(ip1:ip1+nz-1)) > 0)
icol(1:nzcnt) = pack(s_neigh%ja(ip1:ip1+nz-1),(real(s_neigh%val(ip1:ip1+nz-1)) > 0))
disjoint = all(ilaggr(icol(1:nzcnt)) == -(nr+1))
!
! If the whole strongly coupled neighborhood of I is
! as yet unconnected, turn it into the next aggregate.
! Same if ip==0 (in which case the neighborhood only
! contains I even if it does not look like it from the matrix).
! The fact that DISJOINT is private and not under a lock
! introduces a certain non-repeatability, in that between
! computing DISJOINT and assigning, another thread might
! alter the values of ILAGGR.
! However, a certain non-repeatability is already present
! because the sequence of aggregates is computed in a
! different order than in serial mode.
! In any case, even if entries of ILAGGR may be
! overwritten, the important thing is that each entry is
! consistent and that together they generate a correct aggregation map.
!
if (disjoint) then
locnaggr(kk) = locnaggr(kk) + 1
itmp = (bnds(kk)-1+locnaggr(kk))*nths+kk
if (itmp < (bnds(kk)-1+locnaggr(kk))) then
!$omp atomic update
info = max(12345678,info)
!$omp end atomic
cycle step1
end if
!$omp atomic write
ilaggr(i) = itmp
!$omp end atomic
do k=1, nzcnt
!$omp atomic write
ilaggr(icol(k)) = itmp
!$omp end atomic
end do
end if
end if
enddo step1
end do
!$omp end do
!$omp master
naggr = sum(locnaggr(0:nths-1))
do i=1,nths
locnaggr(i) = locnaggr(i) + locnaggr(i-1)
end do
do i=nths+1,1,-1
locnaggr(i) = locnaggr(i-1)
end do
locnaggr(0) = 0
!write(0,*) 'LNAG ',locnaggr(nths+1)
!$omp end master
!$omp barrier
!$omp do schedule(static)
do kk=0, nths-1
do ii=bnds(kk), bnds(kk+1)-1
if (ilaggr(ii) > 0) then
kp = mod(ilaggr(ii),nths)
ilaggr(ii) = (ilaggr(ii)/nths)- (bnds(kp)-1) + locnaggr(kp)
end if
end do
end do
!$omp end do
end block
!$omp end parallel
end block
if (info /= 0) then
if (info == 12345678) write(0,*) 'Overflow in encoding ILAGGR'
info=psb_err_internal_error_
call psb_errpush(info,name)
goto 9999
end if
#else
icnt = 0
step1: do ii=1, nr
i = idxs(ii)
@ -224,16 +401,21 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
endif
enddo step1
#endif
if (debug_level >= psb_debug_outer_) then
write(debug_unit,*) me,' ',trim(name),&
& ' Check 1:',count(ilaggr == -(nr+1))
end if
if (do_timings) call psb_toc(idx_soc2_p1)
if (do_timings) call psb_tic(idx_soc2_p2)
!
! Phase two: join the neighbours
!
!$omp workshare
tmpaggr = ilaggr
!$omp end workshare
!$omp parallel do schedule(static) shared(tmpaggr,ilaggr,nr,naggr,diag,muij,s_neigh)&
!$omp private(ii,i,j,k,nz,icol,val,ip,cpling)
step2: do ii=1,nr
i = idxs(ii)
@ -259,8 +441,9 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end if
end if
end do step2
!$omp end parallel do
if (do_timings) call psb_toc(idx_soc2_p2)
if (do_timings) call psb_tic(idx_soc2_p3)
!
! Phase three: sweep over leftovers, if any
!
@ -294,6 +477,8 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
end do step3
! Any leftovers?
!$omp parallel do schedule(static) shared(ilaggr,s_neigh,info)&
!$omp private(ii,i,j,k)
do i=1, nr
if (ilaggr(i) <= 0) then
nz = (s_neigh%irp(i+1)-s_neigh%irp(i))
@ -305,13 +490,17 @@ subroutine amg_z_soc2_map_bld(iorder,theta,clean_zeros,a,desc_a,nlaggr,ilaggr,in
! other processes.
ilaggr(i) = -(nrglob+nr)
else
!$omp atomic write
info=psb_err_internal_error_
!$omp end atomic
call psb_errpush(info,name,a_err='Fatal error: non-singleton leftovers')
goto 9999
cycle
endif
end if
end do
!$omp end parallel do
if (info /= 0) goto 9999
if (do_timings) call psb_toc(idx_soc2_p3)
if (naggr > ncol) then
info=psb_err_internal_error_
call psb_errpush(info,name,a_err='Fatal error: naggr>ncol')

@ -140,6 +140,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
real(psb_dpk_) :: anorm, omega, tmp, dg, theta
logical, parameter :: debug_new=.false.
character(len=80) :: filename
logical, parameter :: do_timings=.false.
integer(psb_ipk_), save :: idx_spspmm=-1, idx_phase1=-1, idx_gtrans=-1, idx_phase2=-1, idx_refine=-1
integer(psb_ipk_), save :: idx_phase3=-1, idx_cdasb=-1, idx_ptap=-1
name='amg_aggrmat_smth_bld'
info=psb_success_
@ -153,6 +156,23 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
ctxt = desc_a%get_context()
call psb_info(ctxt, me, np)
if ((do_timings).and.(idx_spspmm==-1)) &
& idx_spspmm = psb_get_timer_idx("DEC_SMTH_BLD: par_spspmm")
if ((do_timings).and.(idx_phase1==-1)) &
& idx_phase1 = psb_get_timer_idx("DEC_SMTH_BLD: phase1 ")
if ((do_timings).and.(idx_phase2==-1)) &
& idx_phase2 = psb_get_timer_idx("DEC_SMTH_BLD: phase2 ")
if ((do_timings).and.(idx_phase3==-1)) &
& idx_phase3 = psb_get_timer_idx("DEC_SMTH_BLD: phase3 ")
if ((do_timings).and.(idx_gtrans==-1)) &
& idx_gtrans = psb_get_timer_idx("DEC_SMTH_BLD: gtrans ")
if ((do_timings).and.(idx_refine==-1)) &
& idx_refine = psb_get_timer_idx("DEC_SMTH_BLD: refine ")
if ((do_timings).and.(idx_cdasb==-1)) &
& idx_cdasb = psb_get_timer_idx("DEC_SMTH_BLD: cdasb ")
if ((do_timings).and.(idx_ptap==-1)) &
& idx_ptap = psb_get_timer_idx("DEC_SMTH_BLD: ptap_bld ")
nglob = desc_a%get_global_rows()
nrow = desc_a%get_local_rows()
@ -171,6 +191,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
! naggr: number of local aggregates
! nrow: local rows.
!
if (do_timings) call psb_tic(idx_phase1)
! Get the diagonal D
adiag = a%get_diag(info)
@ -196,7 +217,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
!
! Build the filtered matrix Af from A
!
!$OMP parallel do private(i,j,tmp,jd) schedule(static)
do i=1, nrow
tmp = zzero
jd = -1
@ -214,11 +235,13 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
acsrf%val(jd)=acsrf%val(jd)-tmp
end if
enddo
!$OMP end parallel do
! Take out zeroed terms
call acsrf%clean_zeros(info)
end if
!$OMP parallel do private(i) schedule(static)
do i=1,size(adiag)
if (adiag(i) /= zzero) then
adiag(i) = zone / adiag(i)
@ -226,7 +249,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
adiag(i) = zone
end if
end do
!$OMP end parallel do
if (parms%aggr_omega_alg == amg_eig_est_) then
if (parms%aggr_eig == amg_max_norm_) then
@ -252,8 +275,9 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(info,name,a_err='invalid amg_aggr_omega_alg_')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase1)
if (do_timings) call psb_tic(idx_phase2)
call acsrf%scal(adiag,info)
if (info /= psb_success_) goto 9999
@ -267,6 +291,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_cdasb(desc_ac,info)
call psb_cd_reinit(desc_ac,info)
if (do_timings) call psb_toc(idx_phase2)
if (do_timings) call psb_tic(idx_phase3)
!
! Build the smoothed prolongator using either A or Af
! acsr1 = (I-w*D*A) Prol acsr1 = (I-w*D*Af) Prol
@ -279,8 +305,8 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call psb_errpush(psb_err_from_subroutine_,name,a_err='spspmm 1')
goto 9999
end if
if (do_timings) call psb_toc(idx_phase3)
if (do_timings) call psb_tic(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done SPSPMM 1'
@ -292,7 +318,7 @@ subroutine amg_zaggrmat_smth_bld(a,desc_a,ilaggr,nlaggr,parms,&
call op_prol%mv_from(coo_prol)
call op_restr%mv_from(coo_restr)
if (do_timings) call psb_toc(idx_ptap)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),&
& 'Done smooth_aggregate '

@ -0,0 +1,91 @@
#include "MatchBoxPC.h"
// TODO comment
void clean(MilanLongInt NLVer,
MilanInt myRank,
MilanLongInt MessageIndex,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus,
MilanInt BufferSize,
MilanLongInt *Buffer,
MilanLongInt msgActual,
MilanLongInt *msgActualSent,
MilanLongInt msgInd,
MilanLongInt *msgIndSent,
MilanLongInt NumMessagesBundled,
MilanReal *msgPercent)
{
// Cleanup Phase
#pragma omp parallel
{
#pragma omp master
{
#pragma omp task
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Waitall= " << endl;
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
cout << "\n(" << myRank << ") Waitall " << endl;
fflush(stdout);
#endif
//return;
MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]);
// MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer
if (BufferSize > 0)
{
MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer
free(Buffer); // Free the memory that was allocated
}
}
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")End of function to compute matching: " << endl;
fflush(stdout);
cout << "\n(" << myRank << ")myCardinality: " << myCard << endl;
fflush(stdout);
cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl;
fflush(stdout);
cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges;
cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2;
cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled;
cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd;
if (msgInd > 0)
{
cout << "\n(" << myRank << ") Percentage of messages bundled = " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n";
}
fflush(stdout);
#endif
#pragma omp task
{
*msgActualSent = msgActual;
*msgIndSent = msgInd;
if (msgInd > 0)
{
*msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0;
}
else
{
*msgPercent = 0;
}
}
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Done" << endl;
fflush(stdout);
#endif
}
}
}

@ -0,0 +1,73 @@
#include "MatchBoxPC.h"
/**
* Search for the candidate mate without checking whether the vertices are already matched.
* Returns the position (in verLocInd/edgeLocWeight) of the heaviest edge.
* @param adj1
* @param adj2
* @param verLocInd
* @param edgeLocWeight
* @return
*/
MilanLongInt firstComputeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanLongInt *verLocInd,
MilanReal *edgeLocWeight)
{
MilanInt w = -1;
MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN
int finalK;
for (int k = adj1; k < adj2; k++) {
if ((edgeLocWeight[k] > heaviestEdgeWt) ||
((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) {
heaviestEdgeWt = edgeLocWeight[k];
w = verLocInd[k];
finalK = k;
}
} // End of for loop
return finalK;
}
/**
* //TODO documentation
* @param adj1
* @param adj2
* @param edgeLocWeight
* @param k
* @param verLocInd
* @param StartIndex
* @param EndIndex
* @param GMate
* @param Mate
* @param Ghost2LocalMap
* @return
*/
MilanLongInt computeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanReal *edgeLocWeight,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap)
{
// Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
MilanInt w = -1;
MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN
for (k = adj1; k < adj2; k++) {
if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap))
continue;
if ((edgeLocWeight[k] > heaviestEdgeWt) ||
((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k]))) {
heaviestEdgeWt = edgeLocWeight[k];
w = verLocInd[k];
}
} // End of for loop
// End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
return w;
}
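// A small worked trace of the selection rule above (hypothetical values,
// not taken from a real run): with adj1 = 0, adj2 = 3,
// edgeLocWeight = {2.0, 5.0, 5.0} and verLocInd = {7, 3, 9}, and assuming
// none of the three neighbours is already matched, the heaviest weight is
// 5.0 and the tie between vertices 3 and 9 is broken toward the larger
// vertex index, so the function returns w = 9.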

@ -80,9 +80,11 @@ class staticQueue
MilanLongInt squeueTail;
MilanLongInt NumNodes;
//FIXME I had to comment this piece of code in order to make everything work.
// why?
//Prevent Assignment and Pass by Value:
staticQueue(const staticQueue& src);
staticQueue& operator=(const staticQueue& rhs);
//staticQueue(const staticQueue& src);
//staticQueue& operator=(const staticQueue& rhs);
public:
//Constructors and Destructors

@ -0,0 +1,31 @@
#include "MatchBoxPC.h"
void extractUChunk(
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU)
{
UChunkBeingProcessed.clear();
#pragma omp critical(U)
{
if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U
{
while (!privateU.empty()) {
UChunkBeingProcessed.push_back(privateU.back());
privateU.pop_back();
}
}
else
{
for (int i = 0; i < UCHUNK; i++)
{ // Pop the new nodes
if (U.empty())
break;
UChunkBeingProcessed.push_back(U.back());
U.pop_back();
}
}
} // End of critical(U)
}
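// Usage sketch (UCHUNK is the chunk size defined in MatchBoxPC.h; the
// numbers here are hypothetical): if the shared queue U holds 25 vertices
// and UCHUNK is 10, each call moves at most 10 of them into
// UChunkBeingProcessed, so the critical(U) region is entered once per
// chunk rather than once per popped vertex; if U is empty, the caller's
// privateU is drained instead.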

@ -0,0 +1,29 @@
#include "MatchBoxPC.h"
/// Find the owner of a ghost node:
MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance,
MilanInt myRank, MilanInt numProcs)
{
MilanLongInt mStartInd = mVerDistance[myRank];
MilanInt Start = 0;
MilanInt End = numProcs;
MilanInt Current = 0;
while (Start <= End)
{
Current = (End + Start) / 2;
// CASE-1:
if (mVerDistance[Current] == vtxIndex) return Current;
else // CASE 2:
if (mVerDistance[Current] > vtxIndex)
End = Current - 1;
else // CASE 3:
Start = Current + 1;
} // End of While()
if (mVerDistance[Current] > vtxIndex)
return (Current - 1);
return Current;
} // End of findOwnerOfGhost()
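// Worked example (hypothetical distribution, not from the test suite):
// with numProcs = 3 and mVerDistance = {0, 4, 9, 12}, rank 0 owns
// vertices 0..3, rank 1 owns 4..8 and rank 2 owns 9..11, so
// findOwnerOfGhost(7, mVerDistance, myRank, 3) returns 1.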

@ -0,0 +1,304 @@
#include "MatchBoxPC.h"
void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt StartIndex, MilanLongInt EndIndex,
MilanLongInt *numGhostEdges,
MilanLongInt *numGhostVertices,
MilanLongInt *S,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &Counter,
vector<MilanLongInt> &verGhostPtr,
vector<MilanLongInt> &verGhostInd,
vector<MilanLongInt> &tempCounter,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Message,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
MilanLongInt *&candidateMate,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt insertMe = 0;
MilanLongInt adj1, adj2;
int i, v, k, w;
// index that starts with zero to |Vg| - 1
map<MilanLongInt, MilanLongInt>::iterator storedAlready;
#pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(NUM_THREAD)
{
#pragma omp single
{
#ifdef TIME_TRACKER
double Ghost2LocalInitialization = MPI_Wtime();
#endif
/*
* OMP Ghost2LocalInitialization
* This loop analyzes all the edges and, when it finds a ghost edge,
* puts it in the Ghost2LocalMap.
* A critical region is needed when inserting data into the map.
*
* Despite the critical region it is still productive to
* parallelize this loop, because the critical region is executed
* only when a ghost edge is found, and ghost edges are a minority,
* around 3.5% in the tests.
*/
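/*
* As an illustration (hypothetical ranks and indices): if this rank owns
* StartIndex..EndIndex = 4..8 and the edge list contains the endpoint 10,
* the first time 10 is seen it gets Ghost2LocalMap[10] = 0 and
* Counter[0] = 1 while numGhostVertices becomes 1; every further
* occurrence of 10 only increments Counter[0] and numGhostEdges.
*/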
#pragma omp task depend(out \
: *numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, *numGhostVertices)
{
#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \
: numGhostEdges[:1])
for (i = 0; i < NLEdge; i++)
{ // O(m) - Each edge stored twice
insertMe = verLocInd[i];
if ((insertMe < StartIndex) || (insertMe > EndIndex))
{ // Find a ghost
(*numGhostEdges)++;
#pragma omp critical
{
storedAlready = Ghost2LocalMap.find(insertMe);
if (storedAlready != Ghost2LocalMap.end())
{ // Has already been added
Counter[storedAlready->second]++; // Increment the counter
}
else
{ // Insert an entry for the ghost:
Ghost2LocalMap[insertMe] = *numGhostVertices; // Add a map entry
Counter.push_back(1); // Initialize the counter
(*numGhostVertices)++; // Increment the number of ghost vertices
} // End of else()
}
} // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) )
} // End of for(ghost vertices)
} // end of task depend
// *numGhostEdges = atomicNumGhostEdges;
#ifdef TIME_TRACKER
Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization;
fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")NGhosts:" << *numGhostVertices << " GhostEdges: " << *numGhostEdges;
if (!Ghost2LocalMap.empty())
{
cout << "\n(" << myRank << ")Final Map : on process ";
cout << "\n(" << myRank << ")Key \t Value \t Counter \n";
fflush(stdout);
storedAlready = Ghost2LocalMap.begin();
do
{
cout << storedAlready->second << " - " << storedAlready->first << " : " << Counter[storedAlready->second] << endl;
fflush(stdout);
storedAlready++;
} while (storedAlready != Ghost2LocalMap.end());
}
#endif
#pragma omp task depend(out \
: verGhostPtr, tempCounter, verGhostInd, GMate) depend(in \
: *numGhostVertices, *numGhostEdges)
{
// Initialize adjacency Lists for Ghost Vertices:
try
{
verGhostPtr.reserve(*numGhostVertices + 1); // Pointer Vector
tempCounter.reserve(*numGhostVertices); // Pointer Vector
verGhostInd.reserve(*numGhostEdges); // Index Vector
GMate.reserve(*numGhostVertices); // Ghost Mate Vector
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
// Initialize the Vectors:
verGhostPtr.resize(*numGhostVertices + 1, 0); // Pointer Vector
tempCounter.resize(*numGhostVertices, 0); // Temporary Counter
verGhostInd.resize(*numGhostEdges, -1); // Index Vector
GMate.resize(*numGhostVertices, -1); // Ghost Mate Vector
verGhostPtr[0] = 0; // The first value
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Ghost Vertex Pointer: ";
fflush(stdout);
#endif
} // End of task
#pragma omp task depend(out \
: verGhostPtr) depend(in \
: Counter, *numGhostVertices)
{
#ifdef TIME_TRACKER
double verGhostPtrInitialization = MPI_Wtime();
#endif
for (i = 0; i < *numGhostVertices; i++)
{ // O(|Ghost Vertices|)
verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i];
#ifdef PRINT_DEBUG_INFO_
cout << verGhostPtr[i] << "\t";
fflush(stdout);
#endif
}
#ifdef TIME_TRACKER
verGhostPtrInitialization = MPI_Wtime() - verGhostPtrInitialization;
fprintf(stderr, "verGhostPtrInitialization time: %f\n", verGhostPtrInitialization);
#endif
} // End of task
#ifdef PRINT_DEBUG_INFO_
if (*numGhostVertices > 0)
cout << verGhostPtr[*numGhostVertices] << "\n";
fflush(stdout);
#endif
#ifdef TIME_TRACKER
double verGhostIndInitialization = MPI_Wtime();
#endif
/*
* OMP verGhostIndInitialization
*
* In this loop verGhostInd is initialized
* with the data related to ghost edges.
* The check to see whether a node is a ghost node is
* executed in parallel, and when a ghost node
* is found a critical region is entered.
*
* Despite the critical region it is still useful to
* parallelize the loop, because ghost nodes
* are a minority, hence the critical region is executed
* only a few times, around 3.5% of the time in the tests.
*/
#pragma omp task depend(in \
: insertMe, Ghost2LocalMap, tempCounter, verGhostPtr) depend(out \
: verGhostInd)
{
#pragma omp taskloop num_tasks(NUM_THREAD)
for (v = 0; v < NLVer; v++)
{
adj1 = verLocPtr[v]; // Vertex Pointer
adj2 = verLocPtr[v + 1];
for (k = adj1; k < adj2; k++)
{
w = verLocInd[k]; // Get the adjacent vertex
if ((w < StartIndex) || (w > EndIndex))
{ // Find a ghost
#pragma omp critical
{
insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; // Where to insert
tempCounter[Ghost2LocalMap[w]]++; // Increment the counter
}
verGhostInd[insertMe] = v + StartIndex; // Add the adjacency
} // End of if((w < StartIndex) || (w > EndIndex))
} // End of for(k)
} // End of for (v)
} // end of taskloop
#ifdef TIME_TRACKER
verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization;
fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Ghost Vertex Index: ";
for (v = 0; v < *numGhostEdges; v++)
cout << verGhostInd[v] << "\t";
cout << endl;
fflush(stdout);
#endif
#pragma omp task depend(in \
: *numGhostEdges) depend(out \
: QLocalVtx, QGhostVtx, QMsgType, QOwner)
{
try
{
QLocalVtx.reserve(*numGhostEdges); // Local Vertex
QGhostVtx.reserve(*numGhostEdges); // Ghost Vertex
QMsgType.reserve(*numGhostEdges); // Message Type (Request/Failure)
QOwner.reserve(*numGhostEdges); // Owner of the ghost: compute once and use later
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
} // end of task
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Allocating CandidateMate.. ";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl;
fflush(stdout);
#endif
#pragma omp task depend(in \
: *numGhostVertices) depend(out \
: candidateMate, S, U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner)
{
// Allocate Data Structures:
/*
* candidateMate was a vector and has been replaced with an array;
* there is no clear benefit in using a vector here (or maybe there is?),
* so it was replaced with a plain array, which is slightly faster.
*/
candidateMate = new MilanLongInt[NLVer + (*numGhostVertices)];
*S = (*numGhostVertices); // Initialize S with number of Ghost Vertices
/*
* Create the Queue Data Structure for the Dominating Set
*
* I had to declare the staticQueue U before the parallel region
* to have it in the correct scope. Since we can't change the dimension
* of a staticQueue I had to destroy the previous object and instantiate
* a new one of the correct size.
*/
//new (&U) staticQueue(NLVer + (*numGhostVertices));
U.reserve(NLVer + (*numGhostVertices));
// Initialize the private vectors
privateQLocalVtx.reserve(*numGhostVertices);
privateQGhostVtx.reserve(*numGhostVertices);
privateQMsgType.reserve(*numGhostVertices);
privateQOwner.reserve(*numGhostVertices);
privateU.reserve(*numGhostVertices);
} // end of task
} // End of single region
} // End of parallel region
}

@ -0,0 +1,46 @@
#include "MatchBoxPC.h"
/**
* //TODO documentation
* @param k
* @param verLocInd
* @param StartIndex
* @param EndIndex
* @param GMate
* @param Mate
* @param Ghost2LocalMap
* @return
*/
bool isAlreadyMatched(MilanLongInt node,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap)
{
/*
#pragma omp critical(Mate)
{
if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex?
result = GMate[Ghost2LocalMap[node]] >= 0;// Already matched
} else { //A local vertex
result = (Mate[node - StartIndex] >= 0); // Already matched
}
}
*/
MilanLongInt val;
if ((node < StartIndex) || (node > EndIndex)) // if ghost vertex
{
#pragma omp atomic read
val = GMate[Ghost2LocalMap[node]];
return val >= 0; // Already matched
}
// If not ghost vertex
#pragma omp atomic read
val = Mate[node - StartIndex];
return val >= 0; // Already matched
}
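// Illustration (hypothetical indices): with StartIndex = 100 and
// EndIndex = 199, isAlreadyMatched(150, ...) atomically reads
// Mate[150 - 100] = Mate[50], while isAlreadyMatched(250, ...) reads
// GMate[Ghost2LocalMap[250]]; in both cases a non-negative value means
// the vertex is already matched.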

@ -0,0 +1,27 @@
#include "MatchBoxPC.h"
void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanInt myRank,
MilanReal *edgeLocWeight,
MilanLongInt *candidateMate)
{
MilanLongInt v = -1;
#pragma omp parallel private(v) default(shared) num_threads(NUM_THREAD)
{
#pragma omp for schedule(static)
for (v = 0; v < NLVer; v++) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl;
fflush(stdout);
#endif
// Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
candidateMate[v] = firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight);
// End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
}
}
}

@ -0,0 +1,24 @@
#include "MatchBoxPC.h"
void PROCESS_CROSS_EDGE(MilanLongInt *edge,
MilanLongInt *S)
{
// Start: PARALLEL_PROCESS_CROSS_EDGE_B
MilanLongInt captureCounter;
#pragma omp atomic capture
captureCounter = --(*edge); // Decrement
//assert(captureCounter >= 0);
if (captureCounter == 0)
#pragma omp atomic
(*S)--; // Decrement S
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages";
fflush(stdout);
#endif
// End: PARALLEL_PROCESS_CROSS_EDGE_B
}
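// Illustration (hypothetical counts): if Counter[Ghost2LocalMap[w]] was 3,
// three calls to PROCESS_CROSS_EDGE for that ghost are needed before the
// captured value reaches 0 and S (the number of ghost vertices still
// awaiting messages) is decremented.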

@ -0,0 +1,195 @@
#include "MatchBoxPC.h"
void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
MilanLongInt *candidateMate,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *Mate,
vector<MilanLongInt> &GMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *S,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0;
MilanInt ghostOwner = 0, option, igw;
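// Reading the switch at the bottom of the loop below: option stays -1 when
// there is nothing to enqueue; 1 means v matched a ghost w whose candidate
// was v (and, falling through to case 2, a REQUEST is also sent); 2 means
// w is a ghost and only a REQUEST is sent; 3 means v and w matched locally.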
#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \
firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) \
default(shared) num_threads(NUM_THREAD)
{
#pragma omp for reduction(+ \
: PCounter[:numProcs], myCard \
[:1], msgInd \
[:1], NumMessagesBundled \
[:1]) \
schedule(static)
for (v = 0; v < NLVer; v++) {
option = -1;
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
k = candidateMate[v];
candidateMate[v] = verLocInd[k];
w = candidateMate[v];
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v + StartIndex << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0)
{
#pragma omp critical(processExposed)
{
if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap)) {
w = computeCandidateMate(verLocPtr[v],
verLocPtr[v + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v] = w;
}
if (w >= 0) {
(*myCard)++;
if ((w < StartIndex) || (w > EndIndex)) { // w is a ghost vertex
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex) {
option = 1;
Mate[v] = w;
GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == (v + StartIndex)) {
option = 3;
Mate[v] = w; // v is local
Mate[w - StartIndex] = v + StartIndex; // w is local
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") ";
fflush(stdout);
#endif
} // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) )
} // End of Else
} // End of second if
} // End critical processExposed
} // End of if(w >=0)
else {
// This piece of code is executed only rarely
adj11 = verLocPtr[v];
adj12 = verLocPtr[v + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
(*msgInd)++;
(*NumMessagesBundled)++;
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
privateQLocalVtx.push_back(v + StartIndex);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
}
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
switch (option)
{
case -1:
break;
case 1:
privateU.push_back(v + StartIndex);
privateU.push_back(w);
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S);
case 2:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message (291):";
cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
(*msgInd)++;
(*NumMessagesBundled)++;
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
privateQLocalVtx.push_back(v + StartIndex);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
default:
privateU.push_back(v + StartIndex);
privateU.push_back(w);
break;
}
} // End of for ( v=0; v < NLVer; v++ )
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
} // End of parallel region
}

@ -0,0 +1,294 @@
#include "MatchBoxPC.h"
void processMatchedVertices(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner;
int option;
MilanLongInt mateVal;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef COUNT_LOCAL_VERTEX
MilanLongInt localVertices = 0;
#endif
//#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, \
privateQMsgType, privateQOwner, UChunkBeingProcessed) \
default(shared) num_threads(NUM_THREAD) \
reduction(+ \
: msgInd[:1], PCounter \
[:numProcs], myCard \
[:1], NumMessagesBundled \
[:1])
{
while (!U.empty()) {
extractUChunk(UChunkBeingProcessed, U, privateU);
for (MilanLongInt u : UChunkBeingProcessed) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")u: " << u;
fflush(stdout);
#endif
if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices
#ifdef COUNT_LOCAL_VERTEX
localVertices++;
#endif
// Get the Adjacency list for u
adj1 = verLocPtr[u - StartIndex]; // Pointer
adj2 = verLocPtr[u - StartIndex + 1];
for (k = adj1; k < adj2; k++) {
option = -1;
v = verLocInd[k];
if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v];
fflush(stdout);
#endif
#pragma omp atomic read
mateVal = Mate[v - StartIndex];
// If the current vertex is pointing to a matched vertex and is not matched
if (mateVal < 0) {
#pragma omp critical
{
if (candidateMate[v - StartIndex] == u) {
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex],
verLocPtr[v - StartIndex + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0) {
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message:";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
#endif
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
option = 1;
Mate[v - StartIndex] = w; // v is a local vertex
GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == v) {
option = 3;
Mate[v - StartIndex] = w; // v is a local vertex
Mate[w - StartIndex] = v; // w is a local vertex
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else
option = 4; // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of If (candidateMate[v-StartIndex] == u
} // End of task
} // mateval < 0
} // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex:
else { // Neighbor is a ghost vertex
#pragma omp critical
{
if (candidateMate[NLVer + Ghost2LocalMap[v]] == u)
candidateMate[NLVer + Ghost2LocalMap[v]] = -1;
if (v != Mate[u - StartIndex])
option = 5; // u is local
} // End of critical
} // End of Else //A Ghost Vertex
switch (option)
{
case -1:
// Nothing to do
break;
case 1:
// Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr);
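// Note: no break here -- execution falls through into case 2 so that the
// REQUEST message for the ghost owner is still queued.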
case 2:
// Found a dominating edge, it is a ghost
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
(*NumMessagesBundled)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
break;
case 4:
// Could not find a dominating vertex
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
(*NumMessagesBundled)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
break;
case 5:
default:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a success message: ";
cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n";
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
(*NumMessagesBundled)++;
PCounter[ghostOwner]++;
(*msgInd)++;
privateQLocalVtx.push_back(u);
privateQGhostVtx.push_back(v);
privateQMsgType.push_back(SUCCESS);
privateQOwner.push_back(ghostOwner);
break;
} // End of switch
} // End of inner for
}
} // End of outer for
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
#pragma omp critical(U)
{
U.insert(U.end(), privateU.begin(), privateU.end());
}
privateU.clear();
#pragma omp critical(sendMessageTransfer)
{
QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end());
QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end());
QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end());
QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end());
}
privateQLocalVtx.clear();
privateQGhostVtx.clear();
privateQMsgType.clear();
privateQOwner.clear();
} // End of while ( !U.empty() )
#ifdef COUNT_LOCAL_VERTEX
printf("Count local vertexes: %ld for thread %d of processor %d\n",
localVertices,
omp_get_thread_num(),
myRank);
#endif
} // End of parallel region
}

@ -0,0 +1,308 @@
#include "MatchBoxPC.h"
//#define DEBUG_HANG_
void processMatchedVerticesAndSendMessages(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner,
MPI_Comm comm,
MilanLongInt *msgActual,
vector<MilanLongInt> &Message)
{
MilanLongInt initialSize = QLocalVtx.size();
MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner;
int option;
MilanLongInt mateVal;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef COUNT_LOCAL_VERTEX
MilanLongInt localVertices = 0;
#endif
//#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx,\
privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \
num_threads(NUM_THREAD) \
reduction(+ \
: msgInd[:1], PCounter \
[:numProcs], myCard \
[:1], NumMessagesBundled \
[:1], msgActual \
[:1])
{
while (!U.empty()) {
extractUChunk(UChunkBeingProcessed, U, privateU);
for (MilanLongInt u : UChunkBeingProcessed) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")u: " << u;
fflush(stdout);
#endif
if ((u >= StartIndex) && (u <= EndIndex)) { // Process Only the Local Vertices
#ifdef COUNT_LOCAL_VERTEX
localVertices++;
#endif
// Get the Adjacency list for u
adj1 = verLocPtr[u - StartIndex]; // Pointer
adj2 = verLocPtr[u - StartIndex + 1];
for (k = adj1; k < adj2; k++) {
option = -1;
v = verLocInd[k];
if ((v >= StartIndex) && (v <= EndIndex)) { // If Local Vertex:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v];
fflush(stdout);
#endif
#pragma omp atomic read
mateVal = Mate[v - StartIndex];
// If the current vertex is pointing to a matched vertex and is not matched
if (mateVal < 0) {
#pragma omp critical
{
if (candidateMate[v - StartIndex] == u) {
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex],
verLocPtr[v - StartIndex + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0) {
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message:";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
#endif
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
option = 1;
Mate[v - StartIndex] = w; // v is a local vertex
GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == v) {
option = 3;
Mate[v - StartIndex] = w; // v is a local vertex
Mate[w - StartIndex] = v; // w is a local vertex
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else
option = 4; // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of If (candidateMate[v-StartIndex] == u
} // End of task
} // mateval < 0
} // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex:
else { // Neighbor is a ghost vertex
#pragma omp critical
{
if (candidateMate[NLVer + Ghost2LocalMap[v]] == u)
candidateMate[NLVer + Ghost2LocalMap[v]] = -1;
if (v != Mate[u - StartIndex])
option = 5; // u is local
} // End of critical
} // End of Else //A Ghost Vertex
switch (option)
{
case -1:
// Nothing to do
break;
case 1:
// Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr);
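// Note: no break here -- execution falls through into case 2 so that the
// REQUEST message for the ghost owner is still queued.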
case 2:
// Found a dominating edge, it is a ghost
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = v; // LOCAL
// Message[1] = w; // GHOST
// Message[2] = REQUEST; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
break;
case 4:
// Could not find a dominating vertex
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) { // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = v; // LOCAL
// Message[1] = w; // GHOST
// Message[2] = FAILURE; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
break;
case 5:
default:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a success message: ";
cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n";
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = u; // LOCAL
// Message[1] = v; // GHOST
// Message[2] = SUCCESS; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(u);
privateQGhostVtx.push_back(v);
privateQMsgType.push_back(SUCCESS);
privateQOwner.push_back(ghostOwner);
break;
} // End of switch
} // End of inner for
}
} // End of outer for
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
} // End of while ( !U.empty() )
#ifdef COUNT_LOCAL_VERTEX
printf("Count local vertexes: %ld for thread %d of processor %d\n",
localVertices,
omp_get_thread_num(),
myRank);
#endif
} // End of parallel region
// Send the messages
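// Each queued entry is sent as a 3-integer packet {local vertex, ghost vertex, message type}
// to its owning process via MPI_Bsend.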
#ifdef DEBUG_HANG_
cout << myRank<<" Sending: "<<QOwner.size()-initialSize<<" messages" <<endl;
#endif
for (int i = initialSize; i < QOwner.size(); i++) {
Message[0] = QLocalVtx[i];
Message[1] = QGhostVtx[i];
Message[2] = QMsgType[i];
ghostOwner = QOwner[i];
//MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
//cout << myRank<<" Sending to "<<ghostOwner<<endl;
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
}
#ifdef DEBUG_HANG_
cout << myRank<<" Done sending messages"<<endl;
#endif
}

@ -0,0 +1,315 @@
#include "MatchBoxPC.h"
//#define DEBUG_HANG_
void processMessages(
MilanLongInt NLVer,
MilanLongInt *Mate,
MilanLongInt *candidateMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Counter,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *msgActual,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *verLocPtr,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &Message,
MilanLongInt numGhostEdges,
MilanLongInt u,
MilanLongInt v,
MilanLongInt *S,
vector<MilanLongInt> &U)
{
//#define PRINT_DEBUG_INFO_
MilanInt Sender;
MPI_Status computeStatus;
MilanLongInt bundleSize, w;
MilanLongInt adj11, adj12, k1;
MilanLongInt ghostOwner;
int error_codeC;
error_codeC = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
char error_message[MPI_MAX_ERROR_STRING];
int message_length;
MilanLongInt message_type = 0;
// Buffer to receive bundled messages
// Maximum messages that can be received from any processor is
// twice the edge cut: REQUEST; REQUEST+(FAILURE/SUCCESS)
vector<MilanLongInt> ReceiveBuffer;
try
{
ReceiveBuffer.reserve(numGhostEdges * 2 * 3); // Three integers per cross edge
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
#ifdef PRINT_DEBUG_INFO_
cout
<< "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")About to begin Message processing phase ... *S=" << *S << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
// BLOCKING RECEIVE:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << " Waiting for blocking receive..." << endl;
fflush(stdout);
fflush(stdout);
#endif
//cout << myRank<<" Receiving ...";
error_codeC = MPI_Recv(&Message[0], 3, TypeMap<MilanLongInt>(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus);
if (error_codeC != MPI_SUCCESS)
{
MPI_Error_string(error_codeC, error_message, &message_length);
cout << "\n*Error in call to MPI_Receive on Slave: " << error_message << "\n";
fflush(stdout);
}
Sender = computeStatus.MPI_SOURCE;
//cout << " ...from "<<Sender << endl;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received message from Process " << Sender << " Type= " << Message[2] << endl;
fflush(stdout);
#endif
if (Message[2] == SIZEINFO) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl;
fflush(stdout);
#endif
bundleSize = Message[0]; //#of integers in the message
// Build the Message Buffer:
if (!ReceiveBuffer.empty())
ReceiveBuffer.clear(); // Empty it out first
ReceiveBuffer.resize(bundleSize, -1); // Initialize
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message Bundle Before: " << endl;
for (int i = 0; i < bundleSize; i++)
cout << ReceiveBuffer[i] << ",";
cout << endl;
fflush(stdout);
#endif
// Receive the message
//cout << myRank<<" Receiving from "<<Sender<<endl;
error_codeC = MPI_Recv(&ReceiveBuffer[0], bundleSize, TypeMap<MilanLongInt>(), Sender, BundleTag, comm, &computeStatus);
if (error_codeC != MPI_SUCCESS) {
MPI_Error_string(error_codeC, error_message, &message_length);
cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n";
fflush(stdout);
}
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message Bundle After: " << endl;
for (int i = 0; i < bundleSize; i++)
cout << ReceiveBuffer[i] << ",";
cout << endl;
fflush(stdout);
#endif
} else { // Just a single message:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl;
fflush(stdout);
#endif
// Add the current message to Queue:
bundleSize = 3; //#of integers in the message
// Build the Message Buffer:
if (!ReceiveBuffer.empty())
ReceiveBuffer.clear(); // Empty it out first
ReceiveBuffer.resize(bundleSize, -1); // Initialize
ReceiveBuffer[0] = Message[0]; // u
ReceiveBuffer[1] = Message[1]; // v
ReceiveBuffer[2] = Message[2]; // message_type
}
#ifdef DEBUG_GHOST_
if ((v < StartIndex) || (v > EndIndex)) {
cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl;
fflush(stdout);
}
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing message: u= " << u << " v= " << v << " Type= " << message_type << endl;
fflush(stdout);
#endif
// Most of the time bundleSize == 3, so it is not worth parallelizing this loop
for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3) {
u = ReceiveBuffer[bundleCounter - 3]; // GHOST
v = ReceiveBuffer[bundleCounter - 2]; // LOCAL
message_type = ReceiveBuffer[bundleCounter - 1]; // TYPE
// CASE I: REQUEST
if (message_type == REQUEST) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is REQUEST" << endl;
fflush(stdout);
#endif
#ifdef DEBUG_GHOST_
if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) {
cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl;
fflush(stdout);
}
#endif
if (Mate[v - StartIndex] == -1) {
// Process only if not already matched (v is local)
candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost
if (candidateMate[v - StartIndex] == u) {
GMate[Ghost2LocalMap[u]] = v; // u is ghost
Mate[v - StartIndex] = u; // v is local
U.push_back(v);
U.push_back(u);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl;
fflush(stdout);
#endif
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S);
} // End of if ( candidateMate[v-StartIndex] == u )
} // End of if ( Mate[v] == -1 )
} // End of REQUEST
else { // CASE II: SUCCESS
if (message_type == SUCCESS) {
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is SUCCESS" << endl;
fflush(stdout);
#endif
GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a dummy Mate so that u (a ghost) is not processed again
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S);
#ifdef DEBUG_GHOST_
if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer)) {
cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl;
fflush(stdout);
}
#endif
if (Mate[v - StartIndex] == -1) {
// Process only if not already matched ( v is local)
if (candidateMate[v - StartIndex] == u) {
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k,
verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0) {
if ((w < StartIndex) || (w > EndIndex)) {
// w is a ghost
// Build the Message Packet:
Message[0] = v; // LOCAL
Message[1] = w; // GHOST
Message[2] = REQUEST; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
//assert(ghostOwner != -1);
//assert(ghostOwner != myRank);
//cout << myRank<<" Sending to "<<ghostOwner<<endl;
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgInd)++;
(*msgActual)++;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v) {
Mate[v - StartIndex] = w; // v is local
GMate[Ghost2LocalMap[w]] = v; // w is ghost
U.push_back(v);
U.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl;
fflush(stdout);
#endif
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S);
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else { // w is a local vertex
if (candidateMate[w - StartIndex] == v) {
Mate[v - StartIndex] = w; // v is local
Mate[w - StartIndex] = v; // w is local
// Q.push_back(u);
U.push_back(v);
U.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl;
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else { // No dominant edge found
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++) {
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex)) {
// A ghost
// Build the Message Packet:
Message[0] = v; // LOCAL
Message[1] = w; // GHOST
Message[2] = FAILURE; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
//assert(ghostOwner != -1);
//assert(ghostOwner != myRank);
//cout << myRank<<" Sending to "<<ghostOwner<<endl;
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgInd)++;
(*msgActual)++;
} // End of if(GHOST)
} // End of for loop
} // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of if ( candidateMate[v-StartIndex] == u )
} // End of if ( Mate[v] == -1 )
} // End of if ( message_type == SUCCESS )
else {
// CASE III: FAILURE
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is FAILURE" << endl;
fflush(stdout);
#endif
GMate[Ghost2LocalMap[u]] = EndIndex + 1; // Set a dummy Mate so that u (a ghost) is not processed anymore
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter
} // End of else: CASE III
} // End of else: CASE I
}
return;
}

@ -0,0 +1,36 @@
#include "MatchBoxPC.h"
void queuesTransfer(vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
#pragma omp critical(U)
{
U.insert(U.end(), privateU.begin(), privateU.end());
}
privateU.clear();
#pragma omp critical(sendMessageTransfer)
{
QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end());
QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end());
QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end());
QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end());
}
privateQLocalVtx.clear();
privateQGhostVtx.clear();
privateQMsgType.clear();
privateQOwner.clear();
}

@ -0,0 +1,209 @@
#include "MatchBoxPC.h"
void sendBundledMessages(MilanLongInt *numGhostEdges,
MilanInt *BufferSize,
MilanLongInt *Buffer,
vector<MilanLongInt> &PCumulative,
vector<MilanLongInt> &PMessageBundle,
vector<MilanLongInt> &PSizeInfoMessages,
MilanLongInt *PCounter,
MilanLongInt NumMessagesBundled,
MilanLongInt *msgActual,
MilanLongInt *msgInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus)
{
MilanLongInt myIndex = 0, numMessagesToSend;
MilanInt i = 0, OneMessageSize = 0;
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Send Bundles" << endl;
fflush(stdout);
#endif
#pragma omp parallel private(i) default(shared) num_threads(NUM_THREAD)
{
#pragma omp master
{
// Data structures for Bundled Messages:
#pragma omp task depend(inout \
: PCumulative, PMessageBundle, PSizeInfoMessages) depend(in \
: NumMessagesBundled, numProcs)
{
try {
PMessageBundle.reserve(NumMessagesBundled * 3); // Three integers per message
PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure
PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize
PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable
PSizeInfoMessages.resize(numProcs * 3, 0);
}
#pragma omp task depend(inout \
: PCumulative) depend(in \
: PCounter)
{
for (i = 0; i < numProcs; i++)
PCumulative[i + 1] = PCumulative[i] + PCounter[i];
}
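// PCumulative now holds an exclusive prefix sum of PCounter: PCumulative[p] is the
// offset (in messages) of process p's slice of PMessageBundle.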
#pragma omp task depend(inout \
: PCounter)
{
// Reuse PCounter to keep track of how many messages were inserted:
for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer, addresses need to be integers!
PCounter[i] = 0;
}
// Build the Message Bundle packet:
#pragma omp task depend(in \
: PCounter, QLocalVtx, QGhostVtx, QMsgType, QOwner, PMessageBundle, PCumulative) depend(out \
: myIndex, PMessageBundle, PCounter)
{
for (i = 0; i < NumMessagesBundled; i++) {
myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3;
PMessageBundle[myIndex + 0] = QLocalVtx[i];
PMessageBundle[myIndex + 1] = QGhostVtx[i];
PMessageBundle[myIndex + 2] = QMsgType[i];
PCounter[QOwner[i]]++;
}
}
// Send the Bundled Messages: Use ISend
#pragma omp task depend(out \
: SRequest, SStatus)
{
try
{
SRequest.reserve(numProcs * 2); // At most two messages per processor
SStatus.reserve(numProcs * 2); // At most two messages per processor
}
catch (length_error)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
}
// Send the Messages
#pragma omp task depend(inout \
: SRequest, PSizeInfoMessages, PCumulative) depend(out \
: *msgActual, *msgInd)
{
for (i = 0; i < numProcs; i++) { // Changed by Fabio to be an integer, addresses need to be integers!
if (i == myRank) // Do not send anything to yourself
continue;
// Send the Message with information about the size of next message:
// Build the Message Packet:
PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message
PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet
PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl;
fflush(stdout);
#endif
if (PSizeInfoMessages[i * 3 + 0] > 0)
{ // Send only if it is a nonempty packet
MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap<MilanLongInt>(), i, ComputeTag, comm,
&SRequest[(*msgInd)]);
(*msgActual)++;
(*msgInd)++;
// Now Send the message with the data packet:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl;
for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++)
cout << PMessageBundle[k] << ",";
cout << endl;
fflush(stdout);
#endif
MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0],
TypeMap<MilanLongInt>(), i, BundleTag, comm, &SRequest[(*msgInd)]);
(*msgInd)++;
} // End of if size > 0
}
}
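// At most two MPI_Isend calls are issued per destination (the size-info packet plus the
// data bundle), matching the SRequest/SStatus reservation of numProcs * 2 above.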
#pragma omp task depend(inout \
: PCumulative, QLocalVtx, QGhostVtx, QMsgType, QOwner)
{
// Free up temporary memory:
PCumulative.clear();
QLocalVtx.clear();
QGhostVtx.clear();
QMsgType.clear();
QOwner.clear();
}
#pragma omp task depend(inout : OneMessageSize, *BufferSize) depend(out : numMessagesToSend) depend(in : *numGhostEdges)
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges;
cout << "\n(" << myRank << ")Total number of potential message X 2 = " << *numGhostEdges * 2;
cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled;
if (*numGhostEdges > 0)
{
cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(*numGhostEdges * 2)) * 100.0 << "% \n";
}
fflush(stdout);
#endif
// Allocate memory for MPI Send messages:
/* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! */
OneMessageSize = 0;
MPI_Pack_size(3, TypeMap<MilanLongInt>(), comm, &OneMessageSize); // Size of one message packet
// How many messages to send?
// Potentially three kinds of messages will be sent/received:
// Request, Success, Failure.
// But only two will be sent from a given processor.
// Subtract the number of messages that have already been sent as bundled messages:
numMessagesToSend = (*numGhostEdges) * 2 - NumMessagesBundled;
*BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend;
}
#pragma omp task depend(out : Buffer) depend(in : *BufferSize)
{
Buffer = 0;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize;
cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD;
cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges;
cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend;
cout << "\n(" << myRank << ")BufferSize = " << (*BufferSize);
cout << "\n(" << myRank << ")Attaching Buffer on.. ";
fflush(stdout);
#endif
if ((*BufferSize) > 0)
{
Buffer = (MilanLongInt *)malloc((*BufferSize)); // Allocate memory
if (Buffer == 0)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n";
cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n";
exit(1);
}
MPI_Buffer_attach(Buffer, *BufferSize); // Attach the Buffer
}
}
}
}
}
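Aside (illustrative sketch, not part of the library): the buffer sizing above follows the standard MPI buffered-send idiom, one MPI_Pack_size packet plus MPI_BSEND_OVERHEAD per outstanding message. A minimal self-contained version is given below; the helper name attachAndUseBsendBuffer and the message count are hypothetical, and MPI_LONG stands in for TypeMap<MilanLongInt>().
#include <cstdlib>
#include <mpi.h>
// Attach a buffered-send buffer sized for nMessages three-integer packets.
void attachAndUseBsendBuffer(MPI_Comm comm, int nMessages)
{
    int packetSize = 0;
    MPI_Pack_size(3, MPI_LONG, comm, &packetSize);           // bytes needed for one 3-integer packet
    int bufferSize = (packetSize + MPI_BSEND_OVERHEAD) * nMessages;
    void *buffer = std::malloc(bufferSize);
    MPI_Buffer_attach(buffer, bufferSize);
    // ... MPI_Bsend(&msg[0], 3, MPI_LONG, dest, tag, comm) calls go here ...
    MPI_Buffer_detach(&buffer, &bufferSize);                 // blocks until all buffered sends complete
    std::free(buffer);
}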

@ -109,6 +109,8 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_cspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_c_onelev_mat_asb'
call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_)
!
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999
@ -151,14 +160,17 @@ subroutine amg_c_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol
!
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999

@ -109,6 +109,8 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_dspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_d_onelev_mat_asb'
call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_)
!
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999
@ -151,14 +160,17 @@ subroutine amg_d_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol
!
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999

@ -109,6 +109,8 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_sspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_s_onelev_mat_asb'
call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_)
!
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999
@ -151,14 +160,17 @@ subroutine amg_s_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol
!
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999

@ -109,6 +109,8 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
type(psb_zspmat_type) :: ac, op_restr, op_prol
integer(psb_ipk_) :: nzl, inl
integer(psb_ipk_) :: debug_level, debug_unit
integer(psb_ipk_), save :: idx_matbld=-1, idx_matasb=-1, idx_mapbld=-1
logical, parameter :: do_timings=.false.
name='amg_z_onelev_mat_asb'
call psb_erractionsave(err_act)
@ -120,6 +122,12 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
info = psb_success_
ctxt = desc_a%get_context()
call psb_info(ctxt,me,np)
if ((do_timings).and.(idx_matbld==-1)) &
& idx_matbld = psb_get_timer_idx("LEV_MASB: mat_bld")
if ((do_timings).and.(idx_matasb==-1)) &
& idx_matasb = psb_get_timer_idx("LEV_MASB: mat_asb")
if ((do_timings).and.(idx_mapbld==-1)) &
& idx_mapbld = psb_get_timer_idx("LEV_MASB: map_bld")
call amg_check_def(lv%parms%aggr_prol,'Smoother',&
& amg_smooth_prol_,is_legal_ml_aggr_prol)
@ -139,9 +147,10 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! the mapping defined by amg_aggrmap_bld and applying the aggregation
! algorithm specified by lv%iprcparm(amg_aggr_prol_)
!
if (do_timings) call psb_tic(idx_matbld)
call lv%aggr%mat_bld(lv%parms,a,desc_a,ilaggr,nlaggr,&
& lv%ac,lv%desc_ac,op_prol,op_restr,t_prol,info)
if (do_timings) call psb_toc(idx_matbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_aggrmat_asb')
goto 9999
@ -151,14 +160,17 @@ subroutine amg_z_base_onelev_mat_asb(lv,a,desc_a,ilaggr,nlaggr,t_prol,info)
! Now build its descriptor and convert global indices for
! ac, op_restr and op_prol
!
if (do_timings) call psb_tic(idx_matasb)
if (info == psb_success_) &
& call lv%aggr%mat_asb(lv%parms,a,desc_a,&
& lv%ac,lv%desc_ac,op_prol,op_restr,info)
if (do_timings) call psb_toc(idx_matasb)
if (do_timings) call psb_tic(idx_mapbld)
if (info == psb_success_) call lv%ac%cscnv(info,type='csr',dupl=psb_dupl_add_)
if (info == psb_success_) call lv%aggr%bld_map(desc_a, lv%desc_ac,&
& ilaggr,nlaggr,op_restr,op_prol,lv%linmap,info)
if (do_timings) call psb_toc(idx_mapbld)
if(info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='mat_asb/map_bld')
goto 9999

@ -56,6 +56,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_c_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else

@ -56,6 +56,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='c_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_c_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else
info = psb_err_missing_override_method_

@ -56,6 +56,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_d_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else

@ -56,6 +56,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_d_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else
info = psb_err_missing_override_method_

@ -56,6 +56,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_s_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else

@ -56,6 +56,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='s_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_s_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else
info = psb_err_missing_override_method_

@ -56,6 +56,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='d_bwgs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("BWGS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,7 +81,10 @@ subroutine amg_z_bwgs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in bwgs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=-ione,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
else

@ -56,6 +56,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: np, me, i, err_act, debug_unit, debug_level
character(len=20) :: name='z_gs_solver_bld', ch_err
integer(psb_ipk_), save :: idx_tril=-1
logical, parameter :: do_timings=.true.
info=psb_success_
call psb_erractionsave(err_act)
@ -65,6 +67,8 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
call psb_info(ctxt, me, np)
if (debug_level >= psb_debug_outer_) &
& write(debug_unit,*) me,' ',trim(name),' start'
if ((do_timings).and.(idx_tril==-1)) &
& idx_tril = psb_get_timer_idx("GS_BLD: tril")
n_row = desc_a%get_local_rows()
@ -77,8 +81,11 @@ subroutine amg_z_gs_solver_bld(a,desc_a,sv,info,b,amold,vmold,imold)
! This cuts out the off-diagonal part, because it's supposed to
! be handled by the outer Jacobi smoother.
!
!write(0,*) 'Calling A%TRIL in gs_solver_bld'
if (do_timings) call psb_tic(idx_tril)
call a%tril(sv%l,info,diag=izero,jmax=nrow_a,u=sv%u)
if (do_timings) call psb_toc(idx_tril)
!write(0,*) 'From A%TRIL in gs_solver_bld',a%get_nzeros(),sv%l%get_nzeros(),sv%u%get_nzeros()
else
info = psb_err_missing_override_method_

@ -0,0 +1,25 @@
cd amgprec/impl/aggregator/
rm MatchBoxPC.o
rm sendBundledMessages.o
rm initialize.o
rm extractUChunk.o
rm isAlreadyMatched.o
rm findOwnerOfGhost.o
rm computeCandidateMate.o
rm parallelComputeCandidateMateB.o
rm processMatchedVertices.o
rm processCrossEdge.o
rm queueTransfer.o
rm processMessages.o
rm processExposedVertex.o
rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o
rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o
cd ../../../
make all
cd samples/advanced/pdegen
make amg_d_pde3d
cd runs
mpirun -np 4 amg_d_pde3d amg_pde3d.inp
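The script above is a quick rebuild-and-run helper: it removes the MatchBox object files so they are recompiled, rebuilds the library with make all, rebuilds the amg_d_pde3d sample and launches it on 4 MPI ranks with the amg_pde3d.inp input file.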

@ -93,6 +93,9 @@ contains
& a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod
use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
!
! Discretizes the partial differential equation
!
@ -128,7 +131,6 @@ contains
type(psb_d_csc_sparse_mat) :: acsc
type(psb_d_coo_sparse_mat) :: acoo
type(psb_d_csr_sparse_mat) :: acsr
real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -141,8 +143,7 @@ contains
! Process grid
integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:)
real(psb_dpk_), allocatable :: val(:)
integer(psb_lpk_), allocatable :: myidx(:)
! deltah dimension of each grid cell
! deltat discretization time
real(psb_dpk_) :: deltah, sqdeltah, deltah2
@ -368,119 +369,128 @@ contains
call psb_barrier(ctxt)
talc = psb_wtime()-t0
if (info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='allocation rout.'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
! we build an auxiliary matrix consisting of one row at a
! time; just a small matrix. might be extended to generate
! a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! loop over rows belonging to current process in a block
! distribution.
call psb_barrier(ctxt)
t1 = psb_wtime()
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! x, y, z coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
z = (iz-1)*deltah
zt(k) = f_(x,y,z)
! internal point: build discretization
!
! term depending on (x-1,y,z)
!
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
if (ix == 1) then
zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1,z)
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
if (iy == 1) then
zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z-1)
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
if (iz == 1) then
zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z)
val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
& + c(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
!$omp parallel shared(deltah,myidx,a,desc_a)
!
block
integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
integer(psb_lpk_) :: glob_row
integer(psb_lpk_), allocatable :: irow(:),icol(:)
real(psb_dpk_), allocatable :: val(:)
real(psb_dpk_) :: x,y,z, zt(nb)
#if defined(OPENMP)
nth = omp_get_num_threads()
ith = omp_get_thread_num()
#else
nth = 1
ith = 0
#endif
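! Note: val, irow and icol are allocated inside this BLOCK within the
! parallel region, so each thread works on its own private copies.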
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
!goto 9999
endif
!$omp do schedule(dynamic)
!
do ii=1, nlr, nb
if (info /= psb_success_) cycle
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! x, y, z coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
z = (iz-1)*deltah
zt(k) = f_(x,y,z)
! internal point: build discretization
!
! term depending on (x-1,y,z)
!
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
if (ix == 1) then
zt(k) = g(dzero,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1,z)
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
if (iy == 1) then
zt(k) = g(x,dzero,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z-1)
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
if (iz == 1) then
zt(k) = g(x,y,dzero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z)
val(icoeff)=(2*done)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
& + c(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,done,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(done,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
!write(0,*) ' Outer in_parallel ',omp_in_parallel()
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) exit
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
!$omp end do
deallocate(val,irow,icol)
end block
!$omp end parallel
tgen = psb_wtime()-t1
if(info /= psb_success_) then
@ -490,7 +500,6 @@ contains
goto 9999
end if
deallocate(val,irow,icol)
call psb_barrier(ctxt)
t1 = psb_wtime()
@ -557,6 +566,9 @@ contains
& a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod
use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
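! omp_lib provides omp_get_num_threads()/omp_get_thread_num() used
! below; the OPENMP preprocessor guard keeps serial builds compiling.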
!
! Discretizes the partial differential equation
!
@ -591,7 +603,6 @@ contains
type(psb_d_csc_sparse_mat) :: acsc
type(psb_d_coo_sparse_mat) :: acoo
type(psb_d_csr_sparse_mat) :: acsr
real(psb_dpk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -604,8 +615,7 @@ contains
! Process grid
integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:)
real(psb_dpk_), allocatable :: val(:)
integer(psb_lpk_), allocatable :: myidx(:)
! deltah dimension of each grid cell
! deltat discretization time
real(psb_dpk_) :: deltah, sqdeltah, deltah2, dd
@ -816,93 +826,109 @@ contains
goto 9999
end if
! We build an auxiliary matrix consisting of one row at a
! time; just a small matrix, which might be extended to
! generate a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! Loop over rows belonging to the current process in a block
! distribution.
call psb_barrier(ctxt)
t1 = psb_wtime()
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,glob_row,idim,idim)
! x, y coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
zt(k) = f_(x,y)
! internal point: build discretization
!
! term depending on (x-1,y)
!
val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2
if (ix == 1) then
zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1)
val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2
if (iy == 1) then
zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y)
val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y)
call ijk2idx(icol(icoeff),ix,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
! term depending on (x,y+1)
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2
if (iy == idim) then
zt(k) = g(x,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y)
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2
if (ix==idim) then
zt(k) = g(done,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim)
!$omp parallel shared(deltah,myidx,a,desc_a)
!
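! Same pattern as the 3D generator above, here for the 5-point
! stencil: each thread allocates its own val/irow/icol inside the
! BLOCK, and the !$omp do distributes blocks of nb rows.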
block
integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
integer(psb_lpk_) :: glob_row
integer(psb_lpk_), allocatable :: irow(:),icol(:)
real(psb_dpk_), allocatable :: val(:)
real(psb_dpk_) :: x,y,z, zt(nb)
#if defined(OPENMP)
nth = omp_get_num_threads()
ith = omp_get_thread_num()
#else
nth = 1
ith = 0
#endif
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
!goto 9999  ! cannot branch out of an OpenMP parallel region; info is checked after the region
endif
! Loop over rows belonging to the current process in a block
! distribution.
!$omp do schedule(dynamic)
!
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,glob_row,idim,idim)
! x, y coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
zt(k) = f_(x,y)
! internal point: build discretization
!
! term depending on (x-1,y)
!
val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2
if (ix == 1) then
zt(k) = g(dzero,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1)
val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2
if (iy == 1) then
zt(k) = g(x,dzero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y)
val(icoeff)=(2*done)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y)
call ijk2idx(icol(icoeff),ix,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1)
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2
if (iy == idim) then
zt(k) = g(x,done)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y)
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2
if (ix==idim) then
zt(k) = g(done,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) exit
zt(:)=dzero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
!$omp end do
deallocate(val,irow,icol)
end block
!$omp end parallel
tgen = psb_wtime()-t1
if(info /= psb_success_) then
@ -912,8 +938,6 @@ contains
goto 9999
end if
deallocate(val,irow,icol)
call psb_barrier(ctxt)
t1 = psb_wtime()
call psb_cdasb(desc_a,info)

@ -73,6 +73,9 @@ program amg_d_pde2d
use amg_d_pde2d_exp_mod
use amg_d_pde2d_box_mod
use amg_d_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! input parameters
@ -93,7 +96,7 @@ program amg_d_pde2d
type(psb_d_vect_type) :: x,b,r
! parallel environment
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np
integer(psb_ipk_) :: iam, np, nth
! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -197,6 +200,15 @@ program amg_d_pde2d
call psb_init(ctxt)
call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
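! Note: omp_get_num_threads() returns 1 when called outside a
! parallel region, so the thread count is sampled inside a
! parallel/master pair; nth is only used for the summary printout.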
if (iam < 0) then
! This should not happen, but just in case
@ -451,12 +463,14 @@ program amg_d_pde2d
call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np
write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err
write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs()
write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier

@ -74,6 +74,9 @@ program amg_d_pde3d
use amg_d_pde3d_exp_mod
use amg_d_pde3d_gauss_mod
use amg_d_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! input parameters
@ -94,7 +97,7 @@ program amg_d_pde3d
type(psb_d_vect_type) :: x,b,r
! parallel environment
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np
integer(psb_ipk_) :: iam, np, nth
! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -192,12 +195,21 @@ program amg_d_pde3d
! other variables
integer(psb_ipk_) :: info, i, k
character(len=20) :: name,ch_err
type(psb_d_csr_sparse_mat) :: amold
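! amold is passed to smoothers_build below as a mold argument,
! presumably to request CSR storage for the smoother matrices.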
info=psb_success_
call psb_init(ctxt)
call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then
! This should not happen, but just in case
@ -390,7 +402,7 @@ program amg_d_pde3d
end if
call psb_barrier(ctxt)
t1 = psb_wtime()
call prec%smoothers_build(a,desc_a,info)
call prec%smoothers_build(a,desc_a,info,amold=amold)
tprec = psb_wtime()-t1
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld')
@ -455,7 +467,9 @@ program amg_d_pde3d
call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np
write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
@ -478,7 +492,7 @@ program amg_d_pde3d
write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt()
end if
call psb_print_timers(ctxt)
!
! cleanup storage and exit
!

@ -93,6 +93,9 @@ contains
& a1,a2,a3,b1,b2,b3,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod
use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
!
! Discretizes the partial differential equation
!
@ -128,7 +131,6 @@ contains
type(psb_s_csc_sparse_mat) :: acsc
type(psb_s_coo_sparse_mat) :: acoo
type(psb_s_csr_sparse_mat) :: acsr
real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -141,8 +143,7 @@ contains
! Process grid
integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:)
real(psb_spk_), allocatable :: val(:)
integer(psb_lpk_), allocatable :: myidx(:)
! deltah dimension of each grid cell
! deltat discretization time
real(psb_spk_) :: deltah, sqdeltah, deltah2
@ -368,119 +369,128 @@ contains
call psb_barrier(ctxt)
talc = psb_wtime()-t0
if (info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='allocation rout.'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
! We build an auxiliary matrix consisting of one row at a
! time; just a small matrix, which might be extended to
! generate a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! Loop over rows belonging to the current process in a block
! distribution.
call psb_barrier(ctxt)
t1 = psb_wtime()
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! x, y, z coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
z = (iz-1)*deltah
zt(k) = f_(x,y,z)
! internal point: build discretization
!
! term depending on (x-1,y,z)
!
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
if (ix == 1) then
zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1,z)
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
if (iy == 1) then
zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z-1)
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
if (iz == 1) then
zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z)
val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
& + c(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
!$omp parallel shared(deltah,myidx,a,desc_a)
!
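! Same thread-private-workspace pattern as in the double-precision
! generator: block-local arrays, !$omp do over blocks of nb rows,
! and error propagation through info checked after the region.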
block
integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
integer(psb_lpk_) :: glob_row
integer(psb_lpk_), allocatable :: irow(:),icol(:)
real(psb_spk_), allocatable :: val(:)
real(psb_spk_) :: x,y,z, zt(nb)
#if defined(OPENMP)
nth = omp_get_num_threads()
ith = omp_get_thread_num()
#else
nth = 1
ith = 0
#endif
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
!goto 9999  ! cannot branch out of an OpenMP parallel region; info is checked after the region
endif
!$omp do schedule(dynamic)
!
do ii=1, nlr, nb
if (info /= psb_success_) cycle
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,iz,glob_row,idim,idim,idim)
! x, y, z coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
z = (iz-1)*deltah
zt(k) = f_(x,y,z)
! internal point: build discretization
!
! term depending on (x-1,y,z)
!
val(icoeff) = -a1(x,y,z)/sqdeltah-b1(x,y,z)/deltah2
if (ix == 1) then
zt(k) = g(szero,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1,z)
val(icoeff) = -a2(x,y,z)/sqdeltah-b2(x,y,z)/deltah2
if (iy == 1) then
zt(k) = g(x,szero,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z-1)
val(icoeff)=-a3(x,y,z)/sqdeltah-b3(x,y,z)/deltah2
if (iz == 1) then
zt(k) = g(x,y,szero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz-1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z)
val(icoeff)=(2*sone)*(a1(x,y,z)+a2(x,y,z)+a3(x,y,z))/sqdeltah &
& + c(x,y,z)
call ijk2idx(icol(icoeff),ix,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y,z+1)
val(icoeff)=-a3(x,y,z)/sqdeltah+b3(x,y,z)/deltah2
if (iz == idim) then
zt(k) = g(x,y,sone)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy,iz+1,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1,z)
val(icoeff)=-a2(x,y,z)/sqdeltah+b2(x,y,z)/deltah2
if (iy == idim) then
zt(k) = g(x,sone,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y,z)
val(icoeff)=-a1(x,y,z)/sqdeltah+b1(x,y,z)/deltah2
if (ix==idim) then
zt(k) = g(sone,y,z)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,iz,idim,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
!write(0,*) ' Outer in_parallel ',omp_in_parallel()
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=szero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) exit
zt(:)=szero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
!$omp end do
deallocate(val,irow,icol)
end block
!$omp end parallel
tgen = psb_wtime()-t1
if(info /= psb_success_) then
@ -490,7 +500,6 @@ contains
goto 9999
end if
deallocate(val,irow,icol)
call psb_barrier(ctxt)
t1 = psb_wtime()
@ -557,6 +566,9 @@ contains
& a1,a2,b1,b2,c,g,info,f,amold,vmold,partition, nrl,iv)
use psb_base_mod
use psb_util_mod
#if defined(OPENMP)
use omp_lib
#endif
!
! Discretizes the partial differential equation
!
@ -591,7 +603,6 @@ contains
type(psb_s_csc_sparse_mat) :: acsc
type(psb_s_coo_sparse_mat) :: acoo
type(psb_s_csr_sparse_mat) :: acsr
real(psb_spk_) :: zt(nb),x,y,z,xph,xmh,yph,ymh,zph,zmh
integer(psb_ipk_) :: nnz,nr,nlr,i,j,ii,ib,k, partition_
integer(psb_lpk_) :: m,n,glob_row,nt
integer(psb_ipk_) :: ix,iy,iz,ia,indx_owner
@ -604,8 +615,7 @@ contains
! Process grid
integer(psb_ipk_) :: np, iam
integer(psb_ipk_) :: icoeff
integer(psb_lpk_), allocatable :: irow(:),icol(:),myidx(:)
real(psb_spk_), allocatable :: val(:)
integer(psb_lpk_), allocatable :: myidx(:)
! deltah dimension of each grid cell
! deltat discretization time
real(psb_spk_) :: deltah, sqdeltah, deltah2, dd
@ -816,93 +826,109 @@ contains
goto 9999
end if
! We build an auxiliary matrix consisting of one row at a
! time; just a small matrix, which might be extended to
! generate a bunch of rows per call.
!
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
goto 9999
endif
! Loop over rows belonging to the current process in a block
! distribution.
call psb_barrier(ctxt)
t1 = psb_wtime()
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,glob_row,idim,idim)
! x, y coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
zt(k) = f_(x,y)
! internal point: build discretization
!
! term depending on (x-1,y)
!
val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2
if (ix == 1) then
zt(k) = g(szero,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1)
val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2
if (iy == 1) then
zt(k) = g(x,szero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y)
val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y)
call ijk2idx(icol(icoeff),ix,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
! term depending on (x,y+1)
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2
if (iy == idim) then
zt(k) = g(x,sone)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y)
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2
if (ix==idim) then
zt(k) = g(sone,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim)
!$omp parallel shared(deltah,myidx,a,desc_a)
!
block
integer(psb_ipk_) :: i,j,k,ii,ib,icoeff, ix,iy,iz, ith,nth
integer(psb_lpk_) :: glob_row
integer(psb_lpk_), allocatable :: irow(:),icol(:)
real(psb_spk_), allocatable :: val(:)
real(psb_spk_) :: x,y,z, zt(nb)
#if defined(OPENMP)
nth = omp_get_num_threads()
ith = omp_get_thread_num()
#else
nth = 1
ith = 0
#endif
allocate(val(20*nb),irow(20*nb),&
&icol(20*nb),stat=info)
if (info /= psb_success_ ) then
info=psb_err_alloc_dealloc_
call psb_errpush(info,name)
!goto 9999  ! cannot branch out of an OpenMP parallel region; info is checked after the region
endif
! Loop over rows belonging to the current process in a block
! distribution.
!$omp do schedule(dynamic)
!
do ii=1, nlr,nb
ib = min(nb,nlr-ii+1)
icoeff = 1
do k=1,ib
i=ii+k-1
! local matrix pointer
glob_row=myidx(i)
! compute gridpoint coordinates
call idx2ijk(ix,iy,glob_row,idim,idim)
! x, y coordinates
x = (ix-1)*deltah
y = (iy-1)*deltah
zt(k) = f_(x,y)
! internal point: build discretization
!
! term depending on (x-1,y)
!
val(icoeff) = -a1(x,y)/sqdeltah-b1(x,y)/deltah2
if (ix == 1) then
zt(k) = g(szero,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix-1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y-1)
val(icoeff) = -a2(x,y)/sqdeltah-b2(x,y)/deltah2
if (iy == 1) then
zt(k) = g(x,szero)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy-1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y)
val(icoeff)=(2*sone)*(a1(x,y) + a2(x,y))/sqdeltah + c(x,y)
call ijk2idx(icol(icoeff),ix,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x,y+1)
val(icoeff)=-a2(x,y)/sqdeltah+b2(x,y)/deltah2
if (iy == idim) then
zt(k) = g(x,sone)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix,iy+1,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
! term depending on (x+1,y)
val(icoeff)=-a1(x,y)/sqdeltah+b1(x,y)/deltah2
if (ix==idim) then
zt(k) = g(sone,y)*(-val(icoeff)) + zt(k)
else
call ijk2idx(icol(icoeff),ix+1,iy,idim,idim)
irow(icoeff) = glob_row
icoeff = icoeff+1
endif
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) cycle
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) cycle
zt(:)=szero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) cycle
end do
call psb_spins(icoeff-1,irow,icol,val,a,desc_a,info)
if(info /= psb_success_) exit
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),bv,desc_a,info)
if(info /= psb_success_) exit
zt(:)=szero
call psb_geins(ib,myidx(ii:ii+ib-1),zt(1:ib),xv,desc_a,info)
if(info /= psb_success_) exit
end do
!$omp end do
deallocate(val,irow,icol)
end block
!$omp end parallel
tgen = psb_wtime()-t1
if(info /= psb_success_) then
@ -912,8 +938,6 @@ contains
goto 9999
end if
deallocate(val,irow,icol)
call psb_barrier(ctxt)
t1 = psb_wtime()
call psb_cdasb(desc_a,info)

@ -73,6 +73,9 @@ program amg_s_pde2d
use amg_s_pde2d_exp_mod
use amg_s_pde2d_box_mod
use amg_s_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! input parameters
@ -93,7 +96,7 @@ program amg_s_pde2d
type(psb_s_vect_type) :: x,b,r
! parallel environment
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np
integer(psb_ipk_) :: iam, np, nth
! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -197,6 +200,15 @@ program amg_s_pde2d
call psb_init(ctxt)
call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then
! This should not happen, but just in case
@ -451,12 +463,14 @@ program amg_s_pde2d
call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np
write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
write(psb_out_unit,'("Preconditioner : ",a)') trim(p_choice%descr)
write(psb_out_unit,'("Iterations to convergence : ",i12)') iter
write(psb_out_unit,'("Relative error estimate on exit : ",es12.5)') err
write(psb_out_unit,'("Number of levels in hierarchy : ",i12)') prec%get_nlevs()
write(psb_out_unit,'("Time to build hierarchy : ",es12.5)') thier

@ -74,6 +74,9 @@ program amg_s_pde3d
use amg_s_pde3d_exp_mod
use amg_s_pde3d_gauss_mod
use amg_s_genpde_mod
#if defined(OPENMP)
use omp_lib
#endif
implicit none
! input parameters
@ -94,7 +97,7 @@ program amg_s_pde3d
type(psb_s_vect_type) :: x,b,r
! parallel environment
type(psb_ctxt_type) :: ctxt
integer(psb_ipk_) :: iam, np
integer(psb_ipk_) :: iam, np, nth
! solver parameters
integer(psb_ipk_) :: iter, itmax,itrace, istopc, irst, nlv
@ -192,12 +195,21 @@ program amg_s_pde3d
! other variables
integer(psb_ipk_) :: info, i, k
character(len=20) :: name,ch_err
type(psb_s_csr_sparse_mat) :: amold
info=psb_success_
call psb_init(ctxt)
call psb_info(ctxt,iam,np)
#if defined(OPENMP)
!$OMP parallel shared(nth)
!$OMP master
nth = omp_get_num_threads()
!$OMP end master
!$OMP end parallel
#else
nth = 1
#endif
if (iam < 0) then
! This should not happen, but just in case
@ -390,7 +402,7 @@ program amg_s_pde3d
end if
call psb_barrier(ctxt)
t1 = psb_wtime()
call prec%smoothers_build(a,desc_a,info)
call prec%smoothers_build(a,desc_a,info,amold=amold)
tprec = psb_wtime()-t1
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='amg_smoothers_bld')
@ -455,7 +467,9 @@ program amg_s_pde3d
call psb_sum(ctxt,precsize)
call prec%descr(info,iout=psb_out_unit)
if (iam == psb_root_) then
write(psb_out_unit,'("Computed solution on ",i8," processors")') np
write(psb_out_unit,'("Computed solution on ",i8," process(es)")') np
write(psb_out_unit,'("Number of threads : ",i12)') nth
write(psb_out_unit,'("Total number of tasks : ",i12)') nth*np
write(psb_out_unit,'("Linear system size : ",i12)') system_size
write(psb_out_unit,'("PDE Coefficients : ",a)') trim(pdecoeff)
write(psb_out_unit,'("Krylov method : ",a)') trim(s_choice%kmethd)
@ -478,7 +492,7 @@ program amg_s_pde3d
write(psb_out_unit,'("Storage format for DESC_A : ",a )') desc_a%get_fmt()
end if
call psb_print_timers(ctxt)
!
! cleanup storage and exit
!

@ -1,6 +1,6 @@
%%%%%%%%%%% General arguments % Lines starting with % are ignored.
CSR ! Storage format CSR COO JAD
0080 ! IDIM; domain size. Linear system size is IDIM**3
0200 ! IDIM; domain size. Linear system size is IDIM**3
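% For reference: IDIM=0080 gives 80**3 = 512,000 unknowns, while
% IDIM=0200 gives 200**3 = 8,000,000.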
CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE
BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES
2 ! ISTOPC
@ -9,7 +9,7 @@ BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS F
30 ! IRST (restart for RGMRES and BiCGSTABL)
1.d-6 ! EPS
%%%%%%%%%%% Main preconditioner choices %%%%%%%%%%%%%%%%
ML-VCYCLE-BJAC-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars)
ML-VBM-VCYCLE-FBGS-D-BJAC ! Longer descriptive name for preconditioner (up to 20 chars)
ML ! Preconditioner type: NONE JACOBI GS FBGS BJAC AS ML
%%%%%%%%%%% First smoother (for all levels but coarsest) %%%%%%%%%%%%%%%%
FBGS ! Smoother type JACOBI FBGS GS BWGS BJAC AS. For 1-level, repeats previous.
@ -39,8 +39,8 @@ VCYCLE ! Type of multilevel CYCLE: VCYCLE WCYCLE KCYCLE MUL
-3 ! Max Number of levels in a multilevel preconditioner; if <0, lib default
-3 ! Target coarse matrix size per process; if <0, lib default
SMOOTHED ! Type of aggregation: SMOOTHED UNSMOOTHED
COUPLED ! Parallel aggregation: DEC, SYMDEC, COUPLED
MATCHBOXP ! aggregation measure SOC1, MATCHBOXP
DEC ! Parallel aggregation: DEC, SYMDEC, COUPLED
SOC1 ! aggregation measure SOC1, MATCHBOXP
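% (SOC1 is the strength-of-connection based measure; MATCHBOXP drives
%  the matching-based coupled aggregation.)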
8 ! Requested size of the aggregates for MATCHBOXP
NATURAL ! Ordering of aggregation NATURAL DEGREE
-1.5 ! Coarsening ratio, if < 0 use library default
