Merge branch 'petrilli-m' into openmp-match

omp-walther
Salvatore Filippone 2 years ago
commit 0d624df346

@@ -1,4 +1,3 @@
AMG4PSBLAS
Algebraic Multigrid Package based on PSBLAS (Parallel Sparse BLAS version 3.8)

@@ -62,7 +62,21 @@ amg_s_parmatch_smth_bld.o \
amg_s_parmatch_spmm_bld_inner.o
MPCOBJS=MatchBoxPC.o \
sendBundledMessages.o \
initialize.o \
extractUChunk.o \
isAlreadyMatched.o \
findOwnerOfGhost.o \
computeCandidateMate.o \
parallelComputeCandidateMateB.o \
processMatchedVertices.o \
processMatchedVerticesAndSendMessages.o \
processCrossEdge.o \
queueTransfer.o \
processMessages.o \
processExposedVertex.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o \
algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o
OBJS = $(FOBJS) $(MPCOBJS)

@@ -60,17 +60,43 @@ void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt* ph1_card, MilanLongInt* ph2_card ) {
#if !defined(SERIAL_MPI)
MPI_Comm C_comm=MPI_Comm_f2c(icomm);
#ifdef DEBUG
fprintf(stderr,"MatchBoxPC: rank %d nlver %ld nledge %ld [ %ld %ld ]\n",
myRank,NLVer, NLEdge,verDistance[0],verDistance[1]);
#endif
#define TIME_TRACKER
#ifdef TIME_TRACKER
double tmr = MPI_Wtime();
#endif
#define OMP
#ifdef OMP
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(NLVer, NLEdge,
verLocPtr, verLocInd, edgeLocWeight,
verDistance, Mate,
myRank, numProcs, C_comm,
msgIndSent, msgActualSent, msgPercent,
ph0_time, ph1_time, ph2_time,
ph1_card, ph2_card );
#else
dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(NLVer, NLEdge,
verLocPtr, verLocInd, edgeLocWeight,
verDistance, Mate,
myRank, numProcs, C_comm,
msgIndSent, msgActualSent, msgPercent,
ph0_time, ph1_time, ph2_time,
ph1_card, ph2_card );
#endif
#ifdef TIME_TRACKER
tmr = MPI_Wtime() - tmr;
fprintf(stderr, "Elaboration time: %f for %ld nodes\n", tmr, NLVer);
#endif
#endif
}

@@ -52,145 +52,412 @@
#ifndef _matchboxpC_H_
#define _matchboxpC_H_
// Turn on a lot of debugging information with this switch:
//#define PRINT_DEBUG_INFO_
#include <stdio.h>
#include <iostream>
#include <assert.h>
#include <map>
#include <vector>
// #include "matchboxp.h"
#include "omp.h"
#include "primitiveDataTypeDefinitions.h"
#include "dataStrStaticQueue.h"
using namespace std;
const int NUM_THREAD = 4;
const int UCHUNK = 10;
const MilanLongInt REQUEST = 1;
const MilanLongInt SUCCESS = 2;
const MilanLongInt FAILURE = 3;
const MilanLongInt SIZEINFO = 4;
const int ComputeTag = 7; // Predefined tag
const int BundleTag = 9; // Predefined tag
static vector<MilanLongInt> DEFAULT_VECTOR;
// MPI type map
template <typename T>
MPI_Datatype TypeMap();
template <>
inline MPI_Datatype TypeMap<int64_t>() { return MPI_LONG_LONG; }
template <>
inline MPI_Datatype TypeMap<int>() { return MPI_INT; }
template <>
inline MPI_Datatype TypeMap<double>() { return MPI_DOUBLE; }
template <>
inline MPI_Datatype TypeMap<float>() { return MPI_FLOAT; }
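// Illustrative note (not in the original source): these specializations let
// templated communication code select the matching MPI datatype at compile
// time, e.g. MPI_Send(buf, count, TypeMap<MilanReal>(), dest, tag, comm);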
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#if !defined(SERIAL_MPI)
#define MilanMpiLongInt MPI_LONG_LONG
#ifndef _primitiveDataType_Definition_
#define _primitiveDataType_Definition_
// Regular integer:
#ifndef INTEGER_H
#define INTEGER_H
typedef int32_t MilanInt;
#endif
// Regular long integer:
#ifndef LONG_INT_H
#define LONG_INT_H
#ifdef BIT64
typedef int64_t MilanLongInt;
typedef MPI_LONG MilanMpiLongInt;
#else
typedef int32_t MilanLongInt;
typedef MPI_INT MilanMpiLongInt;
#endif
#endif
// Regular boolean
#ifndef BOOL_H
#define BOOL_H
typedef bool MilanBool;
#endif
// Regular double and absolute value computation:
#ifndef REAL_H
#define REAL_H
typedef double MilanReal;
typedef MPI_DOUBLE MilanMpiReal;
inline MilanReal MilanAbs(MilanReal value)
{
return fabs(value);
}
#endif
// Regular float and absolute value computation:
#ifndef FLOAT_H
#define FLOAT_H
typedef float MilanFloat;
typedef MPI_FLOAT MilanMpiFloat;
inline MilanFloat MilanAbsFloat(MilanFloat value)
{
return fabs(value);
}
#endif
//// Define the limits:
#ifndef LIMITS_H
#define LIMITS_H
// Integer Maximum and Minimum:
// #define MilanIntMax INT_MAX
// #define MilanIntMin INT_MIN
#define MilanIntMax INT32_MAX
#define MilanIntMin INT32_MIN
#ifdef BIT64
#define MilanLongIntMax INT64_MAX
#define MilanLongIntMin -INT64_MAX
#else
#define MilanLongIntMax INT32_MAX
#define MilanLongIntMin -INT32_MAX
#endif
#endif
// +INFINITY
const double PLUS_INFINITY = numeric_limits<double>::infinity();
const double MINUS_INFINITY = -PLUS_INFINITY;
//#define MilanRealMax LDBL_MAX
#define MilanRealMax PLUS_INFINITY
#define MilanRealMin MINUS_INFINITY
#endif
// Function to find the owner of a ghost vertex using binary search:
MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance,
MilanInt myRank, MilanInt numProcs);
MilanLongInt firstComputeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanLongInt *verLocInd,
MilanReal *edgeLocWeight);
void queuesTransfer(vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
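// Design note (inferred from the call sites below; implementation not shown
// here): each thread accumulates work in its private* vectors, and
// queuesTransfer merges them into the shared queues, so threads touch the
// shared state far less often than with per-push locking.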
bool isAlreadyMatched(MilanLongInt node,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap);
MilanLongInt computeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanReal *edgeLocWeight,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap);
void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt StartIndex, MilanLongInt EndIndex,
MilanLongInt *numGhostEdgesPtr,
MilanLongInt *numGhostVerticesPtr,
MilanLongInt *S,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &Counter,
vector<MilanLongInt> &verGhostPtr,
vector<MilanLongInt> &verGhostInd,
vector<MilanLongInt> &tempCounter,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Message,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
MilanLongInt *&candidateMate,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void clean(MilanLongInt NLVer,
MilanInt myRank,
MilanLongInt MessageIndex,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus,
MilanInt BufferSize,
MilanLongInt *Buffer,
MilanLongInt msgActual,
MilanLongInt *msgActualSent,
MilanLongInt msgInd,
MilanLongInt *msgIndSent,
MilanLongInt NumMessagesBundled,
MilanReal *msgPercent);
void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanInt myRank,
MilanReal *edgeLocWeight,
MilanLongInt *candidateMate);
void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
MilanLongInt *candidateMate,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *Mate,
vector<MilanLongInt> &GMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void PROCESS_CROSS_EDGE(MilanLongInt *edge,
MilanLongInt *SPtr);
void processMatchedVertices(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner);
void processMatchedVerticesAndSendMessages(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *NumMessagesBundledPtr,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner,
MPI_Comm comm,
MilanLongInt *msgActual,
vector<MilanLongInt> &Message);
void sendBundledMessages(MilanLongInt *numGhostEdgesPtr,
MilanInt *BufferSizePtr,
MilanLongInt *Buffer,
vector<MilanLongInt> &PCumulative,
vector<MilanLongInt> &PMessageBundle,
vector<MilanLongInt> &PSizeInfoMessages,
MilanLongInt *PCounter,
MilanLongInt NumMessagesBundled,
MilanLongInt *msgActualPtr,
MilanLongInt *MessageIndexPtr,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus);
void processMessages(
MilanLongInt NLVer,
MilanLongInt *Mate,
MilanLongInt *candidateMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Counter,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCardPtr,
MilanLongInt *msgIndPtr,
MilanLongInt *msgActualPtr,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *verLocPtr,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &Message,
MilanLongInt numGhostEdges,
MilanLongInt u,
MilanLongInt v,
MilanLongInt *SPtr,
vector<MilanLongInt> &U);
void extractUChunk(
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void salgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void dMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
void sMatchBoxPC(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd, MilanFloat *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MilanInt icomm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent, MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card);
#endif
#ifdef __cplusplus

@@ -72,12 +72,6 @@
#ifdef SERIAL_MPI
#else
// DOUBLE PRECISION VERSION
//WARNING: The vertex block on a given rank is contiguous

@@ -0,0 +1,485 @@
#include "MatchBoxPC.h"
// ***********************************************************************
//
// MatchboxP: A C++ library for approximate weighted matching
// Mahantesh Halappanavar (hala@pnnl.gov)
// Pacific Northwest National Laboratory
//
// ***********************************************************************
//
// Copyright (2021) Battelle Memorial Institute
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// ************************************************************************
//////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// DOMINATING EDGES MODEL ///////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
/* Function : algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate()
*
* Date : New update: Feb 17, 2019, Richland, Washington.
* Date : Original development: May 17, 2009, E&CS Bldg.
*
* Purpose : Compute Approximate Maximum Weight Matching in Linear Time
*
* Args : inputMatrix - instance of Compressed-Col format of Matrix
* Mate - The Mate array
*
* Returns : By Value: (void)
* By Reference: Mate
*
* Comments : 1/2 Approx Algorithm. Picks the locally available heaviest edge.
* Assumption: The Mate Array is empty.
*/
/*
NLVer = #of vertices, NLEdge = #of edges
CSR/CSC/Compressed format: verLocPtr = Pointer, verLocInd = Index, edgeLocWeight = edge weights (positive real numbers)
verDistance = A vector of size |P|+1 containing the cumulative number of vertices per process
Mate = A vector of size |V_p| (local subgraph) to store the output (matching)
MPI: myRank, numProcs, comm,
Statistics: msgIndSent, msgActualSent, msgPercent : Size: |P| number of processes in the comm-world
Statistics: ph0_time, ph1_time, ph2_time: Runtimes
Statistics: ph1_card, ph2_card : Size: |P| number of processes in the comm-world (number of matched edges in Phase 1 and Phase 2)
*/
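/*
Illustrative example (assumed data, not from the original source):
with numProcs = 2 and verDistance = {0, 4, 9}, rank 0 owns vertices 0..3
and rank 1 owns vertices 4..8. On rank 0, a triangle 0-1-2 plus the cross
edge 0-4 is stored as
  verLocPtr = {0, 3, 5, 7, 7}          (vertex 3 has no edges)
  verLocInd = {1, 2, 4, 0, 2, 0, 1}    (NLVer = 4, NLEdge = 7)
with edgeLocWeight holding the corresponding positive weights;
vertex 4 is a ghost vertex for rank 0.
*/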
#ifdef SERIAL_MPI
#else
// DOUBLE PRECISION VERSION
// WARNING: The vertex block on a given rank is contiguous
void dalgoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP(
MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt *verLocPtr, MilanLongInt *verLocInd,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *Mate,
MilanInt myRank, MilanInt numProcs, MPI_Comm comm,
MilanLongInt *msgIndSent, MilanLongInt *msgActualSent,
MilanReal *msgPercent,
MilanReal *ph0_time, MilanReal *ph1_time, MilanReal *ph2_time,
MilanLongInt *ph1_card, MilanLongInt *ph2_card)
{
/*
* verDistance: a vector of size |P| + 1.
* verDistance[i] is the index of the first vertex owned by process i;
* verDistance[i + 1] - 1 is the index of the last vertex owned by process i.
* NLVer: number of local vertices (entries of verLocPtr minus one)
* NLEdge: number of edge entries assigned to the current process
*
* The portion of the matrix assigned to the process is stored in
* Yale (CSR) notation:
* verLocInd: column positions of the entries within each row
* verLocPtr: the i-th value is the position of the first element of row i,
* and the (i+1)-th value is the position of the first element of row i+1
*/
#if !defined(SERIAL_MPI)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Within algoEdgeApproxDominatingEdgesLinearSearchMessageBundling()";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") verDistance [" << verDistance[0] << "," << verDistance[1] << "," << verDistance[2] << "," << verDistance[3] << "]";
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") verDistance [" << verDistance[0] << "," << verDistance[1] << "," << verDistance[2] << "," << verDistance[3] << "]";
fflush(stdout);
#endif
MilanLongInt StartIndex = verDistance[myRank]; // The starting vertex owned by the current rank
MilanLongInt EndIndex = verDistance[myRank + 1] - 1; // The ending vertex owned by the current rank
MPI_Status computeStatus;
MilanLongInt msgActual = 0, msgInd = 0;
MilanReal heaviestEdgeWt = 0.0f; // Assumes positive weight
MilanReal startTime, finishTime;
startTime = MPI_Wtime();
// Data structures for sending and receiving messages:
vector<MilanLongInt> Message; // [ u, v, message_type ]
Message.resize(3, -1);
// Data structures for Message Bundling:
// Although up to two messages can be sent along any cross edge,
// only one message will be sent in the initialization phase -
// one of: REQUEST/FAILURE/SUCCESS
vector<MilanLongInt> QLocalVtx, QGhostVtx, QMsgType;
vector<MilanInt> QOwner; // Changed by Fabio to be an integer; addresses need to be integers!
MilanLongInt *PCounter = new MilanLongInt[numProcs];
for (int i = 0; i < numProcs; i++)
PCounter[i] = 0;
MilanLongInt NumMessagesBundled = 0;
// TODO: once the last computational section is refactored, this could be eliminated
MilanInt ghostOwner = 0; // Changed by Fabio to be an integer; addresses need to be integers!
MilanLongInt *candidateMate = nullptr;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")NV: " << NLVer << " Edges: " << NLEdge;
fflush(stdout);
cout << "\n(" << myRank << ")StartIndex: " << StartIndex << " EndIndex: " << EndIndex;
fflush(stdout);
#endif
// Other Variables:
MilanLongInt u = -1, v = -1, w = -1, i = 0;
MilanLongInt k = -1, adj1 = -1, adj2 = -1;
MilanLongInt k1 = -1, adj11 = -1, adj12 = -1;
MilanLongInt myCard = 0;
// Build the Ghost Vertex Set: Vg
map<MilanLongInt, MilanLongInt> Ghost2LocalMap; // Map each ghost vertex to a local vertex
vector<MilanLongInt> Counter; // Store the edge count for each ghost vertex
MilanLongInt numGhostVertices = 0, numGhostEdges = 0; // Number of Ghost vertices
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")About to compute Ghost Vertices...";
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ")About to compute Ghost Vertices...";
fflush(stdout);
#endif
// Define Adjacency Lists for Ghost Vertices:
// cout<<"Building Ghost data structures ... \n\n";
vector<MilanLongInt> verGhostPtr, verGhostInd, tempCounter;
// Mate array for ghost vertices:
vector<MilanLongInt> GMate; // Proportional to the number of ghost vertices
MilanLongInt S;
MilanLongInt privateMyCard = 0;
vector<MilanLongInt> PCumulative, PMessageBundle, PSizeInfoMessages;
vector<MPI_Request> SRequest; // Requests that are used for each send message
vector<MPI_Status> SStatus; // Status of sent messages, used in MPI_Wait
MilanLongInt MessageIndex = 0; // Pointer for current message
MilanInt BufferSize;
MilanLongInt *Buffer;
vector<MilanLongInt> privateQLocalVtx, privateQGhostVtx, privateQMsgType;
vector<MilanInt> privateQOwner;
vector<MilanLongInt> U, privateU;
initialize(NLVer, NLEdge, StartIndex,
EndIndex, &numGhostEdges,
&numGhostVertices, &S,
verLocInd, verLocPtr,
Ghost2LocalMap, Counter,
verGhostPtr, verGhostInd,
tempCounter, GMate,
Message, QLocalVtx,
QGhostVtx, QMsgType, QOwner,
candidateMate, U,
privateU,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
finishTime = MPI_Wtime();
*ph0_time = finishTime - startTime; // Time taken for Phase-0: Initialization
startTime = MPI_Wtime();
/////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// INITIALIZATION /////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
// Compute the Initial Matching Set:
/*
* OMP: PARALLEL_COMPUTE_CANDIDATE_MATE_B has been split from
* PARALLEL_PROCESS_EXPOSED_VERTEX_B in order to parallelize
* the two more effectively.
* PARALLEL_COMPUTE_CANDIDATE_MATE_B is now fully parallel.
*/
PARALLEL_COMPUTE_CANDIDATE_MATE_B(NLVer,
verLocPtr,
verLocInd,
myRank,
edgeLocWeight,
candidateMate);
/*
* PARALLEL_PROCESS_EXPOSED_VERTEX_B
* TODO: write comment
*
* TODO: Test when it's actually more efficient to execute this code
* in parallel.
*/
PARALLEL_PROCESS_EXPOSED_VERTEX_B(NLVer,
candidateMate,
verLocInd,
verLocPtr,
StartIndex,
EndIndex,
Mate,
GMate,
Ghost2LocalMap,
edgeLocWeight,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
U,
privateU,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
tempCounter.clear(); // Do not need this any more
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MATCHED VERTICES //////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
// TODO what would be the optimal UCHUNK
vector<MilanLongInt> UChunkBeingProcessed;
UChunkBeingProcessed.reserve(UCHUNK);
processMatchedVertices(NLVer,
UChunkBeingProcessed,
U,
privateU,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verLocPtr,
verLocInd,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
candidateMate,
GMate,
Mate,
Ghost2LocalMap,
edgeLocWeight,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
/////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////// SEND BUNDLED MESSAGES /////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
sendBundledMessages(&numGhostEdges,
&BufferSize,
Buffer,
PCumulative,
PMessageBundle,
PSizeInfoMessages,
PCounter,
NumMessagesBundled,
&msgActual,
&MessageIndex,
numProcs,
myRank,
comm,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
SRequest,
SStatus);
///////////////////////// END OF SEND BUNDLED MESSAGES //////////////////////////////////
finishTime = MPI_Wtime();
*ph1_time = finishTime - startTime; // Time taken for Phase-1
*ph1_card = myCard; // Cardinality at the end of Phase-1
startTime = MPI_Wtime();
/////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// MAIN LOOP //////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////
// Main While Loop:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Entering While(true) loop..";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
while (true)
{
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Main loop" << endl;
fflush(stdout);
#endif
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MATCHED VERTICES //////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
processMatchedVerticesAndSendMessages(NLVer,
UChunkBeingProcessed,
U,
privateU,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&NumMessagesBundled,
&S,
verLocPtr,
verLocInd,
verDistance,
PCounter,
Counter,
myRank,
numProcs,
candidateMate,
GMate,
Mate,
Ghost2LocalMap,
edgeLocWeight,
QLocalVtx,
QGhostVtx,
QMsgType,
QOwner,
privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner,
comm,
&msgActual,
Message);
///////////////////////// END OF PROCESS MATCHED VERTICES /////////////////////////
//// BREAK IF NO MESSAGES EXPECTED /////////
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Deciding whether to break: S= " << S << endl;
#endif
if (S == 0)
{
#ifdef DEBUG_HANG_
cout << "\n(" << myRank << ") Breaking out" << endl;
fflush(stdout);
#endif
break;
}
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////// PROCESS MESSAGES //////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
processMessages(NLVer,
Mate,
candidateMate,
Ghost2LocalMap,
GMate,
Counter,
StartIndex,
EndIndex,
&myCard,
&msgInd,
&msgActual,
edgeLocWeight,
verDistance,
verLocPtr,
k,
verLocInd,
numProcs,
myRank,
comm,
Message,
numGhostEdges,
u,
v,
&S,
U);
///////////////////////// END OF PROCESS MESSAGES /////////////////////////////////
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Finished Message processing phase: S= " << S;
fflush(stdout);
cout << "\n(" << myRank << ")** SENT : ACTUAL= " << msgActual;
fflush(stdout);
cout << "\n(" << myRank << ")** SENT : INDIVIDUAL= " << msgInd << endl;
fflush(stdout);
#endif
} // End of while (true)
clean(NLVer,
myRank,
MessageIndex,
SRequest,
SStatus,
BufferSize,
Buffer,
msgActual,
msgActualSent,
msgInd,
msgIndSent,
NumMessagesBundled,
msgPercent);
finishTime = MPI_Wtime();
*ph2_time = finishTime - startTime; // Time taken for Phase-2
*ph2_card = myCard; // Cardinality at the end of Phase-2
}
// End of algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMate
#endif
#endif

@@ -0,0 +1,91 @@
#include "MatchBoxPC.h"
// TODO comment
void clean(MilanLongInt NLVer,
MilanInt myRank,
MilanLongInt MessageIndex,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus,
MilanInt BufferSize,
MilanLongInt *Buffer,
MilanLongInt msgActual,
MilanLongInt *msgActualSent,
MilanLongInt msgInd,
MilanLongInt *msgIndSent,
MilanLongInt NumMessagesBundled,
MilanReal *msgPercent)
{
// Cleanup Phase
#pragma omp parallel
{
#pragma omp master
{
#pragma omp task
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Waitall= " << endl;
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
cout << "\n(" << myRank << ") Waitall " << endl;
fflush(stdout);
#endif
MPI_Waitall(MessageIndex, &SRequest[0], &SStatus[0]);
// MPI_Buffer_attach(&Buffer, BufferSize); //Attach the Buffer
if (BufferSize > 0)
{
MPI_Buffer_detach(&Buffer, &BufferSize); // Detach the Buffer
free(Buffer); // Free the memory that was allocated
}
}
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")End of function to compute matching: " << endl;
fflush(stdout);
cout << "\n(" << myRank << ")myCardinality: " << myCard << endl;
fflush(stdout);
cout << "\n(" << myRank << ")Matching took " << finishTime - startTime << "seconds" << endl;
fflush(stdout);
cout << "\n(" << myRank << ")** Getting out of the matching function **" << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Number of Ghost edges = " << numGhostEdges;
cout << "\n(" << myRank << ") Total number of potential message X 2 = " << numGhostEdges * 2;
cout << "\n(" << myRank << ") Number messages bundled = " << NumMessagesBundled;
cout << "\n(" << myRank << ") Total Individual Messages sent = " << msgInd;
if (msgInd > 0)
{
cout << "\n(" << myRank << ") Percentage of messages bundled = " << ((double)NumMessagesBundled / (double)(msgInd)) * 100.0 << "% \n";
}
fflush(stdout);
#endif
#pragma omp task
{
*msgActualSent = msgActual;
*msgIndSent = msgInd;
if (msgInd > 0)
{
*msgPercent = ((double)NumMessagesBundled / (double)(msgInd)) * 100.0;
}
else
{
*msgPercent = 0;
}
}
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Done" << endl;
fflush(stdout);
#endif
}
}
}

@@ -0,0 +1,78 @@
#include "MatchBoxPC.h"
/**
* Performs the search for the candidate mate without checking whether the vertices are already matched.
* Returns the index (in verLocInd) of the locally heaviest edge.
* @param adj1
* @param adj2
* @param verLocInd
* @param edgeLocWeight
* @return
*/
MilanLongInt firstComputeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanLongInt *verLocInd,
MilanReal *edgeLocWeight)
{
MilanInt w = -1;
MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN
int finalK = -1; // Index of the heaviest edge seen so far
for (int k = adj1; k < adj2; k++)
{
if ((edgeLocWeight[k] > heaviestEdgeWt) ||
((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k])))
{
heaviestEdgeWt = edgeLocWeight[k];
w = verLocInd[k];
finalK = k;
}
} // End of for loop
return finalK;
}
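// Illustrative example (assumed data, not from the original source): over
// [adj1, adj2) = [0, 3) with verLocInd = {5, 7, 9} and
// edgeLocWeight = {2.0, 3.0, 3.0}, the weights of neighbours 7 and 9 tie at
// 3.0 and the larger vertex index wins, so the function returns finalK = 2.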
/**
* //TODO documentation
* @param adj1
* @param adj2
* @param edgeLocWeight
* @param k
* @param verLocInd
* @param StartIndex
* @param EndIndex
* @param GMate
* @param Mate
* @param Ghost2LocalMap
* @return
*/
MilanLongInt computeCandidateMate(MilanLongInt adj1,
MilanLongInt adj2,
MilanReal *edgeLocWeight,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap)
{
// Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
MilanInt w = -1;
MilanReal heaviestEdgeWt = MilanRealMin; // Assign the smallest Value possible first LDBL_MIN
for (k = adj1; k < adj2; k++)
{
if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap))
continue;
if ((edgeLocWeight[k] > heaviestEdgeWt) ||
((edgeLocWeight[k] == heaviestEdgeWt) && (w < verLocInd[k])))
{
heaviestEdgeWt = edgeLocWeight[k];
w = verLocInd[k];
}
} // End of for loop
// End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
return w;
}

@@ -80,9 +80,11 @@ class staticQueue
MilanLongInt squeueTail;
MilanLongInt NumNodes;
//FIXME I had to comment this piece of code in order to make everything work.
// why?
//Prevent Assignment and Pass by Value:
//staticQueue(const staticQueue& src);
//staticQueue& operator=(const staticQueue& rhs);
public:
//Constructors and Destructors

@@ -0,0 +1,31 @@
#include "MatchBoxPC.h"
void extractUChunk(
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU)
{
UChunkBeingProcessed.clear();
#pragma omp critical(U)
{
if (U.empty() && !privateU.empty()) // If U is empty but there are nodes in private U
{
while (!privateU.empty())
{
UChunkBeingProcessed.push_back(privateU.back());
privateU.pop_back();
}
}
else
{
for (int i = 0; i < UCHUNK; i++)
{ // Pop the new nodes
if (U.empty())
break;
UChunkBeingProcessed.push_back(U.back());
U.pop_back();
}
}
} // End of critical U
}
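// Design note: pulling up to UCHUNK vertices out of U per critical section
// amortizes the locking cost; when the shared queue is empty a thread first
// drains its own privateU instead.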

@@ -0,0 +1,29 @@
#include "MatchBoxPC.h"
/// Find the owner of a ghost node:
MilanInt findOwnerOfGhost(MilanLongInt vtxIndex, MilanLongInt *mVerDistance,
MilanInt myRank, MilanInt numProcs)
{
MilanLongInt mStartInd = mVerDistance[myRank];
MilanInt Start = 0;
MilanInt End = numProcs;
MilanInt Current = 0;
while (Start <= End)
{
Current = (End + Start) / 2;
// CASE-1:
if (mVerDistance[Current] == vtxIndex) return Current;
else // CASE 2:
if (mVerDistance[Current] > vtxIndex)
End = Current - 1;
else // CASE 3:
Start = Current + 1;
} // End of While()
if (mVerDistance[Current] > vtxIndex)
return (Current - 1);
return Current;
} // End of findOwnerOfGhost()
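// Illustrative example (assumed data, not from the original source): with
// numProcs = 2 and mVerDistance = {0, 4, 9}, findOwnerOfGhost(5, ...) probes
// mVerDistance[1] = 4 (not greater than 5), then mVerDistance[2] = 9 (> 5),
// and returns 1: vertex 5 is owned by rank 1.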

@@ -0,0 +1,304 @@
#include "MatchBoxPC.h"
void initialize(MilanLongInt NLVer, MilanLongInt NLEdge,
MilanLongInt StartIndex, MilanLongInt EndIndex,
MilanLongInt *numGhostEdges,
MilanLongInt *numGhostVertices,
MilanLongInt *S,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &Counter,
vector<MilanLongInt> &verGhostPtr,
vector<MilanLongInt> &verGhostInd,
vector<MilanLongInt> &tempCounter,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Message,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
MilanLongInt *&candidateMate,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt insertMe = 0;
MilanLongInt adj1, adj2;
MilanLongInt i, v, k, w;
// index that starts with zero to |Vg| - 1
map<MilanLongInt, MilanLongInt>::iterator storedAlready;
#pragma omp parallel private(insertMe, k, w, v, adj1, adj2) firstprivate(StartIndex, EndIndex) default(shared) num_threads(NUM_THREAD)
{
#pragma omp single
{
#ifdef TIME_TRACKER
double Ghost2LocalInitialization = MPI_Wtime();
#endif
/*
* OMP Ghost2LocalInitialization
* This loop analyzes all the edges and, when it finds a ghost edge,
* puts it in the Ghost2LocalMap.
* A critical region is needed when inserting data into the map.
*
* Despite the critical region it is still profitable to
* parallelize this loop, because the critical region is executed
* only when a ghost edge is found, and ghost edges are a minority,
* circa 3.5% during the tests.
*/
#pragma omp task depend(out \
: *numGhostEdges, Counter, Ghost2LocalMap, insertMe, storedAlready, *numGhostVertices)
{
#pragma omp taskloop num_tasks(NUM_THREAD) reduction(+ \
: numGhostEdges[:1])
for (i = 0; i < NLEdge; i++)
{ // O(m) - Each edge stored twice
insertMe = verLocInd[i];
if ((insertMe < StartIndex) || (insertMe > EndIndex))
{ // Find a ghost
(*numGhostEdges)++;
#pragma omp critical
{
storedAlready = Ghost2LocalMap.find(insertMe);
if (storedAlready != Ghost2LocalMap.end())
{ // Has already been added
Counter[storedAlready->second]++; // Increment the counter
}
else
{ // Insert an entry for the ghost:
Ghost2LocalMap[insertMe] = *numGhostVertices; // Add a map entry
Counter.push_back(1); // Initialize the counter
(*numGhostVertices)++; // Increment the number of ghost vertices
} // End of else()
}
} // End of if ( (insertMe < StartIndex) || (insertMe > EndIndex) )
} // End of for(ghost vertices)
} // end of task depend
// *numGhostEdges = atomicNumGhostEdges;
#ifdef TIME_TRACKER
Ghost2LocalInitialization = MPI_Wtime() - Ghost2LocalInitialization;
fprintf(stderr, "Ghost2LocalInitialization time: %f\n", Ghost2LocalInitialization);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")NGhosts:" << *numGhostVertices << " GhostEdges: " << *numGhostEdges;
if (!Ghost2LocalMap.empty())
{
cout << "\n(" << myRank << ")Final Map : on process ";
cout << "\n(" << myRank << ")Key \t Value \t Counter \n";
fflush(stdout);
storedAlready = Ghost2LocalMap.begin();
do
{
cout << storedAlready->second << " - " << storedAlready->first << " : " << Counter[storedAlready->second] << endl;
fflush(stdout);
storedAlready++;
} while (storedAlready != Ghost2LocalMap.end());
}
#endif
#pragma omp task depend(out \
: verGhostPtr, tempCounter, verGhostInd, GMate) depend(in \
: *numGhostVertices, *numGhostEdges)
{
// Initialize adjacency Lists for Ghost Vertices:
try
{
verGhostPtr.reserve(*numGhostVertices + 1); // Pointer Vector
tempCounter.reserve(*numGhostVertices); // Pointer Vector
verGhostInd.reserve(*numGhostEdges); // Index Vector
GMate.reserve(*numGhostVertices); // Ghost Mate Vector
}
catch (const length_error &)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
// Initialize the Vectors:
verGhostPtr.resize(*numGhostVertices + 1, 0); // Pointer Vector
tempCounter.resize(*numGhostVertices, 0); // Temporary Counter
verGhostInd.resize(*numGhostEdges, -1); // Index Vector
GMate.resize(*numGhostVertices, -1); // Ghost Mate Vector
verGhostPtr[0] = 0; // The first value
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Ghost Vertex Pointer: ";
fflush(stdout);
#endif
} // End of task
#pragma omp task depend(out \
: verGhostPtr) depend(in \
: Counter, *numGhostVertices)
{
#ifdef TIME_TRACKER
double verGhostPtrInitialization = MPI_Wtime();
#endif
for (i = 0; i < *numGhostVertices; i++)
{ // O(|Ghost Vertices|)
verGhostPtr[i + 1] = verGhostPtr[i] + Counter[i];
#ifdef PRINT_DEBUG_INFO_
cout << verGhostPtr[i] << "\t";
fflush(stdout);
#endif
}
#ifdef TIME_TRACKER
verGhostPtrInitialization = MPI_Wtime() - verGhostPtrInitialization;
fprintf(stderr, "verGhostPtrInitialization time: %f\n", verGhostPtrInitialization);
#endif
} // End of task
#ifdef PRINT_DEBUG_INFO_
if (*numGhostVertices > 0)
cout << verGhostPtr[*numGhostVertices] << "\n";
fflush(stdout);
#endif
#ifdef TIME_TRACKER
double verGhostIndInitialization = MPI_Wtime();
#endif
/*
* OMP verGhostIndInitialization
*
* In this loop verGhostInd is initialized
* with the data related to ghost edges.
* The check to see whether a node is a ghost node is
* executed in parallel, and when a ghost node
* is found a critical region is entered.
*
* Despite the critical region it is still useful to
* parallelize the loop, because the ghost nodes
* are a minority, hence the critical region is executed
* only a few times, circa 3.5% of the times in the tests.
*/
#pragma omp task depend(in \
: insertMe, Ghost2LocalMap, tempCounter, verGhostPtr) depend(out \
: verGhostInd)
{
#pragma omp taskloop num_tasks(NUM_THREAD)
for (v = 0; v < NLVer; v++)
{
adj1 = verLocPtr[v]; // Vertex Pointer
adj2 = verLocPtr[v + 1];
for (k = adj1; k < adj2; k++)
{
w = verLocInd[k]; // Get the adjacent vertex
if ((w < StartIndex) || (w > EndIndex))
{ // Find a ghost
#pragma omp critical
{
insertMe = verGhostPtr[Ghost2LocalMap[w]] + tempCounter[Ghost2LocalMap[w]]; // Where to insert
tempCounter[Ghost2LocalMap[w]]++; // Increment the counter
}
verGhostInd[insertMe] = v + StartIndex; // Add the adjacency
} // End of if((w < StartIndex) || (w > EndIndex))
} // End of for(k)
} // End of for (v)
} // End of task
#ifdef TIME_TRACKER
verGhostIndInitialization = MPI_Wtime() - verGhostIndInitialization;
fprintf(stderr, "verGhostIndInitialization time: %f\n", verGhostIndInitialization);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Ghost Vertex Index: ";
for (v = 0; v < *numGhostEdges; v++)
cout << verGhostInd[v] << "\t";
cout << endl;
fflush(stdout);
#endif
#pragma omp task depend(in \
: *numGhostEdges) depend(out \
: QLocalVtx, QGhostVtx, QMsgType, QOwner)
{
try
{
QLocalVtx.reserve(*numGhostEdges); // Local Vertex
QGhostVtx.reserve(*numGhostEdges); // Ghost Vertex
QMsgType.reserve(*numGhostEdges); // Message Type (Request/Failure)
QOwner.reserve(*numGhostEdges); // Owner of the ghost: compute once and use later
}
catch (const length_error &)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
} // end of task
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Allocating CandidateMate.. ";
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Setup Time :" << *ph0_time << endl;
fflush(stdout);
#endif
#pragma omp task depend(in \
: *numGhostVertices) depend(out \
: candidateMate, S, U, privateU, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner)
{
// Allocate Data Structures:
/*
* candidateMate was a vector and has been replaced with an array;
* there is no point in using the vector (or maybe there is (???)),
* so I replaced it with an array, which is slightly faster.
*/
candidateMate = new MilanLongInt[NLVer + (*numGhostVertices)];
*S = (*numGhostVertices); // Initialize S with number of Ghost Vertices
/*
* Create the Queue Data Structure for the Dominating Set
*
* I had to declare the staticQueue U before the parallel region
* to have it in the correct scope. Since we can't change the dimension
* of a staticQueue I had to destroy the previous object and instantiate
* a new one of the correct size.
*/
//new (&U) staticQueue(NLVer + (*numGhostVertices));
U.reserve(NLVer + (*numGhostVertices));
// Initialize the private vectors
privateQLocalVtx.reserve(*numGhostVertices);
privateQGhostVtx.reserve(*numGhostVertices);
privateQMsgType.reserve(*numGhostVertices);
privateQOwner.reserve(*numGhostVertices);
privateU.reserve(*numGhostVertices);
} // end of task
} // End of single region
} // End of parallel region
}

@@ -0,0 +1,46 @@
#include "MatchBoxPC.h"
/**
* //TODO documentation
* @param k
* @param verLocInd
* @param StartIndex
* @param EndIndex
* @param GMate
* @param Mate
* @param Ghost2LocalMap
* @return
*/
bool isAlreadyMatched(MilanLongInt node,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap)
{
/*
#pragma omp critical(Mate)
{
if ((node < StartIndex) || (node > EndIndex)) { //Is it a ghost vertex?
result = GMate[Ghost2LocalMap[node]] >= 0;// Already matched
} else { //A local vertex
result = (Mate[node - StartIndex] >= 0); // Already matched
}
}
*/
MilanLongInt val;
if ((node < StartIndex) || (node > EndIndex)) // if ghost vertex
{
#pragma omp atomic read
val = GMate[Ghost2LocalMap[node]];
return val >= 0; // Already matched
}
// If not ghost vertex
#pragma omp atomic read
val = Mate[node - StartIndex];
return val >= 0; // Already matched
}
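// Note: the critical section kept above for reference has been replaced by
// the two atomic reads; a single aligned load is enough to test the
// matched/unmatched status without taking a lock.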

@@ -0,0 +1,28 @@
#include "MatchBoxPC.h"
void PARALLEL_COMPUTE_CANDIDATE_MATE_B(MilanLongInt NLVer,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanInt myRank,
MilanReal *edgeLocWeight,
MilanLongInt *candidateMate)
{
MilanLongInt v = -1;
#pragma omp parallel private(v) default(shared) num_threads(NUM_THREAD)
{
#pragma omp for schedule(static)
for (v = 0; v < NLVer; v++)
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl;
fflush(stdout);
#endif
// Start: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
candidateMate[v] = firstComputeCandidateMate(verLocPtr[v], verLocPtr[v + 1], verLocInd, edgeLocWeight);
// End: PARALLEL_COMPUTE_CANDIDATE_MATE_B(v)
}
}
}

@@ -0,0 +1,24 @@
#include "MatchBoxPC.h"
void PROCESS_CROSS_EDGE(MilanLongInt *edge,
MilanLongInt *S)
{
// Start: PARALLEL_PROCESS_CROSS_EDGE_B
MilanLongInt captureCounter;
#pragma omp atomic capture
captureCounter = --(*edge); // Decrement
//assert(captureCounter >= 0);
if (captureCounter == 0)
#pragma omp atomic
(*S)--; // Decrement S
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Decrementing S: Ghost vertex " << edge << " has received all its messages";
fflush(stdout);
#endif
// End: PARALLEL_PROCESS_CROSS_EDGE_B
}
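// Usage note (from the call sites in this changeset): Counter[ghost] holds
// the number of messages still expected across that cross edge, and S counts
// the ghost vertices with pending messages; the main loop of the matching
// driver terminates when S reaches 0.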

@@ -0,0 +1,205 @@
#include "MatchBoxPC.h"
void PARALLEL_PROCESS_EXPOSED_VERTEX_B(MilanLongInt NLVer,
MilanLongInt *candidateMate,
MilanLongInt *verLocInd,
MilanLongInt *verLocPtr,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *Mate,
vector<MilanLongInt> &GMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *S,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt v = -1, k = -1, w = -1, adj11 = 0, adj12 = 0, k1 = 0;
MilanInt ghostOwner = 0, option;
#pragma omp parallel private(option, k, w, v, k1, adj11, adj12, ghostOwner) \
firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner) default(shared) \
num_threads(NUM_THREAD)
{
#pragma omp for reduction(+ \
: PCounter[:numProcs], myCard \
[:1], msgInd \
[:1], NumMessagesBundled \
[:1]) schedule(static)
for (v = 0; v < NLVer; v++)
{
option = -1;
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
k = candidateMate[v];
candidateMate[v] = verLocInd[k];
w = candidateMate[v];
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing: " << v + StartIndex << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v + StartIndex << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0)
{
#pragma omp critical(processExposed)
{
if (isAlreadyMatched(verLocInd[k], StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap))
{
w = computeCandidateMate(verLocPtr[v],
verLocPtr[v + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v] = w;
}
if (w >= 0)
{
(*myCard)++;
if ((w < StartIndex) || (w > EndIndex))
{ // w is a ghost vertex
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v + StartIndex)
{
option = 1;
Mate[v] = w;
GMate[Ghost2LocalMap[w]] = v + StartIndex; // w is a Ghost
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else
{ // w is a local vertex
if (candidateMate[w - StartIndex] == (v + StartIndex))
{
option = 3;
Mate[v] = w; // v is local
Mate[w - StartIndex] = v + StartIndex; // w is local
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ") ";
fflush(stdout);
#endif
} // End of if ( candidateMate[w-StartIndex] == (v+StartIndex) )
} // End of Else
} // End of second if
} // End critical processExposed
} // End of if(w >=0)
else
{
// This piece of code is executed only a very small number of times
adj11 = verLocPtr[v];
adj12 = verLocPtr[v + 1];
for (k1 = adj11; k1 < adj12; k1++)
{
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex))
{ // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
(*msgInd)++;
(*NumMessagesBundled)++;
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
privateQLocalVtx.push_back(v + StartIndex);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
}
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
switch (option)
{
case -1:
break;
case 1:
privateU.push_back(v + StartIndex);
privateU.push_back(w);
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v + StartIndex << "," << w << ")";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S);
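// NOTE: no break here; execution falls through into case 2 (as in
// processMatchedVertices) so that the REQUEST message to the owner of the
// ghost w is still sent.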
case 2:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message (291):";
cout << "\n(" << myRank << ")Local is: " << v + StartIndex << " Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
(*msgInd)++;
(*NumMessagesBundled)++;
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
privateQLocalVtx.push_back(v + StartIndex);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
default:
privateU.push_back(v + StartIndex);
privateU.push_back(w);
break;
}
} // End of for ( v=0; v < NLVer; v++ )
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
} // End of parallel region
}

@@ -0,0 +1,309 @@
#include "MatchBoxPC.h"
void processMatchedVertices(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner;
int option;
MilanLongInt mateVal;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef COUNT_LOCAL_VERTEX
MilanLongInt localVertices = 0;
#endif
#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
firstprivate(privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \
num_threads(NUM_THREAD) \
reduction(+ \
: msgInd[:1], PCounter \
[:numProcs], myCard \
[:1], NumMessagesBundled \
[:1])
{
while (!U.empty())
{
extractUChunk(UChunkBeingProcessed, U, privateU);
for (MilanLongInt u : UChunkBeingProcessed)
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")u: " << u;
fflush(stdout);
#endif
if ((u >= StartIndex) && (u <= EndIndex))
{ // Process Only the Local Vertices
#ifdef COUNT_LOCAL_VERTEX
localVertices++;
#endif
// Get the Adjacency list for u
adj1 = verLocPtr[u - StartIndex]; // Pointer
adj2 = verLocPtr[u - StartIndex + 1];
for (k = adj1; k < adj2; k++)
{
option = -1;
v = verLocInd[k];
if ((v >= StartIndex) && (v <= EndIndex))
{ // If Local Vertex:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v];
fflush(stdout);
#endif
#pragma omp atomic read
mateVal = Mate[v - StartIndex];
// If the current vertex is pointing to a matched vertex and is not matched
if (mateVal < 0)
{
#pragma omp critical
{
if (candidateMate[v - StartIndex] == u)
{
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex],
verLocPtr[v - StartIndex + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0)
{
if ((w < StartIndex) || (w > EndIndex))
{ // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message:";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
#endif
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v)
{
option = 1;
Mate[v - StartIndex] = w; // v is a local vertex
GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else
{ // w is a local vertex
if (candidateMate[w - StartIndex] == v)
{
option = 3;
Mate[v - StartIndex] = w; // v is a local vertex
Mate[w - StartIndex] = v; // w is a local vertex
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else
option = 4; // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of If (candidateMate[v-StartIndex] == u
} // End of task
} // mateval < 0
} // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex:
else
{ // Neighbor is a ghost vertex
#pragma omp critical
{
if (candidateMate[NLVer + Ghost2LocalMap[v]] == u)
candidateMate[NLVer + Ghost2LocalMap[v]] = -1;
if (v != Mate[u - StartIndex])
option = 5; // u is local
} // End of critical
} // End of Else //A Ghost Vertex
switch (option)
{
case -1:
// Nothing to do
break;
case 1:
// Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr);
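// NOTE: no break here; case 1 deliberately falls through into case 2 so
// the REQUEST message is still queued for the owner of the ghost w.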
case 2:
// Found a dominating edge, it is a ghost
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
(*NumMessagesBundled)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
break;
case 4:
// Could not find a dominating vertex
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++)
{
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex))
{ // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
PCounter[ghostOwner]++;
(*NumMessagesBundled)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
break;
case 5:
default:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a success message: ";
cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n";
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
// assert(ghostOwner != -1);
// assert(ghostOwner != myRank);
(*NumMessagesBundled)++;
PCounter[ghostOwner]++;
(*msgInd)++;
privateQLocalVtx.push_back(u);
privateQGhostVtx.push_back(v);
privateQMsgType.push_back(SUCCESS);
privateQOwner.push_back(ghostOwner);
break;
} // End of switch
} // End of inner for
}
} // End of outer for
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
} // End of while ( !U.empty() )
#ifdef COUNT_LOCAL_VERTEX
printf("Count local vertexes: %ld for thread %d of processor %d\n",
localVertices,
omp_get_thread_num(),
myRank);
#endif
} // End of parallel region
}

@ -0,0 +1,315 @@
#include "MatchBoxPC.h"
void processMatchedVerticesAndSendMessages(
MilanLongInt NLVer,
vector<MilanLongInt> &UChunkBeingProcessed,
vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *NumMessagesBundled,
MilanLongInt *SPtr,
MilanLongInt *verLocPtr,
MilanLongInt *verLocInd,
MilanLongInt *verDistance,
MilanLongInt *PCounter,
vector<MilanLongInt> &Counter,
MilanInt myRank,
MilanInt numProcs,
MilanLongInt *candidateMate,
vector<MilanLongInt> &GMate,
MilanLongInt *Mate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
MilanReal *edgeLocWeight,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner,
MPI_Comm comm,
MilanLongInt *msgActual,
vector<MilanLongInt> &Message)
{
MilanLongInt initialSize = QLocalVtx.size();
MilanLongInt adj1, adj2, adj11, adj12, k, k1, v = -1, w = -1, ghostOwner;
int option;
MilanLongInt mateVal;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
#ifdef COUNT_LOCAL_VERTEX
MilanLongInt localVertices = 0;
#endif
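// Unlike processMatchedVertices, this variant also accumulates msgActual
// in the reduction and performs the actual MPI_Bsend calls after the
// parallel region (see the send loop at the bottom of this function).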
#pragma omp parallel private(k, w, v, k1, adj1, adj2, adj11, adj12, ghostOwner, option) \
firstprivate(Message, privateU, StartIndex, EndIndex, privateQLocalVtx, privateQGhostVtx, privateQMsgType, privateQOwner, UChunkBeingProcessed) default(shared) \
num_threads(NUM_THREAD) \
reduction(+ : msgInd[:1], PCounter[:numProcs], myCard[:1], NumMessagesBundled[:1], msgActual[:1])
{
while (!U.empty())
{
extractUChunk(UChunkBeingProcessed, U, privateU);
for (MilanLongInt u : UChunkBeingProcessed)
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")u: " << u;
fflush(stdout);
#endif
if ((u >= StartIndex) && (u <= EndIndex))
{ // Process Only the Local Vertices
#ifdef COUNT_LOCAL_VERTEX
localVertices++;
#endif
// Get the Adjacency list for u
adj1 = verLocPtr[u - StartIndex]; // Pointer
adj2 = verLocPtr[u - StartIndex + 1];
for (k = adj1; k < adj2; k++)
{
option = -1;
v = verLocInd[k];
if ((v >= StartIndex) && (v <= EndIndex))
{ // If Local Vertex:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")v: " << v << " c(v)= " << candidateMate[v - StartIndex] << " Mate[v]: " << Mate[v];
fflush(stdout);
#endif
#pragma omp atomic read
mateVal = Mate[v - StartIndex];
// Process v only if it is not yet matched
if (mateVal < 0)
{
#pragma omp critical
{
if (candidateMate[v - StartIndex] == u)
{
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex],
verLocPtr[v - StartIndex + 1],
edgeLocWeight, 0,
verLocInd,
StartIndex,
EndIndex,
GMate,
Mate,
Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0)
{
if ((w < StartIndex) || (w > EndIndex))
{ // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message:";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
#endif
option = 2;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v)
{
option = 1;
Mate[v - StartIndex] = w; // v is a local vertex
GMate[Ghost2LocalMap[w]] = v; // w is a ghost vertex
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else
{ // w is a local vertex
if (candidateMate[w - StartIndex] == v)
{
option = 3;
Mate[v - StartIndex] = w; // v is a local vertex
Mate[w - StartIndex] = v; // w is a local vertex
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else
option = 4; // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of If (candidateMate[v-StartIndex] == u
} // End of task
} // mateval < 0
} // End of if ( (v >= StartIndex) && (v <= EndIndex) ) //If Local Vertex:
else
{ // Neighbor is a ghost vertex
#pragma omp critical
{
if (candidateMate[NLVer + Ghost2LocalMap[v]] == u)
candidateMate[NLVer + Ghost2LocalMap[v]] = -1;
if (v != Mate[u - StartIndex])
option = 5; // u is local
} // End of critical
} // End of Else //A Ghost Vertex
switch (option)
{
case -1:
// Nothing to do
break;
case 1:
// Found a dominating edge, it is a ghost and candidateMate[NLVer + Ghost2LocalMap[w]] == v
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") ";
fflush(stdout);
#endif
// Decrement the counter:
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], SPtr);
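// NOTE: no break here; case 1 deliberately falls through into case 2 so
// the REQUEST message is still counted and queued for the owner of w.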
case 2:
// Found a dominating edge, it is a ghost
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = v; // LOCAL
// Message[1] = w; // GHOST
// Message[2] = REQUEST; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(REQUEST);
privateQOwner.push_back(ghostOwner);
break;
case 3:
privateU.push_back(v);
privateU.push_back(w);
(*myCard)++;
break;
case 4:
// Could not find a dominating vertex
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++)
{
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex))
{ // A ghost
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs);
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = v; // LOCAL
// Message[1] = w; // GHOST
// Message[2] = FAILURE; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(v);
privateQGhostVtx.push_back(w);
privateQMsgType.push_back(FAILURE);
privateQOwner.push_back(ghostOwner);
} // End of if(GHOST)
} // End of for loop
break;
case 5:
default:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a success message: ";
cout << "\n(" << myRank << ")Ghost is " << v << " Owner is: " << findOwnerOfGhost(v, verDistance, myRank, numProcs) << "\n";
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(v, verDistance, myRank, numProcs);
// Build the Message Packet:
// Message[0] = u; // LOCAL
// Message[1] = v; // GHOST
// Message[2] = SUCCESS; // TYPE
// Send a Request (Asynchronous)
// MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgActual)++;
(*msgInd)++;
privateQLocalVtx.push_back(u);
privateQGhostVtx.push_back(v);
privateQMsgType.push_back(SUCCESS);
privateQOwner.push_back(ghostOwner);
break;
} // End of switch
} // End of inner for
}
} // End of outer for
queuesTransfer(U, privateU, QLocalVtx,
QGhostVtx,
QMsgType, QOwner, privateQLocalVtx,
privateQGhostVtx,
privateQMsgType,
privateQOwner);
} // End of while ( !U.empty() )
#ifdef COUNT_LOCAL_VERTEX
printf("Count local vertexes: %ld for thread %d of processor %d\n",
localVertices,
omp_get_thread_num(),
myRank);
#endif
} // End of parallel region
// Send the messages
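// Only the entries appended during this call (from initialSize onward) are
// flushed here; entries already present on entry are assumed to have been
// sent earlier (e.g., as bundled messages).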
for (MilanLongInt i = initialSize; i < (MilanLongInt)QOwner.size(); i++)
{
Message[0] = QLocalVtx[i];
Message[1] = QGhostVtx[i];
Message[2] = QMsgType[i];
ghostOwner = QOwner[i];
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
}
}

@ -0,0 +1,329 @@
#include "MatchBoxPC.h"
void processMessages(
MilanLongInt NLVer,
MilanLongInt *Mate,
MilanLongInt *candidateMate,
map<MilanLongInt, MilanLongInt> &Ghost2LocalMap,
vector<MilanLongInt> &GMate,
vector<MilanLongInt> &Counter,
MilanLongInt StartIndex,
MilanLongInt EndIndex,
MilanLongInt *myCard,
MilanLongInt *msgInd,
MilanLongInt *msgActual,
MilanReal *edgeLocWeight,
MilanLongInt *verDistance,
MilanLongInt *verLocPtr,
MilanLongInt k,
MilanLongInt *verLocInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &Message,
MilanLongInt numGhostEdges,
MilanLongInt u,
MilanLongInt v,
MilanLongInt *S,
vector<MilanLongInt> &U)
{
//#define PRINT_DEBUG_INFO_
MilanInt Sender;
MPI_Status computeStatus;
MilanLongInt bundleSize, w;
MilanLongInt adj11, adj12, k1;
MilanLongInt ghostOwner;
int error_codeC;
error_codeC = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
char error_message[MPI_MAX_ERROR_STRING];
int message_length;
MilanLongInt message_type = 0;
// Buffer to receive bundled messages
// Maximum messages that can be received from any processor is
// twice the edge cut: REQUEST; REQUEST+(FAILURE/SUCCESS)
vector<MilanLongInt> ReceiveBuffer;
try
{
ReceiveBuffer.reserve(numGhostEdges * 2 * 3); // Three integers per cross edge
}
catch (const length_error &)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
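// Sizing example: with numGhostEdges = 1000, at most 2 * 1000 messages of
// 3 integers each can arrive, so 6000 entries are reserved up front.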
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")=========================************===============================" << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")About to begin Message processing phase ... *S=" << *S << endl;
fflush(stdout);
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << "=========================************===============================" << endl;
fflush(stdout);
fflush(stdout);
#endif
// BLOCKING RECEIVE:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << " Waiting for blocking receive..." << endl;
fflush(stdout);
fflush(stdout);
#endif
error_codeC = MPI_Recv(&Message[0], 3, TypeMap<MilanLongInt>(), MPI_ANY_SOURCE, ComputeTag, comm, &computeStatus);
if (error_codeC != MPI_SUCCESS)
{
MPI_Error_string(error_codeC, error_message, &message_length);
cout << "\n*Error in call to MPI_Receive on Slave: " << error_message << "\n";
fflush(stdout);
}
Sender = computeStatus.MPI_SOURCE;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received message from Process " << Sender << " Type= " << Message[2] << endl;
fflush(stdout);
#endif
if (Message[2] == SIZEINFO)
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received bundled message from Process " << Sender << " Size= " << Message[0] << endl;
fflush(stdout);
#endif
bundleSize = Message[0]; //#of integers in the message
// Build the Message Buffer:
if (!ReceiveBuffer.empty())
ReceiveBuffer.clear(); // Empty it out first
ReceiveBuffer.resize(bundleSize, -1); // Initialize
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message Bundle Before: " << endl;
for (int i = 0; i < bundleSize; i++)
cout << ReceiveBuffer[i] << ",";
cout << endl;
fflush(stdout);
#endif
// Receive the message
error_codeC = MPI_Recv(&ReceiveBuffer[0], bundleSize, TypeMap<MilanLongInt>(), Sender, BundleTag, comm, &computeStatus);
if (error_codeC != MPI_SUCCESS)
{
MPI_Error_string(error_codeC, error_message, &message_length);
cout << "\n*Error in call to MPI_Receive on processor " << myRank << " Error: " << error_message << "\n";
fflush(stdout);
}
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message Bundle After: " << endl;
for (int i = 0; i < bundleSize; i++)
cout << ReceiveBuffer[i] << ",";
cout << endl;
fflush(stdout);
#endif
}
else
{ // Just a single message:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Received regular message from Process " << Sender << " u= " << Message[0] << " v= " << Message[1] << endl;
fflush(stdout);
#endif
// Add the current message to Queue:
bundleSize = 3; //#of integers in the message
// Build the Message Buffer:
if (!ReceiveBuffer.empty())
ReceiveBuffer.clear(); // Empty it out first
ReceiveBuffer.resize(bundleSize, -1); // Initialize
ReceiveBuffer[0] = Message[0]; // u
ReceiveBuffer[1] = Message[1]; // v
ReceiveBuffer[2] = Message[2]; // message_type
}
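// From here on, bundled and single messages are handled uniformly:
// ReceiveBuffer holds (ghost vertex u, local vertex v, message type)
// triples, consumed by the loop below.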
#ifdef DEBUG_GHOST_
if ((v < StartIndex) || (v > EndIndex))
{
cout << "\n(" << myRank << ") From ReceiveBuffer: This should not happen: u= " << u << " v= " << v << " Type= " << message_type << " StartIndex " << StartIndex << " EndIndex " << EndIndex << endl;
fflush(stdout);
}
#endif
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Processing message: u= " << u << " v= " << v << " Type= " << message_type << endl;
fflush(stdout);
#endif
// Most of the time bundleSize == 3, so it is not worth parallelizing this loop
for (MilanLongInt bundleCounter = 3; bundleCounter < bundleSize + 3; bundleCounter += 3)
{
u = ReceiveBuffer[bundleCounter - 3]; // GHOST
v = ReceiveBuffer[bundleCounter - 2]; // LOCAL
message_type = ReceiveBuffer[bundleCounter - 1]; // TYPE
// CASE I: REQUEST
if (message_type == REQUEST)
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is REQUEST" << endl;
fflush(stdout);
#endif
#ifdef DEBUG_GHOST_
if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer))
{
cout << "\n(" << myRank << ") case 1 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl;
fflush(stdout);
}
#endif
if (Mate[v - StartIndex] == -1)
{ // Process only if not already matched (v is local)
candidateMate[NLVer + Ghost2LocalMap[u]] = v; // Set CandidateMate for the ghost
if (candidateMate[v - StartIndex] == u)
{
GMate[Ghost2LocalMap[u]] = v; // u is ghost
Mate[v - StartIndex] = u; // v is local
U.push_back(v);
U.push_back(u);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << u << ") " << endl;
fflush(stdout);
#endif
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S);
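// PROCESS_CROSS_EDGE (defined elsewhere) is assumed to decrement the
// cross-edge counter of this ghost and to update *S when it reaches zero.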
} // End of if ( candidateMate[v-StartIndex] == u )
} // End of if ( Mate[v] == -1 )
} // End of REQUEST
else
{ // CASE II: SUCCESS
if (message_type == SUCCESS)
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is SUCCESS" << endl;
fflush(stdout);
#endif
GMate[Ghost2LocalMap[u]] = EndIndex + 1; // u is a ghost: set a dummy mate so it is not processed again
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S);
#ifdef DEBUG_GHOST_
if ((v < 0) || (v < StartIndex) || ((v - StartIndex) > NLVer))
{
cout << "\n(" << myRank << ") case 2 Bad address " << v << " " << StartIndex << " " << v - StartIndex << " " << NLVer << endl;
fflush(stdout);
}
#endif
if (Mate[v - StartIndex] == -1)
{ // Process only if not already matched ( v is local)
if (candidateMate[v - StartIndex] == u)
{
// Start: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
w = computeCandidateMate(verLocPtr[v - StartIndex], verLocPtr[v - StartIndex + 1], edgeLocWeight, k, verLocInd, StartIndex, EndIndex, GMate, Mate, Ghost2LocalMap);
candidateMate[v - StartIndex] = w;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")" << v << " Points to: " << w << endl;
fflush(stdout);
#endif
// If found a dominating edge:
if (w >= 0)
{
if ((w < StartIndex) || (w > EndIndex))
{ // w is a ghost
// Build the Message Packet:
Message[0] = v; // LOCAL
Message[1] = w; // GHOST
Message[2] = REQUEST; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a request message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
//assert(ghostOwner != -1);
//assert(ghostOwner != myRank);
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgInd)++;
(*msgActual)++;
if (candidateMate[NLVer + Ghost2LocalMap[w]] == v)
{
Mate[v - StartIndex] = w; // v is local
GMate[Ghost2LocalMap[w]] = v; // w is ghost
U.push_back(v);
U.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl;
fflush(stdout);
#endif
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[w]], S);
} // End of if CandidateMate[w] = v
} // End of if a Ghost Vertex
else
{ // w is a local vertex
if (candidateMate[w - StartIndex] == v)
{
Mate[v - StartIndex] = w; // v is local
Mate[w - StartIndex] = v; // w is local
// Q.push_back(u);
U.push_back(v);
U.push_back(w);
(*myCard)++;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")MATCH: (" << v << "," << w << ") " << endl;
fflush(stdout);
#endif
} // End of if(CandidateMate(w) = v
} // End of Else
} // End of if(w >=0)
else
{ // No dominant edge found
adj11 = verLocPtr[v - StartIndex];
adj12 = verLocPtr[v - StartIndex + 1];
for (k1 = adj11; k1 < adj12; k1++)
{
w = verLocInd[k1];
if ((w < StartIndex) || (w > EndIndex))
{ // A ghost
// Build the Message Packet:
Message[0] = v; // LOCAL
Message[1] = w; // GHOST
Message[2] = FAILURE; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending a failure message: ";
cout << "\n(" << myRank << ")Ghost is " << w << " Owner is: " << findOwnerOfGhost(w, verDistance, myRank, numProcs) << endl;
fflush(stdout);
#endif
ghostOwner = findOwnerOfGhost(w, verDistance, myRank, numProcs);
//assert(ghostOwner != -1);
//assert(ghostOwner != myRank);
MPI_Bsend(&Message[0], 3, TypeMap<MilanLongInt>(), ghostOwner, ComputeTag, comm);
(*msgInd)++;
(*msgActual)++;
} // End of if(GHOST)
} // End of for loop
} // End of Else: w == -1
// End: PARALLEL_PROCESS_EXPOSED_VERTEX_B(v)
} // End of if ( candidateMate[v-StartIndex] == u )
} // End of if ( Mate[v] == -1 )
} // End of if ( message_type == SUCCESS )
else
{ // CASE III: FAILURE
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Message type is FAILURE" << endl;
fflush(stdout);
#endif
GMate[Ghost2LocalMap[u]] = EndIndex + 1; // u is a ghost: set a dummy mate so it is not processed again
PROCESS_CROSS_EDGE(&Counter[Ghost2LocalMap[u]], S); // Decrease the counter
} // End of else: CASE III
} // End of else: CASE I
}
return;
}

@ -0,0 +1,35 @@
#include "MatchBoxPC.h"
void queuesTransfer(vector<MilanLongInt> &U,
vector<MilanLongInt> &privateU,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MilanLongInt> &privateQLocalVtx,
vector<MilanLongInt> &privateQGhostVtx,
vector<MilanLongInt> &privateQMsgType,
vector<MilanInt> &privateQOwner)
{
#pragma omp critical(U)
{
U.insert(U.end(), privateU.begin(), privateU.end());
}
privateU.clear();
#pragma omp critical(sendMessageTransfer)
{
QLocalVtx.insert(QLocalVtx.end(), privateQLocalVtx.begin(), privateQLocalVtx.end());
QGhostVtx.insert(QGhostVtx.end(), privateQGhostVtx.begin(), privateQGhostVtx.end());
QMsgType.insert(QMsgType.end(), privateQMsgType.begin(), privateQMsgType.end());
QOwner.insert(QOwner.end(), privateQOwner.begin(), privateQOwner.end());
}
privateQLocalVtx.clear();
privateQGhostVtx.clear();
privateQMsgType.clear();
privateQOwner.clear();
}

@ -0,0 +1,210 @@
#include "MatchBoxPC.h"
void sendBundledMessages(MilanLongInt *numGhostEdges,
MilanInt *BufferSize,
MilanLongInt *Buffer,
vector<MilanLongInt> &PCumulative,
vector<MilanLongInt> &PMessageBundle,
vector<MilanLongInt> &PSizeInfoMessages,
MilanLongInt *PCounter,
MilanLongInt NumMessagesBundled,
MilanLongInt *msgActual,
MilanLongInt *msgInd,
MilanInt numProcs,
MilanInt myRank,
MPI_Comm comm,
vector<MilanLongInt> &QLocalVtx,
vector<MilanLongInt> &QGhostVtx,
vector<MilanLongInt> &QMsgType,
vector<MilanInt> &QOwner,
vector<MPI_Request> &SRequest,
vector<MPI_Status> &SStatus)
{
MilanLongInt myIndex = 0, numMessagesToSend;
MilanInt i = 0, OneMessageSize = 0;
#ifdef DEBUG_HANG_
if (myRank == 0)
cout << "\n(" << myRank << ") Send Bundles" << endl;
fflush(stdout);
#endif
#pragma omp parallel private(i) default(shared) num_threads(NUM_THREAD)
{
#pragma omp master
{
// Data structures for Bundled Messages:
#pragma omp task depend(inout : PCumulative, PMessageBundle, PSizeInfoMessages) depend(in : NumMessagesBundled, numProcs)
{
try
{
PMessageBundle.reserve(NumMessagesBundled * 3); // Three integers per message
PCumulative.reserve(numProcs + 1); // Similar to Row Pointer vector in CSR data structure
PSizeInfoMessages.reserve(numProcs * 3); // Buffer to hold the Size info message packets
}
catch (const length_error &)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesMessageBundling: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
PMessageBundle.resize(NumMessagesBundled * 3, -1); // Initialize
PCumulative.resize(numProcs + 1, 0); // Only initialize the counter variable
PSizeInfoMessages.resize(numProcs * 3, 0);
}
#pragma omp task depend(inout : PCumulative) depend(in : PCounter)
{
for (i = 0; i < numProcs; i++)
PCumulative[i + 1] = PCumulative[i] + PCounter[i];
}
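// Illustrative example: with numProcs = 3 and PCounter = {2, 0, 3}, the
// exclusive prefix sum gives PCumulative = {0, 2, 2, 5}, a CSR-style row pointer.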
#pragma omp task depend(inout : PCounter)
{
// Reuse PCounter to keep track of how many messages were inserted:
for (MilanInt i = 0; i < numProcs; i++) // Changed by Fabio to be an integer; addresses need to be integers!
PCounter[i] = 0;
}
// Build the Message Bundle packet:
#pragma omp task depend(in : QLocalVtx, QGhostVtx, QMsgType, QOwner, PCumulative) depend(inout : PMessageBundle, PCounter) depend(out : myIndex)
{
for (i = 0; i < NumMessagesBundled; i++)
{
myIndex = (PCumulative[QOwner[i]] + PCounter[QOwner[i]]) * 3;
PMessageBundle[myIndex + 0] = QLocalVtx[i];
PMessageBundle[myIndex + 1] = QGhostVtx[i];
PMessageBundle[myIndex + 2] = QMsgType[i];
PCounter[QOwner[i]]++;
}
}
// Send the Bundled Messages: Use ISend
#pragma omp task depend(out : SRequest, SStatus)
{
try
{
SRequest.reserve(numProcs * 2); // At most two messages per processor
SStatus.reserve(numProcs * 2); // At most two messages per processor
}
catch (const length_error &)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearchImmediateSend: \n";
cout << "Not enough memory to allocate the internal variables \n";
exit(1);
}
}
// Send the Messages
#pragma omp task depend(inout : SRequest, PSizeInfoMessages, PCumulative) depend(out : *msgActual, *msgInd)
{
for (i = 0; i < numProcs; i++)
{ // Changed by Fabio to be an integer; addresses need to be integers!
if (i == myRank) // Do not send anything to yourself
continue;
// Send the Message with information about the size of next message:
// Build the Message Packet:
PSizeInfoMessages[i * 3 + 0] = (PCumulative[i + 1] - PCumulative[i]) * 3; // # of integers in the next message
PSizeInfoMessages[i * 3 + 1] = -1; // Dummy packet
PSizeInfoMessages[i * 3 + 2] = SIZEINFO; // TYPE
// Send a Request (Asynchronous)
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Sending bundled message to process " << i << " size: " << PSizeInfoMessages[i * 3 + 0] << endl;
fflush(stdout);
#endif
if (PSizeInfoMessages[i * 3 + 0] > 0)
{ // Send only if it is a nonempty packet
MPI_Isend(&PSizeInfoMessages[i * 3 + 0], 3, TypeMap<MilanLongInt>(), i, ComputeTag, comm,
&SRequest[(*msgInd)]);
(*msgActual)++;
(*msgInd)++;
// Now Send the message with the data packet:
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")SendiFFng Bundle to : " << i << endl;
for (k = (PCumulative[i] * 3); k < (PCumulative[i] * 3 + PSizeInfoMessages[i * 3 + 0]); k++)
cout << PMessageBundle[k] << ",";
cout << endl;
fflush(stdout);
#endif
MPI_Isend(&PMessageBundle[PCumulative[i] * 3], PSizeInfoMessages[i * 3 + 0],
TypeMap<MilanLongInt>(), i, BundleTag, comm, &SRequest[(*msgInd)]);
(*msgInd)++;
} // End of if size > 0
}
}
#pragma omp task depend(inout : PCumulative, QLocalVtx, QGhostVtx, QMsgType, QOwner)
{
// Free up temporary memory:
PCumulative.clear();
QLocalVtx.clear();
QGhostVtx.clear();
QMsgType.clear();
QOwner.clear();
}
#pragma omp task depend(inout : OneMessageSize, *BufferSize) depend(out : numMessagesToSend) depend(in : *numGhostEdges)
{
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges;
cout << "\n(" << myRank << ")Total number of potential message X 2 = " << *numGhostEdges * 2;
cout << "\n(" << myRank << ")Number messages already sent in bundles = " << NumMessagesBundled;
if (*numGhostEdges > 0)
{
cout << "\n(" << myRank << ")Percentage of total = " << ((double)NumMessagesBundled / (double)(*numGhostEdges * 2)) * 100.0 << "% \n";
}
fflush(stdout);
#endif
// Allocate memory for MPI Send messages:
/* WILL COME BACK HERE - NO NEED TO STORE ALL THIS MEMORY !! */
OneMessageSize = 0;
MPI_Pack_size(3, TypeMap<MilanLongInt>(), comm, &OneMessageSize); // Size of one message packet
// How many messages to send?
// Potentially three kinds of messages will be sent/received:
// Request, Success, Failure.
// But only two will be sent from a given processor.
// Subtract the number of messages that have already been sent as bundled messages:
numMessagesToSend = (*numGhostEdges) * 2 - NumMessagesBundled;
*BufferSize = (OneMessageSize + MPI_BSEND_OVERHEAD) * numMessagesToSend;
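// Illustrative example: *numGhostEdges = 100 and NumMessagesBundled = 150
// give numMessagesToSend = 2 * 100 - 150 = 50 remaining buffered sends.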
}
#pragma omp task depend(out : Buffer) depend(in : *BufferSize)
{
Buffer = 0;
#ifdef PRINT_DEBUG_INFO_
cout << "\n(" << myRank << ")Size of One Message from PACK= " << OneMessageSize;
cout << "\n(" << myRank << ")Size of Message overhead = " << MPI_BSEND_OVERHEAD;
cout << "\n(" << myRank << ")Number of Ghost edges = " << *numGhostEdges;
cout << "\n(" << myRank << ")Number of remaining message = " << numMessagesToSend;
cout << "\n(" << myRank << ")BufferSize = " << (*BufferSize);
cout << "\n(" << myRank << ")Attaching Buffer on.. ";
fflush(stdout);
#endif
if ((*BufferSize) > 0)
{
Buffer = (MilanLongInt *)malloc((*BufferSize)); // Allocate memory
if (Buffer == 0)
{
cout << "Error in function algoDistEdgeApproxDominatingEdgesLinearSearch: \n";
cout << "Not enough memory to allocate for send buffer on process " << myRank << "\n";
exit(1);
}
MPI_Buffer_attach(Buffer, *BufferSize); // Attach the Buffer
}
}
}
}
}


@ -0,0 +1,25 @@
cd amgprec/impl/aggregator/
rm MatchBoxPC.o
rm sendBundledMessages.o
rm initialize.o
rm extractUChunk.o
rm isAlreadyMatched.o
rm findOwnerOfGhost.o
rm computeCandidateMate.o
rm parallelComputeCandidateMateB.o
rm processMatchedVertices.o
rm processMatchedVerticesAndSendMessages.o
rm processCrossEdge.o
rm queueTransfer.o
rm processMessages.o
rm processExposedVertex.o
rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateC.o
rm algoDistEdgeApproxDomEdgesLinearSearchMesgBndlSmallMateCMP.o
cd ../../../
make all
cd samples/advanced/pdegen
make amg_d_pde3d
cd runs
mpirun -np 4 amg_d_pde3d amg_pde3d.inp

@ -3,7 +3,7 @@ AMGINCDIR=$(AMGDIR)/include
include $(AMGINCDIR)/Make.inc.amg4psblas
AMGMODDIR=$(AMGDIR)/modules
AMGLIBDIR=$(AMGDIR)/lib
AMG_LIBS=-L$(AMGLIBDIR) -lpsb_krylov -lamg_prec -lpsb_prec
AMG_LIBS=-L$(AMGLIBDIR) -lpsb_krylov -lamg_prec -lpsb_prec -llapack -lblas
FINCLUDES=$(FMFLAG). $(FMFLAG)$(AMGMODDIR) $(FMFLAG)$(AMGINCDIR) $(PSBLAS_INCLUDES) $(FIFLAG).
LINKOPT=

@ -1,6 +1,6 @@
%%%%%%%%%%% General arguments % Lines starting with % are ignored.
CSR ! Storage format CSR COO JAD
0080 ! IDIM; domain size. Linear system size is IDIM**3
0020 ! IDIM; domain size. Linear system size is IDIM**3
CONST ! PDECOEFF: CONST, EXP, GAUSS Coefficients of the PDE
BICGSTAB ! Iterative method: BiCGSTAB BiCGSTABL BiCG CG CGS FCG GCR RGMRES
2 ! ISTOPC
